From cd4991b10435545193e3769786111638d671506e Mon Sep 17 00:00:00 2001 From: Sanal Date: Wed, 25 Sep 2024 15:17:34 -0700 Subject: [PATCH 001/130] Add support for raft repl dev replace member. (#546) When replacing a member, add the new member, sync raft log for replace and finally remove the old member. Once we add new member, baseline or incremental resync will start. Remove the old member will cause nuraft mesg to exit the group and we periodically gc the destroyed group. Made the repl dev base test common so that both tests files can use. Tests by default create repl group with num_replica's. Dynamic tests create additional spare replica's which can be added to the test dynamically by calling replace member. --- .../homestore/replication/repl_decls.h | 19 +- src/include/homestore/replication/repl_dev.h | 6 +- src/lib/replication/repl_dev/common.cpp | 3 +- .../replication/repl_dev/raft_repl_dev.cpp | 109 ++- src/lib/replication/repl_dev/raft_repl_dev.h | 7 + .../replication/service/raft_repl_service.cpp | 16 +- .../replication/service/raft_repl_service.h | 1 - src/tests/CMakeLists.txt | 6 + src/tests/test_common/hs_repl_test_common.hpp | 19 +- src/tests/test_common/raft_repl_test_base.hpp | 629 ++++++++++++++++++ src/tests/test_raft_repl_dev.cpp | 605 +---------------- src/tests/test_raft_repl_dev_dynamic.cpp | 133 ++++ src/tests/test_solo_repl_dev.cpp | 1 + 13 files changed, 929 insertions(+), 625 deletions(-) create mode 100644 src/tests/test_common/raft_repl_test_base.hpp create mode 100644 src/tests/test_raft_repl_dev_dynamic.cpp diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 994da7d97..558c19517 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -15,17 +15,18 @@ namespace homestore { VENUM(ReplServiceError, int32_t, OK = 0, // Everything OK CANCELLED = -1, // Request was cancelled - TIMEOUT = -2, - NOT_LEADER = -3, - 
BAD_REQUEST = -4, - SERVER_ALREADY_EXISTS = -5, + TIMEOUT = -2, + NOT_LEADER = -3, + BAD_REQUEST = -4, + SERVER_ALREADY_EXISTS = -5, CONFIG_CHANGING = -6, - SERVER_IS_JOINING = -7, - SERVER_NOT_FOUND = -8, - CANNOT_REMOVE_LEADER = -9, + SERVER_IS_JOINING = -7, + SERVER_NOT_FOUND = -8, + CANNOT_REMOVE_LEADER = -9, SERVER_IS_LEAVING = -10, - TERM_MISMATCH = -11, - RESULT_NOT_EXIST_YET = -10000, + TERM_MISMATCH = -11, + RETRY_REQUEST = -12, + RESULT_NOT_EXIST_YET = -10000, NOT_IMPLEMENTED = -10001, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 9965ada5d..15dc4872a 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -42,7 +42,8 @@ VENUM(repl_req_state_t, uint32_t, VENUM(journal_type_t, uint16_t, HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry - HS_CTRL_DESTROY = 2 // Control message to destroy the repl_dev + HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev + HS_CTRL_REPLACE = 3, // Control message to replace a member ) struct repl_key { @@ -346,6 +347,9 @@ class ReplDevListener { /// after restart in case crash happened during the destroy. virtual void on_destroy() = 0; + /// @brief Called when replace member is performed. 
+ virtual void replace_member(replica_id_t member_out, replica_id_t member_in) = 0; + /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 71927a3ad..b8800afea 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -192,9 +192,10 @@ std::string repl_req_ctx::to_string() const { } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } + return fmt::format("dsn={} term={} lsn={} op={} local_blkid={} state=[{}]", m_rkey.dsn, m_rkey.term, m_lsn, enum_name(m_op_code), m_local_blkid.to_string(), req_state_name(uint32_cast(state()))); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 45a018d92..e928f8996 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -107,13 +107,94 @@ bool RaftReplDev::join_group() { m_msg_mgr.join_group(m_group_id, "homestore_replication", std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(shared_from_this())); if (!raft_result) { - HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", boost::uuids::to_string(m_group_id), - raft_result.error()); + HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", group_id_str(), raft_result.error()); return false; } return true; } +AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid) { + LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(), + boost::uuids::to_string(member_out_uuid), 
boost::uuids::to_string(member_in_uuid)); + + // Step 1: Check if leader itself is requested to move out. + if (m_my_repl_id == member_out_uuid && m_my_repl_id == get_leader_id()) { + // If leader is the member requested to move out, then give up leadership and return error. + // Client will retry replace_member request to the new leader. + raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); + RD_LOGI("Replace member leader is the member_out so yield leadership"); + return make_async_error<>(ReplServiceError::NOT_LEADER); + } + + // Step 2. Add the new member. + return m_msg_mgr.add_member(m_group_id, member_in_uuid) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member_in_uuid, member_out_uuid](auto&& e) -> AsyncReplResult<> { + // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout + // when adding member. Member is added to cluster config until member syncs fully + // with at least stop gap. This will take a lot of time for block or + // object storage. + if (e.hasError()) { + // Ignore the server already exists as server already added to the cluster. + // The pg member change requests from control path are idempotent and requests + // can be resent; either the add or the remove can fail and has to be retried. + if (e.error() == nuraft::cmd_result_code::CANCELLED || + e.error() == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { + RD_LOGW("Ignoring error returned from nuraft add_member {}", e.error()); + } else { + RD_LOGE("Replace member error in add member : {}", e.error()); + return make_async_error<>(RaftReplService::to_repl_error(e.error())); + } + } + auto member_out = boost::uuids::to_string(member_out_uuid); + auto member_in = boost::uuids::to_string(member_in_uuid); + + RD_LOGI("Replace member added member={} to group_id={}", member_in, group_id_str()); + + // Step 3. Append log entry to mark the old member is out and new member is added.
+ auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + replace_members_ctx members; + std::copy(member_in_uuid.begin(), member_in_uuid.end(), members.in_replica_id.begin()); + std::copy(member_out_uuid.begin(), member_out_uuid.end(), members.out_replica_id.begin()); + sisl::blob header(r_cast< uint8_t* >(&members), + members.in_replica_id.size() + members.out_replica_id.size()); + rreq->init( + repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + LOGERROR("Replace member propose to raft failed {}", err); + return make_async_error<>(std::move(err)); + } + + RD_LOGI("Replace member proposed to raft group_id={}", group_id_str()); + + // Step 4. Remove the old member. Even if the old member is temporarily + // down and recovers, nuraft mesg will see the member removal in the cluster log + // entry and call exit_group() and leave(). + return m_msg_mgr.rem_member(m_group_id, member_out_uuid) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member_out](auto&& e) -> AsyncReplResult<> { + if (e.hasError()) { + // Ignore the server not found as server removed from the cluster + // as requests are idempotent and can be resent. + if (e.error() == nuraft::cmd_result_code::SERVER_NOT_FOUND) { + RD_LOGW("Remove member not found in group error, ignoring"); + } else { + // It's ok to retry this request as the request + // of replace member is idempotent.
+ RD_LOGE("Replace member failed to remove member : {}", e.error()); + return make_async_error<>(ReplServiceError::RETRY_REQUEST); + } + } else { + RD_LOGI("Replace member removed member={} from group_id={}", member_out, group_id_str()); + } + return make_async_success<>(); + }); + }); +} + folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // Set the intent to destroy the group m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYING; }); @@ -141,7 +222,7 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { LOGERROR("RaftReplDev::destroy_group failed {}", err); } - LOGINFO("Raft repl dev destroy_group={}", boost::uuids::to_string(m_group_id)); + LOGINFO("Raft repl dev destroy_group={}", group_id_str()); return m_destroy_promise.getSemiFuture(); } @@ -786,6 +867,8 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { RD_LOGD("Raft channel: Commit rreq=[{}]", rreq->to_string()); if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { leave(); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { + replace_member(rreq); } else { m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); } @@ -820,7 +903,8 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) blkid.to_string()); }); } - } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { + } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY || + rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { if (rreq->is_proposer()) { m_destroy_promise.setValue(err); } } @@ -836,6 +920,17 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) rreq->clear(); } +void RaftReplDev::replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes()); + replica_id_t member_in, member_out; + std::copy(members->out_replica_id.begin(), members->out_replica_id.end(), member_out.begin()); + 
std::copy(members->in_replica_id.begin(), members->in_replica_id.end(), member_in.begin()); + RD_LOGI("Raft repl replace_member member_out={} member_in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + + m_listener->replace_member(member_out, member_in); +} + static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { if (a.size() != b.size()) { return false; } return (std::memcmp(a.cbytes(), b.cbytes(), a.size()) == 0); @@ -971,12 +1066,14 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["config"] = serialize_cluster_config(config); m_raft_config_sb.write(); + RD_LOGI("Saved config {}", (*m_raft_config_sb)["config"].dump()); } void RaftReplDev::save_state(const nuraft::srv_state& state) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}}; m_raft_config_sb.write(); + RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump()); } nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { @@ -1013,7 +1110,7 @@ uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; } void RaftReplDev::permanent_destroy() { - RD_LOGI("Permanent destroy for raft repl dev"); + RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str()); m_rd_sb.destroy(); m_raft_config_sb.destroy(); m_data_journal->remove_store(); @@ -1035,7 +1132,7 @@ void RaftReplDev::leave() { m_rd_sb->destroy_pending = 0x1; m_rd_sb.write(); - RD_LOGI("RaftReplDev leave group"); + RD_LOGI("RaftReplDev leave group_id={}", group_id_str()); m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 
41594b528..82fdcaa23 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -35,6 +35,11 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); +struct replace_members_ctx { + std::array< uint8_t, 16 > out_replica_id; + std::array< uint8_t, 16 > in_replica_id; +}; + class RaftReplDevMetrics : public sisl::MetricsGroup { public: explicit RaftReplDevMetrics(const char* inst_name) : sisl::MetricsGroup("RaftReplDev", inst_name) { @@ -150,6 +155,7 @@ class RaftReplDev : public ReplDev, virtual ~RaftReplDev() = default; bool join_group(); + AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in); folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// @@ -268,6 +274,7 @@ class RaftReplDev : public ReplDev, bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); void commit_blk(repl_req_ptr_t rreq); + void replace_member(repl_req_ptr_t rreq); }; } // namespace homestore diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 65d928390..bbf921685 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -93,7 +93,12 @@ void RaftReplService::start() { .with_hb_interval(HS_DYNAMIC_CONFIG(consensus.heartbeat_period_ms)) .with_max_append_size(HS_DYNAMIC_CONFIG(consensus.max_append_batch_size)) .with_log_sync_batch_size(HS_DYNAMIC_CONFIG(consensus.log_sync_batch_size)) + // TODO to fix the log_gap thresholds when adding new member. 
+ // When the option is enabled, a new member doing log sync gets stuck after the first batch, + // whereas if the option is disabled, the new member goes through append entries and it works. +#if 0 .with_log_sync_stopping_gap(HS_DYNAMIC_CONFIG(consensus.min_log_gap_to_join)) +#endif .with_stale_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_hi_threshold)) .with_fresh_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_lo_threshold)) .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance)) @@ -327,7 +332,16 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in) const { - return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } + + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->replace_member(member_out, member_in) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { return make_async_error<>(e.error()); } + return make_async_success<>(); + }); } ////////////////////// Reaper Thread related ////////////////////////////////// diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index a38cbbccb..cba90e2e0 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -80,7 +80,6 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); - }; class RaftReplServiceCPHandler : public CPCallbacks { diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index d922f71cb..ff6e9296a 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -114,6 +114,10 @@ if (${io_tests})
target_sources(test_raft_repl_dev PRIVATE test_raft_repl_dev.cpp) target_link_libraries(test_raft_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_raft_repl_dev_dynamic) + target_sources(test_raft_repl_dev_dynamic PRIVATE test_raft_repl_dev_dynamic.cpp) + target_link_libraries(test_raft_repl_dev_dynamic homestore ${COMMON_TEST_DEPS} GTest::gmock) + can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) add_test(NAME LogDev-Epoll COMMAND test_log_dev) @@ -122,6 +126,7 @@ if (${io_tests}) add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) + add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() @@ -134,6 +139,7 @@ if (${io_tests}) add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") + add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") if(${epoll_tests}) SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 67abe2f8e..672acffcb 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -38,6 +38,8 @@ SISL_OPTION_GROUP(test_repl_common_setup, (replicas, "", "replicas", "Total number of replicas", ::cxxopts::value< uint32_t >()->default_value("3"), "number"), + (spare_replicas, "", "spare_replicas", "Additional number of spare replicas not part of repldev", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), (base_port, "", "base_port", "Port 
number of first replica", ::cxxopts::value< uint16_t >()->default_value("4000"), "number"), (replica_num, "", "replica_num", @@ -134,11 +136,12 @@ class HSReplTestHelper : public HSTestHelper { HSReplTestHelper(std::string const& name, std::vector< std::string > const& args, char** argv) : name_{name}, args_{args}, argv_{argv} {} - void setup() { + void setup(uint32_t num_replicas) { + num_replicas_ = num_replicas; replica_num_ = SISL_OPTIONS["replica_num"].as< uint16_t >(); + sisl::logging::SetLogger(name_ + std::string("_replica_") + std::to_string(replica_num_)); sisl::logging::SetLogPattern("[%D %T%z] [%^%L%$] [%n] [%t] %v"); - auto const num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); boost::uuids::string_generator gen; for (uint32_t i{0}; i < num_replicas; ++i) { @@ -226,7 +229,7 @@ class HSReplTestHelper : public HSTestHelper { void reset_setup() { teardown(); - setup(); + setup(num_replicas_); } void restart(uint32_t shutdown_delay_secs = 5u) { @@ -273,8 +276,12 @@ class HSReplTestHelper : public HSTestHelper { if (replica_num_ == 0) { std::set< homestore::replica_id_t > members; - std::transform(members_.begin(), members_.end(), std::inserter(members, members.end()), - [](auto const& p) { return p.first; }); + // By default we create the repl dev with the number of members equal to the replicas argument. + // We don't add spare replicas to the group by default.
+ for (auto& m : members_) { + if (m.second < SISL_OPTIONS["replicas"].as< uint32_t >()) { members.insert(m.first); } + } + group_id_t repl_group_id = hs_utils::gen_random_uuid(); { std::unique_lock lg(groups_mtx_); @@ -299,6 +306,7 @@ class HSReplTestHelper : public HSTestHelper { auto listener = std::move(pending_listeners_[0]); repl_groups_.insert(std::pair(group_id, listener)); pending_listeners_.erase(pending_listeners_.begin()); + LOGINFO("Got listener for group_id={} replica={}", boost::uuids::to_string(group_id), replica_num_); return listener; } @@ -346,6 +354,7 @@ class HSReplTestHelper : public HSTestHelper { std::string name_; std::vector< std::string > args_; char** argv_; + uint32_t num_replicas_; std::vector< homestore::dev_info > dev_list_; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp new file mode 100644 index 000000000..7b96afa4c --- /dev/null +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -0,0 +1,629 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/homestore_config.hpp" +#include "common/homestore_assert.hpp" +#include "common/homestore_utils.hpp" + +#define private public +#include "test_common/hs_repl_test_common.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" + +using namespace homestore; + +SISL_LOGGING_DEF(test_raft_repl_dev) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg) + +SISL_OPTION_GROUP(test_raft_repl_dev, + (block_size, "", "block_size", "block size to io", + ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), + (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), + // for below replication parameter, their default value always get from dynamic config, only used + // when specified by user + (snapshot_distance, "", "snapshot_distance", "distance between snapshots", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", + ::cxxopts::value< uint32_t >()->default_value("0"), "number")); + +SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) + +static std::unique_ptr< test_common::HSReplTestHelper > g_helper; +static std::random_device g_rd{}; +static std::default_random_engine g_re{g_rd()}; + +class TestReplicatedDB : public homestore::ReplDevListener { +public: + struct Key { + uint64_t id_; + bool operator<(Key const& 
other) const { return id_ < other.id_; } + }; + + struct Value { + int64_t lsn_; + uint64_t data_size_; + uint64_t data_pattern_; + MultiBlkId blkid_; + uint64_t id_; + }; + + struct KeyValuePair { + Key key; + Value value; + }; + + struct test_req : public repl_req_ctx { + struct journal_header { + uint64_t data_size; + uint64_t data_pattern; + }; + + journal_header jheader; + uint64_t key_id; + sisl::sg_list write_sgs; + sisl::sg_list read_sgs; + + sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } + sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } + + test_req() { + write_sgs.size = 0; + read_sgs.size = 0; + key_id = (uint64_t)rand() << 32 | rand(); + } + + ~test_req() { + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + + for (auto const& iov : read_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + }; + + TestReplicatedDB() = default; + virtual ~TestReplicatedDB() = default; + + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, + cintrusive< repl_req_ctx >& ctx) override { + ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; + Value v{.lsn_ = lsn, + .data_size_ = jheader->data_size, + .data_pattern_ = jheader->data_pattern, + .blkid_ = blkids, + .id_ = k.id_}; + + LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", + g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_); + + { + std::unique_lock lk(db_mtx_); + inmem_db_.insert_or_assign(k, v); + lsn_index_.emplace(lsn, v); + last_committed_lsn = lsn; + ++commit_count_; + } + + if (ctx->is_proposer()) { g_helper->runner().next_task(); } + } + + bool on_pre_commit(int64_t 
lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, + ctx->dsn()); + return true; + } + + void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); + } + + void on_restart() { + LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(), + boost::uuids::to_string(repl_dev()->group_id())); + } + + void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), + *(r_cast< uint64_t const* >(key.cbytes()))); + } + + AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + m_last_snapshot = context; + return make_async_success<>(); + } + + int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + + if (snp_data->offset == 0) { + snp_data->is_last_obj = false; + snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); + LOGINFOMOD(replication, + "[Replica={}] Read logical snapshot callback first message obj_id={} term={} idx={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx()); + return 0; + } + + int64_t next_lsn = snp_data->offset; + 
std::vector< KeyValuePair > kv_snapshot_data; + // We cannot use find to get the next element: if the next lsn is a config lsn, it will not be put into + // lsn_index_ and, as a result, find will return the end of the map. So here we use lower_bound to get the + // first element to be read and transferred. + for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { + auto& v = iter->second; + kv_snapshot_data.emplace_back(Key{v.id_}, v); + LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", + g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); + if (kv_snapshot_data.size() >= 1000) { break; } + } + + if (kv_snapshot_data.size() == 0) { + snp_data->is_last_obj = true; + LOGINFOMOD(replication, "Snapshot is_last_obj is true"); + return 0; + } + + int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); + sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; + std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); + snp_data->blob = std::move(blob); + snp_data->is_last_obj = false; + LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + kv_snapshot_data.size()); + + return 0; + } + + void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); + auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); + std::move(fut).get(); + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + + void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data)
override { + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + auto last_committed_idx = + std::dynamic_pointer_cast< RaftReplDev >(repl_dev())->raft_server()->get_committed_log_idx(); + if (snp_data->offset == 0) { + snp_data->offset = last_committed_lsn + 1; + LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", + g_helper->replica_num(), snp_data->offset); + return; + } + + size_t kv_snapshot_data_size = snp_data->blob.size(); + if (kv_snapshot_data_size == 0) return; + + size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); + std::unique_lock lk(db_mtx_); + auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); + for (size_t i = 0; i < num_items; i++) { + auto key = ptr->key; + auto value = ptr->value; + LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", + g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); + + // Write to data service and inmem map. 
+ MultiBlkId out_blkids; + if (value.data_size_ != 0) { + snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); + value.blkid_ = out_blkids; + } + inmem_db_.insert_or_assign(key, value); + last_committed_lsn = value.lsn_; + ++commit_count_; + ptr++; + } + + snp_data->offset = last_committed_lsn + 1; + LOGINFOMOD(replication, + "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + snp_data->is_last_obj, num_items); + } + + bool apply_snapshot(shared< snapshot_context > context) override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + m_last_snapshot = context; + return true; + } + + shared< snapshot_context > last_snapshot() override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + if (!m_last_snapshot) return nullptr; + + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + return m_last_snapshot; + } + + void free_user_snp_ctx(void*& user_snp_ctx) override {} + + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + return blk_alloc_hints{}; + } + void replace_member(replica_id_t member_out, replica_id_t member_in) override {} + + void on_destroy() override { + LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), + boost::uuids::to_string(repl_dev()->group_id())); + g_helper->unregister_listener(repl_dev()->group_id()); + } + + void db_write(uint64_t data_size, uint32_t 
max_size_per_iov) { + static std::atomic< uint32_t > s_uniq_num{0}; + auto req = intrusive< test_req >(new test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); + } + + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + } + + void validate_db_data() { + g_helper->runner().set_num_tasks(inmem_db_.size()); + + LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", + boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); + auto it = inmem_db_.begin(); + g_helper->runner().set_task([this, &it]() { + Key k; + Value v; + { + std::unique_lock lk(db_mtx_); + std::tie(k, v) = *it; + ++it; + } + + if (v.data_size_ != 0) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); + + repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { + LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), + v.data_pattern_); + RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, + ec.message()); + for (auto const& iov : read_sgs.iovs) { + test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, + v.data_pattern_); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + g_helper->runner().next_task(); + }); + } else { + g_helper->runner().next_task(); + } + }); + g_helper->runner().execute().get(); + } + + uint64_t 
db_commit_count() const { + std::shared_lock lk(db_mtx_); + return commit_count_; + } + + uint64_t db_size() const { + std::shared_lock lk(db_mtx_); + return inmem_db_.size(); + } + + void create_snapshot() { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); + LOGINFO("Manually create snapshot got index {}", snapshot_idx); + } + + void truncate(int num_reserved_entries) { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + raft_repl_dev->truncate(num_reserved_entries); + LOGINFO("Manually truncated"); + } + + void set_zombie() { zombie_ = true; } + bool is_zombie() { + // Wether a group is zombie(non recoverable) + return zombie_; + } + +private: + std::map< Key, Value > inmem_db_; + std::map< int64_t, Value > lsn_index_; + uint64_t commit_count_{0}; + std::shared_mutex db_mtx_; + uint64_t last_committed_lsn{0}; + std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; + std::mutex m_snapshot_lock; + bool zombie_{false}; +}; + +class RaftReplDevTestBase : public testing::Test { +public: + void SetUp() override { + // By default it will create one db + for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { + auto db = std::make_shared< TestReplicatedDB >(); + g_helper->register_listener(db); + dbs_.emplace_back(std::move(db)); + } + } + + void TearDown() override { + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + } + + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + int i = 0; + bool force_leave = false; + do { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = 
dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed"); + + // TODO: if leader is destroyed, but the follower does not receive the notification, it will not be + // destroyed for ever. we need handle this in raft_repl_dev. revisit here after making changes at + // raft_repl_dev side to hanle this case. this is a workaround to avoid the infinite loop for now. + if (i++ > 10 && !force_leave) { + LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); + repl_dev->force_leave(); + force_leave = true; + } + + } while (!repl_dev->is_destroyed()); + } + } + + void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { + if (db == nullptr) { db = pick_one_db(); } + // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); + db->db_write(data_size, max_size_per_iov); + } + + void wait_for_all_commits() { wait_for_commits(written_entries_); } + + void wait_for_commits(uint64_t exp_writes) { + uint64_t total_writes{0}; + while (true) { + total_writes = 0; + for (auto const& db : dbs_) { + total_writes += db->db_commit_count(); + } + + if (total_writes >= exp_writes) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + LOGINFO("Replica={} received {} commits but expected {}", g_helper->replica_num(), total_writes, + exp_writes); + } + LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); + } + + void validate_data() { + for (auto const& db : dbs_) { + db->validate_db_data(); + } + } + + shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } + + void assign_leader(uint16_t replica) { + LOGINFO("Switch the leader to replica_num = {}", replica); + if (g_helper->replica_num() == replica) { + for (auto const& db : dbs_) { + do { + auto result = db->repl_dev()->become_leader().get(); + if (result.hasError()) { + 
std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } else { + break; + } + } while (true); + } + } else { + for (auto const& db : dbs_) { + homestore::replica_id_t leader_uuid; + while (true) { + leader_uuid = db->repl_dev()->get_leader_id(); + if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } + + LOGINFO("Waiting for replica={} to become leader", replica); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + } + } + } + + void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { + do { + auto leader_uuid = db->repl_dev()->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + lambda(); + break; + } else { + break; + } + } while (true); + } + + void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + do { + auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, + boost::uuids::to_string(g_helper->my_replica_id())); + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + g_helper->runner().set_num_tasks(num_entries); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_task([this, block_size, db]() { + static std::normal_distribution<> num_blks_gen{3.0, 2.0}; + this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); + }); + if (wait_for_commit) { g_helper->runner().execute().get(); } + break; + } else { + LOGINFO("{} entries were 
written on the leader_uuid={} my_uuid={}", num_entries, + boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); + break; + } + } while (true); + + written_entries_ += num_entries; + if (wait_for_commit) { this->wait_for_all_commits(); } + } + + void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { + this->run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + + // Remove the db from the dbs_ list and check if count matches with repl_device + for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { + if (*it == db) { + dbs_.erase(it); + break; + } + } + + if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } + } + + void wait_for_listener_destroy(uint64_t exp_listeners) { + while (true) { + auto total_listeners = g_helper->num_listeners(); + if (total_listeners == exp_listeners) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { + if (g_helper->replica_num() == replica) { + LOGINFO("Restart homestore: replica_num = {}", replica); + g_helper->restart(shutdown_delay_sec); + // g_helper->sync_for_test_start(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void shutdown_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Shutdown homestore: replica_num = {}", replica); + g_helper->shutdown(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void start_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Start homestore: 
replica_num = {}", replica); + g_helper->start(); + } + } + + void create_snapshot() { dbs_[0]->create_snapshot(); } + void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } + + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in) { + this->run_on_leader(db, [this, db, member_out, member_in]() { + LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), member_out, member_in).get(); + ASSERT_EQ(v.hasError(), false) << "Error in replacing member"; + }); + } + +protected: + std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; + uint32_t written_entries_{0}; + +#ifdef _PRERELEASE + flip::FlipClient m_fc{iomgr_flip::instance()}; +#endif +}; diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index f8aa06c5c..9ccc40dfc 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -12,606 +12,9 @@ * specific language governing permissions and limitations under the License. 
* *********************************************************************************/ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "common/homestore_config.hpp" -#include "common/homestore_assert.hpp" -#include "common/homestore_utils.hpp" - -#define private public -#include "test_common/hs_repl_test_common.hpp" -#include "replication/service/raft_repl_service.h" -#include "replication/repl_dev/raft_repl_dev.h" - -using namespace homestore; - -SISL_OPTION_GROUP(test_raft_repl_dev, - (block_size, "", "block_size", "block size to io", - ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", - ::cxxopts::value< uint32_t >()->default_value("1"), "number"), - // for below replication parameter, their default value always get from dynamic config, only used - // when specified by user - (snapshot_distance, "", "snapshot_distance", "distance between snapshots", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", - ::cxxopts::value< uint32_t >()->default_value("0"), "number")); - -SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) - -static std::unique_ptr< test_common::HSReplTestHelper > g_helper; -static std::random_device g_rd{}; -static std::default_random_engine g_re{g_rd()}; - -class TestReplicatedDB : public homestore::ReplDevListener { -public: - struct Key { - uint64_t id_; - bool operator<(Key const& other) const { return id_ < other.id_; } - }; - - struct Value { - int64_t lsn_; - uint64_t data_size_; - uint64_t 
data_pattern_; - MultiBlkId blkid_; - uint64_t id_; - }; - - struct KeyValuePair { - Key key; - Value value; - }; - - struct test_req : public repl_req_ctx { - struct journal_header { - uint64_t data_size; - uint64_t data_pattern; - }; - - journal_header jheader; - uint64_t key_id; - sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - - sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } - sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } - - test_req() { - write_sgs.size = 0; - read_sgs.size = 0; - key_id = (uint64_t)rand() << 32 | rand(); - } - - ~test_req() { - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - }; - - TestReplicatedDB() = default; - virtual ~TestReplicatedDB() = default; - - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { - ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); - - auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); - Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; - Value v{.lsn_ = lsn, - .data_size_ = jheader->data_size, - .data_pattern_ = jheader->data_pattern, - .blkid_ = blkids, - .id_ = k.id_}; - - LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", - g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_); - - { - std::unique_lock lk(db_mtx_); - inmem_db_.insert_or_assign(k, v); - lsn_index_.emplace(lsn, v); - last_data_committed_lsn = lsn; - ++commit_count_; - } - - if (ctx->is_proposer()) { g_helper->runner().next_task(); } - } - - bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - 
LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, - ctx->dsn()); - return true; - } - - void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); - } - - void on_restart() { - LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - } - - void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), - *(r_cast< uint64_t const* >(key.cbytes()))); - } - - AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return make_async_success<>(); - } - - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - - if (snp_data->offset == 0) { - snp_data->is_last_obj = false; - snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx()); - return 0; - } - - int64_t next_lsn = snp_data->offset; - std::vector< KeyValuePair > kv_snapshot_data; - // we can not use find to get the next element, since if the next lsn is 
a config lsn , it will not be put into - // lsn_index_ and as a result, the find will return the end of the map. so here we use lower_bound to get the - // first element to be read and transfered. - for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { - auto& v = iter->second; - kv_snapshot_data.emplace_back(Key{v.id_}, v); - LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", - g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 1000) { break; } - } - - if (kv_snapshot_data.size() == 0) { - snp_data->is_last_obj = true; - LOGINFOMOD(replication, "Snapshot is_last_obj is true"); - return 0; - } - - int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); - sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; - std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); - snp_data->blob = std::move(blob); - snp_data->is_last_obj = false; - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - kv_snapshot_data.size()); - - return 0; - } - - void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); - auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); - std::move(fut).get(); - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - if 
(snp_data->offset == 0) { - snp_data->offset = last_data_committed_lsn + 1; - LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", - g_helper->replica_num(), snp_data->offset); - return; - } - - size_t kv_snapshot_data_size = snp_data->blob.size(); - if (kv_snapshot_data_size == 0) return; - - size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); - std::unique_lock lk(db_mtx_); - auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); - for (size_t i = 0; i < num_items; i++) { - auto key = ptr->key; - auto value = ptr->value; - LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", - g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); - - // Write to data service and inmem map. - MultiBlkId out_blkids; - if (value.data_size_ != 0) { - snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); - value.blkid_ = out_blkids; - } - last_data_committed_lsn = value.lsn_; - inmem_db_.insert_or_assign(key, value); - ++commit_count_; - ptr++; - } - - LOGINFOMOD(replication, - "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - snp_data->is_last_obj, num_items); - - // before we finish install snapshot, raft_server()->get_committed_log_idx() will always be the same. so we need - // last_data_committed_lsn to notify leader to transfer new data to follower. 
- snp_data->offset = last_data_committed_lsn + 1; - } - - bool apply_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return true; - } - - shared< snapshot_context > last_snapshot() override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - if (!m_last_snapshot) return nullptr; - - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - return m_last_snapshot; - } - - void free_user_snp_ctx(void*& user_snp_ctx) override {} - - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { - return blk_alloc_hints{}; - } - - void on_destroy() override { - LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - g_helper->unregister_listener(repl_dev()->group_id()); - } - - void db_write(uint64_t data_size, uint32_t max_size_per_iov) { - static std::atomic< uint32_t > s_uniq_num{0}; - auto req = intrusive< test_req >(new test_req()); - req->jheader.data_size = data_size; - req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - - LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", - g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); - - if (data_size != 0) { - req->write_sgs = - test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, 
req->jheader.data_pattern); - } - - repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); - } - - void validate_db_data() { - g_helper->runner().set_num_tasks(inmem_db_.size()); - - LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", - boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); - auto it = inmem_db_.begin(); - g_helper->runner().set_task([this, &it]() { - Key k; - Value v; - { - std::unique_lock lk(db_mtx_); - std::tie(k, v) = *it; - ++it; - } - - if (v.data_size_ != 0) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); - - repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { - LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), - v.data_pattern_); - RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, - ec.message()); - for (auto const& iov : read_sgs.iovs) { - test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, - v.data_pattern_); - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - g_helper->runner().next_task(); - }); - } else { - g_helper->runner().next_task(); - } - }); - g_helper->runner().execute().get(); - } +#include "test_common/raft_repl_test_base.hpp" - uint64_t db_commit_count() const { - std::shared_lock lk(db_mtx_); - return commit_count_; - } - - uint64_t db_size() const { - std::shared_lock lk(db_mtx_); - return inmem_db_.size(); - } - - void create_snapshot() { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); - LOGINFO("Manually create snapshot got index {}", snapshot_idx); - } - - void truncate(int num_reserved_entries) { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - 
raft_repl_dev->truncate(num_reserved_entries); - LOGINFO("Manually truncated"); - } - - void set_zombie() { zombie_ = true; } - bool is_zombie() { - // Wether a group is zombie(non recoverable) - return zombie_; - } - -private: - std::map< Key, Value > inmem_db_; - std::map< int64_t, Value > lsn_index_; - uint64_t commit_count_{0}; - // this is the last lsn for data, might not be the same with the real last committed lsn - // which should be get by raft_server()->get_committed_log_idx() - uint64_t last_data_committed_lsn{0}; - std::shared_mutex db_mtx_; - std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; - std::mutex m_snapshot_lock; - bool zombie_{false}; -}; - -class RaftReplDevTest : public testing::Test { -public: - void SetUp() override { - // By default it will create one db - for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { - auto db = std::make_shared< TestReplicatedDB >(); - g_helper->register_listener(db); - dbs_.emplace_back(std::move(db)); - } - } - - void TearDown() override { - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - } - - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); - int i = 0; - bool force_leave = false; - do { - std::this_thread::sleep_for(std::chrono::seconds(1)); - auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); - raft_repl_svc.gc_repl_devs(); - LOGINFO("Waiting for repl dev to get destroyed"); - - // TODO: if leader is destroyed, but the follower does not receive the notification, it will not be - // destroyed for ever. we need handle this in raft_repl_dev. revisit here after making changes at - // raft_repl_dev side to hanle this case. 
this is a workaround to avoid the infinite loop for now. - if (i++ > 10 && !force_leave) { - LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); - repl_dev->force_leave(); - force_leave = true; - } - - } while (!repl_dev->is_destroyed()); - } - } - - void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { - if (db == nullptr) { db = pick_one_db(); } - // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); - db->db_write(data_size, max_size_per_iov); - } - - void wait_for_all_commits() { wait_for_commits(written_entries_); } - - void wait_for_commits(uint64_t exp_writes) { - uint64_t total_writes{0}; - while (true) { - total_writes = 0; - for (auto const& db : dbs_) { - total_writes += db->db_commit_count(); - } - - if (total_writes >= exp_writes) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); - } - - void validate_data() { - for (auto const& db : dbs_) { - db->validate_db_data(); - } - } - - shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } - - void assign_leader(uint16_t replica) { - LOGINFO("Switch the leader to replica_num = {}", replica); - if (g_helper->replica_num() == replica) { - for (auto const& db : dbs_) { - do { - auto result = db->repl_dev()->become_leader().get(); - if (result.hasError()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } else { - break; - } - } while (true); - } - } else { - for (auto const& db : dbs_) { - homestore::replica_id_t leader_uuid; - while (true) { - leader_uuid = db->repl_dev()->get_leader_id(); - if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } - - LOGINFO("Waiting for replica={} to become leader", replica); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } - } - } - } - - void 
run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { - do { - auto leader_uuid = db->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - lambda(); - break; - } else { - break; - } - } while (true); - } - - void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { - do { - auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected"); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, - boost::uuids::to_string(g_helper->my_replica_id())); - auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - g_helper->runner().set_num_tasks(num_entries); - - LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); - g_helper->runner().set_task([this, block_size, db]() { - static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); - }); - if (wait_for_commit) { g_helper->runner().execute().get(); } - break; - } else { - LOGINFO("{} entries were written on the leader_uuid={} my_uuid={}", num_entries, - boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); - break; - } - } while (true); - - written_entries_ += num_entries; - if (wait_for_commit) { this->wait_for_all_commits(); } - } - - void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { - this->run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - 
ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - - // Remove the db from the dbs_ list and check if count matches with repl_device - for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { - if (*it == db) { - dbs_.erase(it); - break; - } - } - - if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } - } - - void wait_for_listener_destroy(uint64_t exp_listeners) { - while (true) { - auto total_listeners = g_helper->num_listeners(); - if (total_listeners == exp_listeners) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - } - - void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { - if (g_helper->replica_num() == replica) { - LOGINFO("Restart homestore: replica_num = {}", replica); - g_helper->restart(shutdown_delay_sec); - // g_helper->sync_for_test_start(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } - } - - void shutdown_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Shutdown homestore: replica_num = {}", replica); - g_helper->shutdown(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } - } - - void start_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Start homestore: replica_num = {}", replica); - g_helper->start(); - } - } - - void create_snapshot() { dbs_[0]->create_snapshot(); } - void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - -protected: - std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; - uint32_t written_entries_{0}; - -#ifdef _PRERELEASE - flip::FlipClient m_fc{iomgr_flip::instance()}; -#endif -}; +class RaftReplDevTest : public RaftReplDevTestBase {}; TEST_F(RaftReplDevTest, Write_Restart_Write) { 
LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); @@ -1012,7 +415,6 @@ int main(int argc, char* argv[]) { // Snapshot and truncation tests needs num reserved to be 0 and distance 10. s.consensus.num_reserved_log_items = 0; - s.consensus.snapshot_freq_distance = 10; s.resource_limits.resource_audit_timer_ms = 0; // only reset when user specified the value for test; @@ -1030,7 +432,8 @@ int main(int argc, char* argv[]) { FLAGS_folly_global_cpu_executor_threads = 4; g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev", args, orig_argv); - g_helper->setup(); + // No spare replica's are created. Test cases in this file expects fixed number of replica's. + g_helper->setup(SISL_OPTIONS["replicas"].as< uint32_t >()); auto ret = RUN_ALL_TESTS(); g_helper->teardown(); diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp new file mode 100644 index 000000000..7bd69a13c --- /dev/null +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -0,0 +1,133 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include "test_common/raft_repl_test_base.hpp" + +// Dynamic tests spawn spare replica's also which can be used to add and remove from a repl dev. 
+class ReplDevDynamicTest : public RaftReplDevTestBase {}; + +TEST_F(ReplDevDynamicTest, ReplaceMember) { + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() < num_replicas) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() != member_out) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } else { + // The out member will have the repl dev destroyed. 
+ auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + do { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } while (!repl_dev->is_destroyed()); + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("ReplaceMember test done"); +} + +// TODO add more tests with leader and member restart, multiple member replace +// leader replace, commit quorum + +int main(int argc, char* argv[]) { + int parsed_argc = argc; + char** orig_argv = argv; + + // Save the args for replica use + std::vector< std::string > args; + for (int i = 0; i < argc; ++i) { + args.emplace_back(argv[i]); + } + + ::testing::InitGoogleTest(&parsed_argc, argv); + + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, config, test_raft_repl_dev, iomgr, test_common_setup, + test_repl_common_setup); + + // + // Entire test suite assumes that once a replica takes over as leader, it stays until it is explicitly yielded. + // Otherwise it is very hard to control or accurately test behavior. Hence we forcibly override the + // leadership_expiry time. + // + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.consensus.leadership_expiry_ms = -1; // -1 means never expires; + s.generic.repl_dev_cleanup_interval_sec = 1; + + // Disable implicit flush and timer. + s.logstore.flush_threshold_size = 0; + s.logstore.flush_timer_frequency_us = 0; + + // Snapshot and truncation tests needs num reserved to be 0 and distance 10. 
+ s.consensus.num_reserved_log_items = 0; + s.resource_limits.resource_audit_timer_ms = 0; + + // only reset when user specified the value for test; + if (SISL_OPTIONS.count("snapshot_distance")) { + s.consensus.snapshot_freq_distance = SISL_OPTIONS["snapshot_distance"].as< uint32_t >(); + } + if (SISL_OPTIONS.count("num_raft_logs_resv")) { + s.resource_limits.raft_logstore_reserve_threshold = SISL_OPTIONS["num_raft_logs_resv"].as< uint32_t >(); + } + if (SISL_OPTIONS.count("res_mgr_audit_timer_ms")) { + s.resource_limits.resource_audit_timer_ms = SISL_OPTIONS["res_mgr_audit_timer_ms"].as< uint32_t >(); + } + }); + HS_SETTINGS_FACTORY().save(); + + FLAGS_folly_global_cpu_executor_threads = 4; + g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev_dynamic", args, orig_argv); + + // We spawn spare replica's also for dynamic repl dev tests. + auto total_replicas = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + g_helper->setup(total_replicas); + + auto ret = RUN_ALL_TESTS(); + g_helper->teardown(); + + std::string str; + sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) { + fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive); + }); + LOGINFO("Object Life Counter\n:{}", str); + + return ret; +} diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index c2b2460b5..c358f71ce 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -135,6 +135,7 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } + void replace_member(replica_id_t member_out, replica_id_t member_in) override {} void on_destroy() override {} }; From 6338520e9af792a23370dc57d1cfc28b65eb58ac Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Sun, 22 Sep 2024 11:47:29 +0800 Subject: [PATCH 
002/130] Generalize and introduce Sealer into CP. Sealer is a special consumer that provides information regarding where the cp is up to. It is the first consumer called during cp switchover, acting as a conservative marker: everything before or equal to this point should be in the current cp; some consumers may already be above this point, which is fine. The Sealer is also the last one called during cp flush, after all other services have flushed successfully. Signed-off-by: Xiaoxi Chen --- src/lib/checkpoint/cp_mgr.cpp | 6 --- .../replication/repl_dev/raft_repl_dev.cpp | 33 +++++++++++++--- src/lib/replication/repl_dev/raft_repl_dev.h | 9 ++++- .../replication/service/raft_repl_service.cpp | 39 +++++++++++++++++-- .../replication/service/raft_repl_service.h | 17 ++++++++ 5 files changed, 89 insertions(+), 15 deletions(-) diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 33d22090a..62a28596c 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -239,12 +239,6 @@ void CPManager::cp_start_flush(CP* cp) { } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { -#ifdef _PRERELEASE - if (hs()->crash_simulator().is_in_crashing_phase()) { - on_cp_flush_done(cp); - return; - } -#endif // Sync flushing replication svc at last as the cp_lsn updated here // other component should at least flushed to cp_lsn auto& repl_cp = m_cp_cb_table[(size_t)cp_consumer_t::REPLICATION_SVC]; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index e928f8996..088270de0 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1189,9 +1189,10 @@ void RaftReplDev::flush_durable_commit_lsn() { } /////////////////////////////////// Private metohds //////////////////////////////////// -void RaftReplDev::cp_flush(CP* cp) { - auto const lsn = m_commit_upto_lsn.load(); - auto const clsn = m_compact_lsn.load(); +void RaftReplDev::cp_flush(CP* cp, cshared
ctx) { + auto const lsn = ctx->cp_lsn; + auto const clsn = ctx->compacted_to_lsn; + auto const dsn = ctx->last_applied_dsn; if (lsn == m_last_flushed_commit_lsn) { // Not dirtied since last flush ignore @@ -1200,15 +1201,31 @@ void RaftReplDev::cp_flush(CP* cp) { std::unique_lock lg{m_sb_mtx}; m_rd_sb->compact_lsn = clsn; - m_rd_sb->durable_commit_lsn = lsn; + // dc_lsn is also flushed in flush_durable_commit_lsn() + // we need to take a max to avoid rolling back. + m_rd_sb->durable_commit_lsn = std::max(lsn, m_rd_sb->durable_commit_lsn); m_rd_sb->checkpoint_lsn = lsn; - m_rd_sb->last_applied_dsn = m_next_dsn.load(); + m_rd_sb->last_applied_dsn = dsn; m_rd_sb.write(); m_last_flushed_commit_lsn = lsn; RD_LOGD("cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, m_next_dsn.load(), cp->to_string()); } +cshared RaftReplDev::get_cp_ctx(CP* cp) { + auto const cp_lsn = m_commit_upto_lsn.load(); + auto const clsn = m_compact_lsn.load(); + auto const dsn = m_next_dsn.load(); + + RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", + (void *)this, cp_lsn, clsn, dsn, cp->to_string()); + auto dev_ctx = std::make_shared(); + dev_ctx->cp_lsn = cp_lsn; + dev_ctx->compacted_to_lsn = clsn; + dev_ctx->last_applied_dsn = dsn; + return dev_ctx; +} + void RaftReplDev::cp_cleanup(CP*) {} void RaftReplDev::gc_repl_reqs() { @@ -1300,6 +1317,12 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx // keep lentry in scope for the lyfe cycle of the rreq rreq->set_lentry(lentry); rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size); + // we load the log from log device, implies log flushed. We only flush log after data is written to data device. 
+ rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->add_state(repl_req_state_t::DATA_RECEIVED); + rreq->add_state(repl_req_state_t::DATA_WRITTEN); + rreq->add_state(repl_req_state_t::LOG_RECEIVED); + rreq->add_state(repl_req_state_t::LOG_FLUSHED); RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string()); if (repl_lsn > m_rd_sb->durable_commit_lsn) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 82fdcaa23..e2e95550d 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -107,6 +107,12 @@ class RaftReplDevMetrics : public sisl::MetricsGroup { class RaftReplService; class CP; +struct ReplDevCPContext { + repl_lsn_t cp_lsn; + repl_lsn_t compacted_to_lsn; + uint64_t last_applied_dsn; +}; + class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr, public std::enable_shared_from_this< RaftReplDev > { @@ -192,7 +198,8 @@ class RaftReplDev : public ReplDev, sisl::blob const& key, uint32_t data_size, bool is_data_channel); folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs); - void cp_flush(CP* cp); + void cp_flush(CP* cp, cshared ctx); + cshared get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index bbf921685..974984ca3 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -448,11 +448,44 @@ void RaftReplService::flush_durable_commit_lsn() { } ///////////////////// RaftReplService CP Callbacks ///////////////////////////// -std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +int ReplSvcCPContext::add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > 
dev_ctx) { + m_cp_ctx_map.emplace(dev, dev_ctx); + return 0; +} + +cshared< ReplDevCPContext > ReplSvcCPContext::get_repl_dev_ctx(ReplDev* dev) { + if (m_cp_ctx_map.count(dev) == 0) { + // it is possible if a repl dev added during the cp flush + return std::make_shared< ReplDevCPContext >(); + } + return m_cp_ctx_map[dev]; +} + +std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + // checking if cur_cp == nullptr as on_switchover_cp will be called when registering the cp handler + if (cur_cp != nullptr) { + // Add cp info from all devices to current cp. + // We dont need taking cp_guard as cp_mgr already taken it in do_trigger_cp_flush + auto cur_cp_ctx = s_cast< ReplSvcCPContext* >(cur_cp->context(cp_consumer_t::REPLICATION_SVC)); + repl_service().iterate_repl_devs([cur_cp, cur_cp_ctx](cshared< ReplDev >& repl_dev) { + // we need collecting the LSN of each repl dev and put it into current CP. + // There is no dirty buffers accumulated to new_cp yet, as the cp_mgr ensure replication_svc + // is the first one being called during cp switchover. 
+ auto dev_ctx = std::static_pointer_cast< RaftReplDev >(repl_dev)->get_cp_ctx(cur_cp); + cur_cp_ctx->add_repl_dev_ctx(repl_dev.get(), std::move(dev_ctx)); + }); + } + // create new ctx + auto ctx = std::make_unique< ReplSvcCPContext >(new_cp); + return ctx; +} folly::Future< bool > RaftReplServiceCPHandler::cp_flush(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp); }); + auto cp_ctx = s_cast< ReplSvcCPContext* >(cp->context(cp_consumer_t::REPLICATION_SVC)); + repl_service().iterate_repl_devs([cp, cp_ctx](cshared< ReplDev >& repl_dev) { + auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get()); + std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp, dev_ctx); + }); return folly::makeFuture< bool >(true); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index cba90e2e0..4985d4eea 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -82,6 +82,23 @@ class RaftReplService : public GenericReplService, void flush_durable_commit_lsn(); }; +// CP context for repl_dev. The repl_dev cp_lsn is a critical cursor in the system: +// anything below the cp_lsn is believed to be persisted through cp and will not +// go through replay. The cp_lsn needs to be kept in the ctx at switchover_cp, +// and the repl_dev cp must be persisted only after all other consumers succeed.
+ +struct ReplDevCPContext; + +class ReplSvcCPContext : public CPContext { + std::shared_mutex m_cp_map_mtx; + std::map< ReplDev*, cshared > m_cp_ctx_map; +public: + ReplSvcCPContext(CP* cp) : CPContext(cp){}; + virtual ~ReplSvcCPContext() = default; + int add_repl_dev_ctx(ReplDev* dev, cshared dev_ctx); + cshared get_repl_dev_ctx(ReplDev* dev); +}; + class RaftReplServiceCPHandler : public CPCallbacks { public: RaftReplServiceCPHandler() = default; From c4fcf700d55574ad464f36cdda7707c54f8d1099 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Sat, 28 Sep 2024 01:13:35 +0800 Subject: [PATCH 003/130] Start data service after log replay done. Signed-off-by: Xiaoxi Chen --- .../replication/repl_dev/raft_repl_dev.cpp | 22 +++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 1 + .../replication/service/raft_repl_service.cpp | 20 +++++++++++++++-- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 088270de0..4db39382b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -74,6 +74,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); } m_rd_sb.write(); + bind_data_service(); } RD_LOG(INFO, @@ -83,9 +84,13 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk (load_existing ? 
"Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), m_rd_sb->logdev_id, m_rd_sb->logstore_id); +} +bool RaftReplDev::bind_data_service() { + RD_LOG(INFO, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); + bool success = false; #ifdef _PRERELEASE - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { + success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { RD_LOGI("Resuming after slow down data channel flip"); on_push_data_received(rpc_data); @@ -96,13 +101,22 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk } }); #else - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); #endif - - m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE("Failed to bind data service request for PUSH_DATA"); + return false; + } + success = m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE("Failed to bind data service request for FETCH_DATA"); + return false; + } + return true; } bool RaftReplDev::join_group() { + bind_data_service(); auto raft_result = m_msg_mgr.join_group(m_group_id, "homestore_replication", std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(shared_from_this())); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index e2e95550d..f78308aba 100644 --- 
a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -160,6 +160,7 @@ class RaftReplDev : public ReplDev, RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~RaftReplDev() = default; + bool bind_data_service(); bool join_group(); AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in); folly::SemiFuture< ReplServiceError > destroy_group(); diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 974984ca3..bd7cd1945 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -128,14 +128,30 @@ void RaftReplService::start() { m_config_sb_bufs.clear(); // Step 5: Start the data and logstore service now. This step is essential before we can ask Raft to join groups etc - hs()->data_service().start(); + + // It is crucial to start the logstore before enabling the data channel. This is because during log replay, + // the commit_blks() function is called, which interacts with the allocator. + // Starting the data channel before the log replay is complete can lead to a race condition between + // PUSHDATA operations and log replay. + // For example, suppose LSN 100 in the log store is associated with PBA1. After a restart, the allocator + // is only aware of allocations up to the last checkpoint and may consider PBA1 as available. + // If a PUSHDATA request is received during this time, PBA1 could be allocated again to a new request, + // leading to data corruption by overwriting the data associated with LSN 100. + // Now the data channel is started in join_group().
+ + LOGINFO("Starting LogStore service, fist_boot = {}", hs()->is_first_time_boot()); hs()->logstore_service().start(hs()->is_first_time_boot()); + LOGINFO("Started LogStore service, log replay should already done till this point"); + // all log stores are replayed, time to start data service. + LOGINFO("Starting DataService"); + hs()->data_service().start(); // Step 6: Iterate all the repl dev and ask each one of the join the raft group. for (auto it = m_rd_map.begin(); it != m_rd_map.end();) { auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); rdev->wait_for_logstore_ready(); if (!rdev->join_group()) { + HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); it = m_rd_map.erase(it); } else { ++it; @@ -358,7 +374,7 @@ void RaftReplService::start_reaper_thread() { m_rdev_gc_timer_hdl = iomanager.schedule_thread_timer( HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec) * 1000 * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { - LOGINFOMOD(replication, "Reaper Thread: Doing GC"); + LOGDEBUGMOD(replication, "Reaper Thread: Doing GC"); gc_repl_reqs(); gc_repl_devs(); }); From e223283daf303f32c69bc6dd25bea99331f01954 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Sun, 29 Sep 2024 18:41:08 +0800 Subject: [PATCH 004/130] Flushing log after data written. Signed-off-by: Xiaoxi Chen --- src/lib/replication/log_store/repl_log_store.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 4271d8b88..36cec9370 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -66,17 +66,18 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { // a fetch and write. Once all requests are completed and written, these requests are poped out of the map and // the future will be ready. 
auto fut = m_rd.notify_after_data_written(reqs); + // Wait for the fetch and write to be completed successfully. + // It is essential to complete the data write before appending to the log. If the logs are flushed + // before the data is written and a restart with subsequent log replay occurs, the in-memory state is lost, + // leaving us uncertain about whether the data was actually written, potentially leading to data inconsistency. + std::move(fut).wait(); - // In the meanwhile, we can flush the journal for this lsn batch. It is ok to flush the entries in log before - // actual data is written, because, even if we have the log, it doesn't mean data is committed, until state - // machine reports that. This way the flush and fetch both can run in parallel. + // Flushing log now. auto cur_time = std::chrono::steady_clock::now(); HomeRaftLogStore::end_of_append_batch(start_lsn, count); HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); cur_time = std::chrono::steady_clock::now(); - // Wait for the fetch and write to be completed successfully. - std::move(fut).wait(); HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); // Mark all the reqs also completely written From 2c853a319ffed7a147700fe7e3c9f95d5237b7e2 Mon Sep 17 00:00:00 2001 From: Sanal Date: Tue, 1 Oct 2024 17:15:05 -0700 Subject: [PATCH 005/130] Add raft commit quorum for replace member if two members down.
(#559) --- src/include/homestore/replication_service.hpp | 4 +- .../replication/repl_dev/raft_repl_dev.cpp | 27 +++++++- src/lib/replication/repl_dev/raft_repl_dev.h | 3 +- .../replication/service/generic_repl_svc.cpp | 4 +- .../replication/service/generic_repl_svc.h | 4 +- .../replication/service/raft_repl_service.cpp | 6 +- .../replication/service/raft_repl_service.h | 4 +- src/tests/test_common/raft_repl_test_base.hpp | 9 ++- src/tests/test_raft_repl_dev_dynamic.cpp | 68 ++++++++++++++++++- 9 files changed, 109 insertions(+), 20 deletions(-) diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 8f535b855..f9b4f2986 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,8 +41,8 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const = 0; + virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 4db39382b..565bc0d67 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -127,23 +127,30 @@ bool RaftReplDev::join_group() { return true; } -AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid) { +AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid, + uint32_t 
commit_quorum) { LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(), boost::uuids::to_string(member_out_uuid), boost::uuids::to_string(member_in_uuid)); + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum); + } + // Step 1: Check if leader itself is requested to move out. if (m_my_repl_id == member_out_uuid && m_my_repl_id == get_leader_id()) { // If leader is the member requested to move out, then give up leadership and return error. // Client will retry replace_member request to the new leader. raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); RD_LOGI("Replace member leader is the member_out so yield leadership"); + reset_quorum_size(0); return make_async_error<>(ReplServiceError::NOT_LEADER); } // Step 2. Add the new member. return m_msg_mgr.add_member(m_group_id, member_in_uuid) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_in_uuid, member_out_uuid](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_in_uuid, member_out_uuid, commit_quorum](auto&& e) -> AsyncReplResult<> { // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout // when adding member. Member is added to cluster config until member syncs fully // with atleast stop gap. 
This will take a lot of time for block or @@ -157,6 +164,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl RD_LOGW("Ignoring error returned from nuraft add_member {}", e.error()); } else { RD_LOGE("Replace member error in add member : {}", e.error()); + reset_quorum_size(0); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } } @@ -179,6 +187,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { LOGERROR("Replace member propose to raft failed {}", err); + reset_quorum_size(0); return make_async_error<>(std::move(err)); } @@ -189,7 +198,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl // entry and call exit_group() and leave(). return m_msg_mgr.rem_member(m_group_id, member_out_uuid) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_out](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> { if (e.hasError()) { // Ignore the server not found as server removed from the cluster // as requests are idempotent and can be resend. @@ -199,16 +208,28 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl // Its ok to retry this request as the request // of replace member is idempotent. RD_LOGE("Replace member failed to remove member : {}", e.error()); + reset_quorum_size(0); return make_async_error<>(ReplServiceError::RETRY_REQUEST); } } else { RD_LOGI("Replace member removed member={} from group_id={}", member_out, group_id_str()); } + + // Revert the quorum size back to 0. 
+ reset_quorum_size(0); return make_async_success<>(); }); }); } +void RaftReplDev::reset_quorum_size(uint32_t commit_quorum) { + RD_LOGI("Reset raft quorum size={}", commit_quorum); + nuraft::raft_params params = raft_server()->get_current_params(); + params.with_custom_commit_quorum_size(commit_quorum); + params.with_custom_election_quorum_size(commit_quorum); + raft_server()->update_params(params); +} + folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // Set the intent to destroy the group m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYING; }); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index f78308aba..3b25cb23b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -162,7 +162,7 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in); + AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum); folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// @@ -283,6 +283,7 @@ class RaftReplDev : public ReplDev, void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); + void reset_quorum_size(uint32_t commit_quorum); }; } // namespace homestore diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 89800df3f..8e5c9a7a1 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -147,8 +147,8 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t 
member_out, - replica_id_t member_in) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index e2d445427..5e0cb84a3 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,8 +73,8 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const override; + AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index bd7cd1945..d862c2098 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -346,13 +346,13 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const { +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum) const { auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } return std::dynamic_pointer_cast< RaftReplDev 
>(rdev_result.value()) - ->replace_member(member_out, member_in) + ->replace_member(member_out, member_in, commit_quorum) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { return make_async_error<>(e.error()); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 4985d4eea..44ed06332 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -69,8 +69,8 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const override; + AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 7b96afa4c..a3160f13a 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -610,11 +610,14 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in) { - this->run_on_leader(db, [this, db, member_out, member_in]() { + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0) { + this->run_on_leader(db, [this, db, member_out, 
member_in, commit_quorum]() { LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); - auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), member_out, member_in).get(); + auto v = hs()->repl_service() + .replace_member(db->repl_dev()->group_id(), member_out, member_in, commit_quorum) + .get(); ASSERT_EQ(v.hasError(), false) << "Error in replacing member"; }); } diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index 7bd69a13c..c29f239e1 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -65,8 +65,73 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { LOGINFO("ReplaceMember test done"); } +TEST_F(ReplDevDynamicTest, TwoMemberDown) { + LOGINFO("TwoMemberDown test started"); + + // Make two members down in a group and leader cant reach a quorum. + // We set the custom quorum size to 1 and call replace member. + // Leader should do some writes to validate it has reach quorum size. + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + // Shutdown replica 1 and replica 2 to simulate two member down. 
+ if (g_helper->replica_num() == 1) { + this->shutdown_replica(1); + LOGINFO("Shutdown replica 1"); + } + + if (g_helper->replica_num() == 2) { + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + } + + if (g_helper->replica_num() == 0) { + // Replace down replica 2 with spare replica 3 with commit quorum 1 + // so that leader can go ahead with replacing member. + LOGINFO("Replace member started"); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + LOGINFO("Leader completed num_io={}", num_io_entries); + } + + if (g_helper->replica_num() == member_in) { + wait_for_commits(num_io_entries); + LOGINFO("Member in got all commits"); + } + + if (g_helper->replica_num() == 0 || g_helper->replica_num() == member_in) { + // Validate data on leader replica 0 and replica 3 + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_cleanup_start(num_members); + + if (g_helper->replica_num() == 1) { + LOGINFO("Start replica 1"); + this->start_replica(1); + } + if (g_helper->replica_num() == 2) { + LOGINFO("Start replica 2"); + this->start_replica(2); + } + + LOGINFO("TwoMemberDown test done"); +} + // TODO add more tests with leader and member restart, multiple member replace -// leader replace, commit quorum +// leader replace int main(int argc, char* argv[]) { int parsed_argc = argc; @@ -89,7 +154,6 @@ int main(int argc, char* argv[]) { // leadership_expiry time. // HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { - s.consensus.leadership_expiry_ms = -1; // -1 means never expires; s.generic.repl_dev_cleanup_interval_sec = 1; // Disable implicit flush and timer. 
From 67fe181172541dcd412ef68b5123f4478aac6649 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 14 Oct 2024 22:08:03 -0700 Subject: [PATCH 006/130] Add cert watcher --- .../replication/service/raft_repl_service.cpp | 47 +++++++++++++++++++ .../replication/service/raft_repl_service.h | 9 +++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index d862c2098..8417b141c 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -85,6 +85,13 @@ void RaftReplService::start() { LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), params.mesg_port_); + //check if ssl cert files are provided, if yes, monitor the changes + if (!params.ssl_key_.empty() && !params.ssl_cert_.empty()) { + ioenvironment.with_file_watcher(); + monitor_cert_changes(); + } + + // Step 2: Register all RAFT parameters. At the end of this step, raft is ready to be created/join group auto r_params = nuraft::raft_params() .with_election_timeout_lower(HS_DYNAMIC_CONFIG(consensus.elect_to_low_ms)) @@ -175,6 +182,46 @@ void RaftReplService::stop() { hs()->logstore_service().stop(); } +void RaftReplService::monitor_cert_changes() { + auto fw = ioenvironment.get_file_watcher(); + auto cert_change_cb = [this](const std::string filepath, const bool deleted) { + LOGINFO("file change event for {}, deleted? 
{}", filepath, deleted) + // do not block file_watcher thread + std::thread restart_svc(&RaftReplService::restart_raft_svc, this, filepath, deleted); + restart_svc.detach(); + }; + + //monitor ssl cert file + if (!fw->register_listener(ioenvironment.get_ssl_cert(), "hs_ssl_cert_watcher", cert_change_cb)) { + LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", + "hs_ssl_cert_watcher", ioenvironment.get_ssl_cert()); + } + //monitor ssl key file + if (!fw->register_listener(ioenvironment.get_ssl_key(), "hs_ssl_key_watcher", cert_change_cb)) { + LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", + "hs_ssl_key_watcher", ioenvironment.get_ssl_key()); + } +} + +void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted){ + if (deleted && !wait_for_cert(filepath)) { + LOGINFO("file {} deleted, ", filepath) + // wait for the deleted file to be added again + throw std::runtime_error(fmt::format("file {} not found! 
Can not start grpc server", filepath)); + } + const std::unique_lock lock(raft_restart_mutex); + m_msg_mgr->restart_server(); + if (deleted) { monitor_cert_changes(); } +} + +bool RaftReplService::wait_for_cert(const std::string& filepath) { + for (auto i = cert_change_timeout; i > 0; --i) { + if (std::filesystem::exists(filepath)) { return true; } + std::this_thread::sleep_for(cert_check_sleep); + } + return false; +} + RaftReplDev* RaftReplService::raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie) { json_superblk group_config; auto& js = group_config.load(buf, meta_cookie); diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 44ed06332..4daaad9b3 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -31,6 +31,9 @@ namespace homestore { +constexpr auto cert_change_timeout = 1200; +constexpr auto cert_check_sleep = std::chrono::seconds(1); + struct repl_dev_superblk; class RaftReplDev; @@ -47,7 +50,8 @@ class RaftReplService : public GenericReplService, iomgr::timer_handle_t m_rdev_gc_timer_hdl; iomgr::timer_handle_t m_flush_durable_commit_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; - + std::mutex raft_restart_mutex; + public: RaftReplService(cshared< ReplApplication >& repl_app); @@ -80,6 +84,9 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); + void monitor_cert_changes(); + void restart_raft_svc(const std::string filepath, const bool deleted); + bool wait_for_cert(const std::string& filepath); }; // cp context for repl_dev, repl_dev cp_lsn is critical cursor in the system, From 5e9fe1dc824bd72d329b02fb65fd11ed1a70ecd0 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Tue, 15 Oct 2024 22:43:59 -0700 Subject: [PATCH 007/130] fix nit --- src/lib/replication/service/raft_repl_service.cpp | 3 ++- src/lib/replication/service/raft_repl_service.h | 2 
+- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 8417b141c..c4aefe1ca 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -215,7 +215,8 @@ void RaftReplService::restart_raft_svc(const std::string filepath, const bool de } bool RaftReplService::wait_for_cert(const std::string& filepath) { - for (auto i = cert_change_timeout; i > 0; --i) { + auto attempts = cert_change_timeout/cert_check_sleep; + for (auto i = attempts; i > 0; --i) { if (std::filesystem::exists(filepath)) { return true; } std::this_thread::sleep_for(cert_check_sleep); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 4daaad9b3..e0d1e6718 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -31,7 +31,7 @@ namespace homestore { -constexpr auto cert_change_timeout = 1200; +constexpr auto cert_change_timeout = std::chrono::seconds(1200); constexpr auto cert_check_sleep = std::chrono::seconds(1); struct repl_dev_superblk; From d331d32bc23705b5f55e0f87c0462bebe951186f Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 17 Oct 2024 21:36:28 -0700 Subject: [PATCH 008/130] Fix read_io in dataservice test. The previous code could underflow remaining_io_size: in `remaining_io_size -= sub_io_size;`, sub_io_size could be greater than remaining_io_size, and because remaining_io_size is unsigned it wrapped around to a huge number, so the read loop took ages to finish.
Signed-off-by: Xiaoxi Chen --- src/tests/test_data_service.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index 0974ca431..e6c47e211 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -445,7 +445,7 @@ class BlkDataServiceTest : public testing::Test { void read_io(uint32_t io_size) { auto remaining_io_size = io_size; while (remaining_io_size > 0) { - auto const bid = get_rand_blkid_to_read(io_size); + auto const bid = get_rand_blkid_to_read(remaining_io_size); if (!bid.is_valid()) { // didn't find any block to read, either write blk map is empty or // all blks are pending on free. @@ -455,6 +455,7 @@ class BlkDataServiceTest : public testing::Test { // every piece in bid is a single block, e.g. nblks = 1 auto const nbids = bid.num_pieces(); auto sub_io_size = nbids * inst().get_blk_size(); + HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size"); // we pass crc from lambda becaues if there is any async_free_blk, the written blks in the blkcrc map will // be removed by the time read thenVlue is called; @@ -581,7 +582,7 @@ class BlkDataServiceTest : public testing::Test { auto nbids = io_size / inst().get_blk_size(); // number of blks to read; // nbids should not exceed max pieces that MultiBlkId can hold; - nbids = std::max(nbids, MultiBlkId::max_addln_pieces); + nbids = std::min(nbids, MultiBlkId::max_addln_pieces); // make sure skip + nbids are in the range of m_blk_crc_map; if (skip_nbids + nbids > m_blk_crc_map.size()) { skip_nbids = m_blk_crc_map.size() - nbids; } From 90fd1a077bc7f1192cf04a8df18e4c73996bcfb2 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 21 Oct 2024 17:45:22 -0700 Subject: [PATCH 009/130] FIX wbcache for put and modify long running index (#567) --- .../btree/detail/btree_mutate_impl.ipp | 5 + 
src/lib/index/inplace_btree/index_cp.hpp | 5 +- .../index/inplace_btree/inplace_btree_store.h | 11 +- src/lib/index/inplace_btree/wb_cache.cpp | 169 ++++++--- src/lib/index/inplace_btree/wb_cache.hpp | 2 +- src/tests/btree_helpers/shadow_map.hpp | 1 + src/tests/test_index_crash_recovery.cpp | 339 ++++++++++++++---- src/tests/test_mem_btree.cpp | 2 + src/tests/test_scripts/index_test.py | 17 +- 9 files changed, 417 insertions(+), 134 deletions(-) diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 0df733575..0a8f57686 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -357,6 +357,11 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const child_node1->inc_link_version(); // Update the existing parent node entry to point to second child ptr. + // Don't change the order. First update the parent node and then insert the new key. This is important for casee + // where the split key is the last key in the parent node. In this case, the split key should be inserted in the + // parent node. If we insert the split key first, then the split key will be inserted in the parent node and the + // last key in the parent node will be lost. This will lead to inconsistency in the tree. In case of empty parent + // (i.e., new root) or updating the edge, this order made sure that edge is updated. parent_node->update(parent_ind, child_node2->link_info()); parent_node->insert(parent_ind, *out_split_key, child_node1->link_info()); diff --git a/src/lib/index/inplace_btree/index_cp.hpp b/src/lib/index/inplace_btree/index_cp.hpp index c8292c47f..b04b8f052 100644 --- a/src/lib/index/inplace_btree/index_cp.hpp +++ b/src/lib/index/inplace_btree/index_cp.hpp @@ -92,12 +92,12 @@ struct IndexCPContext : public VDevCPContext { } std::string parent_id_string() const { - return (has_inplace_parent == 0x1) ? 
fmt::format("chunk={}, blk={}", ids[0].second, ids[0].first) : "empty"; + return (has_inplace_parent == 0x1) ? fmt::format("{}", blk_id(0).to_integer()) : "empty"; } std::string child_id_string() const { auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; - return (has_inplace_child == 0x1) ? fmt::format("chunk={}, blk={}", ids[idx].second, ids[idx].first) + return (has_inplace_child == 0x1) ? fmt::format("{}", blk_id(idx).to_integer()) : "empty"; } @@ -160,6 +160,7 @@ struct IndexCPContext : public VDevCPContext { std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); std::string to_string_with_dags(); + uint16_t num_dags(); void to_string_dot(const std::string& filename); private: diff --git a/src/lib/index/inplace_btree/inplace_btree_store.h b/src/lib/index/inplace_btree/inplace_btree_store.h index 4552c2516..484901fa3 100644 --- a/src/lib/index/inplace_btree/inplace_btree_store.h +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -247,10 +247,13 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { + // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ + // return btree_status_t::success;} m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); if (!wb_cache().refresh_meta_buf(m_sb_buffer, r_cast< CPContext* >(context))) { + LOGTRACEMOD(wbcache, "CP mismatch error - discard transact for meta node"); return btree_status_t::cp_mismatch; } @@ -261,8 +264,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string()); - - // Get the last key in the node + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs + // to be handled. 
Get the last key in the node auto const last_parent_key = parent_node->get_last_key< K >(); auto const is_parent_edge_node = parent_node->has_valid_edge(); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { @@ -308,8 +311,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), child_node->to_string(), child_last_key.to_string()); - if (child_last_key.compare(last_parent_key) > 0) { - // We have reached the last key, we can stop now + if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) { + // We have reached the last key, and the parent node doesn't have edge, so we can stop now break; } diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 899d7475a..f75f4e63a 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -194,14 +194,19 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte return false; // meta_buf modified by a newer CP, we shouldn't overwrite that } else if (meta_buf->m_dirtied_cp_id == cp_ctx->id()) { // Modified by the same cp, no need to create new index buffer, but we only copy the superblk to the buffer + LOGTRACEMOD(wbcache, "meta buf {} is already dirtied in cp {} now is in recovery {}", meta_buf->to_string(), + cp_ctx->id(), m_in_recovery); meta_buf->copy_sb_to_buf(); + // TODO: corner case , meta buffer is dirtied by the same cp but not added to dirty list due to previously + // recovery mode } else { // We always create a new meta index buffer on every meta buf update, which copies the superblk auto new_buf = std::make_shared< MetaIndexBuffer >(meta_buf); new_buf->m_dirtied_cp_id = cp_ctx->id(); write_buf(nullptr, new_buf, cp_ctx); meta_buf = new_buf; // Replace the meta_buf with new buf - LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), 
cp_ctx->id()); + LOGTRACEMOD(wbcache, "meta buf {} is created in cp {} in recovery = {}", meta_buf->to_string(), cp_ctx->id(), + m_in_recovery); } return true; } @@ -290,10 +295,49 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p freed_node_bufs // free_node_bufs ); } +#ifdef _PRERELEASE + // log new nodes and freed nodes and parent and child + static uint32_t txn_id = 0; + static int last_cp_id = -2; + static std::string txn = ""; + if (last_cp_id != icp_ctx->id()) { + last_cp_id = icp_ctx->id(); + txn_id = 0; + txn = ""; + } + + if (new_node_bufs.empty() && freed_node_bufs.empty()) { + fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, + (parent_buf && parent_buf->blkid().to_integer() != 0) + ? std::to_string(parent_buf->blkid().to_integer()) + : "empty", + child_buf->blkid().to_integer(), "empty", "empty"); + } else { + std::string new_nodes; + for (auto const& buf : new_node_bufs) { + new_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string freed_nodes; + for (auto const& buf : freed_node_bufs) { + freed_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string parent_str = (parent_buf && parent_buf->blkid().to_integer() != 0) + ? std::to_string(parent_buf->blkid().to_integer()) + : "empty"; + std::string child_str = (child_buf && child_buf->blkid().to_integer() != 0) + ? 
std::to_string(child_buf->blkid().to_integer()) + : "empty"; + + fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str, + child_str, new_nodes, freed_nodes); + } + LOGTRACEMOD(wbcache, "\ttranasction till now: cp: {} \n{}\n", icp_ctx->id(), txn); + txn_id++; +#endif #if 0 static int id = 0; - auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot"; - LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename); + auto filename = fmt::format("txn_buf_{}_{}.dot", icp_ctx->id(), id++); + LOGTRACEMOD(wbcache,"Writing txn to file: {}", filename); icp_ctx->to_string_dot(filename); #endif } @@ -388,6 +432,14 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { } //////////////////// Recovery Related section ///////////////////////////////// +void IndexWBCache::load_buf(IndexBufferPtr const& buf) { + if (buf->m_bytes == nullptr) { + buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); + m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); + buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); + } +} + void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. if ((sb.bytes() == nullptr) || (sb.size() == 0)) { @@ -406,6 +458,29 @@ void IndexWBCache::recover(sisl::byte_view sb) { LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... 
", icp_ctx->id(), bufs.size()); +#ifdef _PRERELEASE + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, + std::vector< IndexBufferPtr > const& l0_bufs) { + std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); + for (auto const& [_, buf] : bufs) { + load_buf(buf); + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + + // list of new_bufs + if (!l0_bufs.empty()) { + fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); + for (auto const& buf : l0_bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + } + return log; + }; + + std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); + LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); +#endif + // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one // addition of all freed buffers also put in the DAG structure. 
// @@ -431,30 +506,30 @@ void IndexWBCache::recover(sisl::byte_view sb) { l0_bufs.push_back(buf); } else { buf->m_up_buffer->m_wait_for_down_buffers.decrement(); +#ifndef NDEBUG + bool found{false}; + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); + it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + auto sp = it->lock(); + if (sp && sp == buf) { + found = true; + buf->m_up_buffer->m_down_buffers.erase(it); + break; + } + } + HS_DBG_ASSERT(found, + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif } } } } +#ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", l0_bufs.size(), bufs.size(), icp_ctx->id()); - - auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { - // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs - std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const& [_, buf] : bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } - - // list of new_bufs - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } - return log; - }; LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); +#endif // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be // repaired. 
All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in @@ -467,7 +542,10 @@ void IndexWBCache::recover(sisl::byte_view sb) { } void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { - if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; } + if (!buf->m_wait_for_down_buffers.decrement_testz()) { + // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer + return; + } // All down buffers are completed and given a nod saying that they are committed. If this buffer is not committed, // then we need to repair this node/buffer. After that we will keep going to the next up level to repair them if @@ -493,21 +571,21 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { } // All down_buf has indicated that they have seen this up buffer, now its time to repair them. - if (buf->m_bytes == nullptr) { - // Read the btree node and get its modified cp_id - buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); - m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); - if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } - - buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); - } - auto cpg = cp_mgr().cp_guard(); - return (buf->m_dirtied_cp_id == cpg->id()); + load_buf(buf); + if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } + return (buf->m_dirtied_cp_id == cp_mgr().cp_guard()->id()); } //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(), + cp_ctx->to_string()); + // #ifdef _PRERELEASE + // static int id = 0; + // auto 
filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot"; + // LOGTRACEMOD(wbcache, "Transact cp storing in file {}\n\n\n", filename); + // cp_ctx->to_string_dot(filename); + // #endif if (!cp_ctx->any_dirty_buffers()) { if (cp_ctx->id() == 0) { // For the first CP, we need to flush the journal buffer to the meta blk @@ -521,17 +599,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { #ifdef _PRERELEASE if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush"); + LOGINFO("crash simulation is ongoing, so skip the cp flush"); return folly::makeFuture< bool >(true); } #endif - // First thing is to flush the new_blks created as part of the CP. + // First thing is to flush the journal created as part of the CP. auto const& journal_buf = cp_ctx->journal_buf(); + auto txn = r_cast< IndexCPContext::txn_journal const* >(journal_buf.cbytes()); if (journal_buf.size() != 0) { if (m_meta_blk) { + LOGTRACEMOD(wbcache, " journal {} ", txn->to_string()); meta_service().update_sub_sb(journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } else { + LOGTRACEMOD(wbcache, " First time journal {} ", txn->to_string()); meta_service().add_sub_sb("wb_cache", journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } } @@ -554,21 +635,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE + static std::once_flag flag; if (buf->m_crash_flag_on) { -// std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; -// LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); -// cp_ctx->to_string_dot(filename); - LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); + std::string filename = "crash_buf_" + 
std::to_string(cp_ctx->id()) + ".dot"; + LOGINFO("Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); + // cp_ctx->to_string_dot(filename); hs()->crash_simulator().crash(); cp_ctx->complete(true); return; } else if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); + std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); return; } #endif - LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string()); buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { @@ -582,16 +662,13 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), - BtreeNode::to_string_buf(buf->raw_buffer())); + LOGTRACEMOD(wbcache, "flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { - LOGERROR("Failed to access write-back cache: {}", e.what()); - } + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } }); if (!part_of_batch) { m_vdev->submit_batch(); } @@ -600,8 +677,10 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) { #ifdef _PRERELEASE + static std::once_flag flag; if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); + std::call_once( + flag, []() { LOGINFOMOD(wbcache, "Crash simulation is 
ongoing, ignore all process_write_completion"); }); return; } #endif diff --git a/src/lib/index/inplace_btree/wb_cache.hpp b/src/lib/index/inplace_btree/wb_cache.hpp index 209d3845e..25a4c8201 100644 --- a/src/lib/index/inplace_btree/wb_cache.hpp +++ b/src/lib/index/inplace_btree/wb_cache.hpp @@ -41,7 +41,6 @@ class IndexWBCache : public IndexWBCacheBase { std::mutex m_flush_mtx; void* m_meta_blk; bool m_in_recovery{false}; - public: IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size); @@ -78,5 +77,6 @@ class IndexWBCache : public IndexWBCacheBase { void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); + void load_buf(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 8aae946d3..7d2070e04 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -242,6 +242,7 @@ class ShadowMap { file << key.key() << " " << value << '\n'; } file.close(); + LOGINFO("Saved shadow map to file: {}", filename); } void load(const std::string& filename) { diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 77fdfb651..6143fd242 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -33,25 +33,30 @@ SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. 
-SISL_OPTION_GROUP(test_index_crash_recovery, - (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), - (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), - "seconds"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), - (operation_list, "", "operation_list", - "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), - (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), - (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), - (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) +SISL_OPTION_GROUP( + test_index_crash_recovery, + (num_iters, "", "num_iters", "number of iterations for rand ops", + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + (num_entries, "", "num_entries", "number of entries to test with", + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + (num_rounds, "", "num_rounds", "number of rounds to test with", + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", + ::cxxopts::value< uint32_t >()->default_value("40"), "number"), + 
(max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), + ""), + (operation_list, "", "operation_list", "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (preload_size, "", "preload_size", "number of entries to preload tree with", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), + (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), + (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), + (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", + ::cxxopts::value< bool >()->default_value("1"), ""), + (seed, "", "seed", "random engine seed, use random if not defined", + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -73,8 +78,6 @@ class SequenceGenerator { public: SequenceGenerator(int putFreq, int removeFreq, uint64_t start_range, uint64_t end_range) : putFreq_(putFreq), removeFreq_(removeFreq), start_range_(start_range), end_range_(end_range) { - std::random_device rd; - gen_ = std::mt19937(rd()); keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); updateOperationTypeDistribution(); } @@ -99,11 +102,11 @@ class SequenceGenerator { std::vector< Operation > operations; if (reset) { this->reset(); } for (size_t i = 0; i < numOperations; ++i) { - uint32_t key = keyDist_(gen_); + uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; - OperationType operation = static_cast< OperationType >(opTypeDist_(gen_)); + OperationType operation = static_cast< OperationType >(opTypeDist_(g_re)); if (operation == OperationType::Put && 
!inUse) { operations.emplace_back(key, OperationType::Put); @@ -130,15 +133,16 @@ class SequenceGenerator { } return occurrences; } - __attribute__((noinline)) std::string printOperations(const OperationList& operations) const { + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; + auto count = 1; for (const auto& [key, opType] : operations) { std::string opTypeStr = (opType == OperationType::Put) ? "Put" : "Remove"; - oss << "{" << key << ", " << opTypeStr << "}\n"; + oss << count++ << "- {" << key << ", " << opTypeStr << "}\n"; } return oss.str(); } - __attribute__((noinline)) std::string printKeysOccurrences(const OperationList& operations) const { + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; for (auto key : keys) { @@ -151,16 +155,51 @@ class SequenceGenerator { } return oss.str(); } - __attribute__((noinline)) std::string printKeyOccurrences(const OperationList& operations, uint64_t key ) const { + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); oss << "Occurrences of key " << key << ":\n"; for (const auto& [index, operation] : keyOccurrences) { std::string opTypeStr = (operation == OperationType::Put) ? 
"Put" : "Remove"; - oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; + oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; } return oss.str(); } + + static std::set< uint64_t > collectUniqueKeys(const OperationList& operations) { + std::set< uint64_t > keys; + for (const auto& [key, _] : operations) { + keys.insert(key); + } + return keys; + } + static void save_to_file(std::string filename, const OperationList& operations) { + std::ofstream file(filename); + if (file.is_open()) { + for (const auto& [key, opType] : operations) { + file << key << " " << static_cast< int >(opType) << "\n"; + } + file.close(); + } + } + + static OperationList load_from_file(std::string filename) { + std::ifstream file(filename); + OperationList operations; + if (file.is_open()) { + std::string line; + while (std::getline(file, line)) { + std::istringstream iss(line); + uint64_t key; + int opType; + iss >> key >> opType; + operations.emplace_back(key, static_cast< OperationType >(opType)); + } + file.close(); + } + return operations; + } + void reset() { keyStates.clear(); } private: @@ -168,7 +207,6 @@ class SequenceGenerator { int removeFreq_; uint64_t start_range_; uint64_t end_range_; - std::mt19937 gen_; std::uniform_int_distribution<> keyDist_; std::discrete_distribution<> opTypeDist_; std::map< uint64_t, bool > keyStates; @@ -177,15 +215,8 @@ class SequenceGenerator { opTypeDist_ = std::discrete_distribution<>({static_cast< double >(putFreq_), static_cast< double >(removeFreq_)}); } - - std::set< uint64_t > collectUniqueKeys(const OperationList& operations) const { - std::set< uint64_t > keys; - for (const auto& [key, _] : operations) { - keys.insert(key); - } - return keys; - } }; + #ifdef _PRERELEASE template < typename TestType > struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { @@ -197,7 +228,9 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT 
TestIndexServiceCallbacks(IndexCrashTest* test) : m_test(test) {} std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { - LOGINFO("Index table recovered, root bnode_id {} version {}", sb->root_node, sb->root_link_version); + LOGINFO("Index table recovered, root bnode_id {} uuid {} ordinal {} version {}", + static_cast< uint64_t >(sb->root_node), boost::uuids::to_string(sb->uuid), sb->ordinal, + sb->root_link_version); m_test->m_cfg = BtreeConfig(hs()->index_service().node_size()); m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; @@ -240,18 +273,29 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT BtreeTestHelper< TestType >::SetUp(); if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + // LOGINFO("Creating new index table with uuid {} - init_device:{:s} bt: {} root id {}, num of + // keys {}", boost::uuids::to_string(uuid), SISL_OPTIONS["init_device"].as< bool >(), + // this->m_bt, this->m_bt->root_node_id(), num_keys); + LOGINFO("Creating new index table with uuid {} - root id {}, num of keys {}", boost::uuids::to_string(uuid), + this->m_bt->root_node_id(), num_keys); + } else { populate_shadow_map(); } hs()->index_service().add_index_table(this->m_bt); - LOGINFO("Added index table to index service"); + LOGINFO("Added index table to index service with uuid {} - total tables in the system is currently {}", + boost::uuids::to_string(uuid), hs()->index_service().num_tables()); } void populate_shadow_map() { + LOGINFO("Populating shadow map"); this->m_shadow_map.load(m_shadow_filename); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + LOGINFO("Shadow 
map size {} - btree keys {} - root id {}", this->m_shadow_map.size(), num_keys, + this->m_bt->root_node_id()); + ASSERT_EQ(this->m_shadow_map.size(), num_keys) << "shadow map size and tree size mismatch"; this->get_all(); } @@ -262,6 +306,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); hs()->index_service().add_index_table(this->m_bt); this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(m_shadow_filename); + LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } void restart_homestore(uint32_t shutdown_delay_sec = 3) override { @@ -273,7 +319,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); + LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -285,7 +331,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT for (const auto& [k, addition] : diff) { dif_str += fmt::format(" {} \t{}\n", k.key(), addition); } - LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); + LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); for (const auto& [k, addition] : diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); @@ -323,8 +369,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Error: failed to remove {}", m_shadow_filename); } } - LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), - this->m_bt->count_keys(this->m_bt->root_node_id())); + 
LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), this->tree_key_count()); BtreeTestHelper< TestType >::TearDown(); this->shutdown_homestore(false); } @@ -340,30 +385,67 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; + } + + void sanity_check(OperationList& operations) const { + std::set< uint64_t > new_keys; + std::transform(operations.begin(), operations.end(), std::inserter(new_keys, new_keys.end()), + [](const Operation& operation) { return operation.first; }); + uint32_t count = 1; + this->m_shadow_map.foreach ([this, new_keys, &count](K key, V value) { + // discard the new keys to check + if (new_keys.find(key.key()) != new_keys.end()) { return; } + auto copy_key = std::make_unique< K >(); + *copy_key = key; + auto out_v = std::make_unique< V >(); + auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; + req.enable_route_tracing(); + const auto ret = this->m_bt->get(req); + ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; + LOGINFO("{} - Key {} passed sanity check!", count++, key.key()); + }); } void crash_and_recover(OperationList& operations, std::string filename = "") { - // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", + this->m_shadow_map.size(), tree_key_count(), operations.size()); + + if (!filename.empty()) { + std::string b_filename = filename + 
"_before_crash.dot"; + LOGINFO("Visualize the tree before crash file {}", b_filename); + this->visualize_keys(b_filename); + } + test_common::HSTestHelper::trigger_cp(false); + LOGINFO("waiting for crash to recover"); this->wait_for_crash_recovery(); - // this->print_keys("Post crash and recovery, btree structure:"); if (!filename.empty()) { - LOGINFO("Visualize the tree file {}", filename); - this->visualize_keys(filename); + std::string rec_filename = filename + "_after_recovery.dot"; + LOGINFO("Visualize the tree file after recovery : {}", rec_filename); + this->visualize_keys(rec_filename); + this->print_keys("Post crash and recovery, btree structure: "); } - + sanity_check(operations); + // Added to the index service right after recovery. Not needed here + // test_common::HSTestHelper::trigger_cp(true); + LOGINFO("Before Reapply: {} keys in shadow map and actually {} in trees operation size {}", + this->m_shadow_map.size(), tree_key_count(), operations.size()); this->reapply_after_crash(operations); - - // this->print_keys("\n\nafter reapply keys"); if (!filename.empty()) { - LOGINFO("Visualize the tree file after_reapply__{}", filename); - this->visualize_keys("after_reapply__" + filename); + std::string re_filename = filename + "_after_reapply.dot"; + LOGINFO("Visualize the tree after reapply {}", re_filename); + this->visualize_keys(re_filename); + this->print_keys("Post crash and recovery, btree structure: "); } this->get_all(); + LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), + tree_key_count()); + ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) + << "shadow map size and tree size mismatch"; } uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); } @@ -377,6 +459,8 @@ using BtreeTypes = testing::Types< FixedLenBtree >; TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes); TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { + 
this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Simulate the crash even before first cp this->set_basic_flip("crash_flush_on_root"); @@ -392,6 +476,8 @@ TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { } TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Insert into 4 phases, first fill up the last part, since we need to test split on left edge LOGINFO("Step 1: Fill up the last quarter of the tree"); auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -524,11 +610,11 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", "crash_flush_on_split_at_right_child"}; OperationList operations; + bool renew_btree_after_crash = true; for (size_t i = 0; i < flips.size(); ++i) { - this->reset_btree(); LOGINFO("Step 1-{}: Set flag {}", i + 1, flips[i]); this->set_basic_flip(flips[i]); - operations = generator.generateOperations(num_entries -1 , true /* reset */); + operations = generator.generateOperations(num_entries - 1, renew_btree_after_crash /* reset */); // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1, // generator.printKeyOccurrences(operations)); @@ -537,49 +623,148 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { this->put(k, btree_put_type::INSERT, true /* expect_success */); } this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + if (renew_btree_after_crash) { this->reset_btree(); }; } } TYPED_TEST(IndexCrashTest, long_running_put_crash) { + // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + auto const preload_size = 
SISL_OPTIONS["preload_size"].as< uint32_t >(); + auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >(); + auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >(); + bool load_mode = SISL_OPTIONS.count("load_from_file"); + bool save_mode = SISL_OPTIONS.count("save_to_file"); SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", "crash_flush_on_split_at_right_child"}; + + std::string flip = ""; OperationList operations; auto m_start_time = Clock::now(); auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; double elapsed_time, progress_percent, last_progress_time = 0; - for (size_t i = 0; !time_to_stop(); ++i) { - bool print_time = false; - elapsed_time = get_elapsed_time_sec(m_start_time); + bool renew_btree_after_crash = false; + auto cur_flip_idx = 0; + std::uniform_int_distribution<> dis(1, 100); + int flip_percentage = 90; // Set the desired percentage here + bool normal_execution = true; + bool clean_shutdown = true; + // if it is safe then delete all previous save files + if (save_mode) { + std::filesystem::remove_all("/tmp/operations_*.txt"); + std::filesystem::remove_all("/tmp/flips_history.txt"); + } + // init tree + LOGINFO("Step 0: Fill up the tree with {} entries", preload_size); + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); + } else { + operations = generator.generateOperations(preload_size, true /* reset */); + if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } + } + auto opstr = SequenceGenerator::printOperations(operations); + LOGINFO("Lets before crash print operations\n{}", opstr); - this->reset_btree(); - auto flip = flips[i % flips.size()]; - LOGINFO("Step 1-{}: Set flag 
{}", i + 1, flip); + for (auto [k, _] : operations) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } - this->set_basic_flip(flip, 1, 10); - operations = generator.generateOperations(num_entries -1, true /* reset */); - // operations = generator.generateOperations(num_entries/10, false /* reset */); - // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); - // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1, - // generator.printKeyOccurrences(operations)); + // Trigger the cp to make sure middle part is successful + LOGINFO("Step 0-1: Flush all the entries so far"); + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + this->m_shadow_map.save(this->m_shadow_filename); + this->print_keys("reapply: after preload"); + this->visualize_keys("tree_after_preload.dot"); + + for (uint32_t round = 1; + round <= rounds && !time_to_stop() && this->tree_key_count() < num_entries - num_entries_per_rounds; round++) { + LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds); + bool print_time = false; + elapsed_time = get_elapsed_time_sec(m_start_time); + if (load_mode) { + std::ifstream file("/tmp/flips_history.txt"); + std::string line; + bool found = false; + for (uint32_t i = 0; i < round && std::getline(file, line); i++) { + if (i == round - 1) { + found = true; + break; + } + } + if (found && !line.empty()) { + if (line == "normal") { + normal_execution = true; + } else { + normal_execution = false; + flip = line; + LOGINFO("Step 1-{}: Set flag {}", round, flip); + this->set_basic_flip(flip, 1, 100); + } + } + file.close(); + } else { + if (dis(g_re) <= flip_percentage) { + flip = flips[cur_flip_idx++ % flips.size()]; + LOGINFO("Step 1-{}: Set flag {}", round, flip); + this->set_basic_flip(flip, 1, 100); + normal_execution = false; + } else { + normal_execution = true; + LOGINFO("Step 1-{}: No flip set", round); + } + if (save_mode) { + // save the filp name to a file for 
later use + std::ofstream file("/tmp/flips_history.txt", std::ios::app); + if (file.is_open()) { file << (normal_execution ? "normal" : flip) << "\n"; } + file.close(); + } + } + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); + if (save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); for (auto [k, _] : operations) { - // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); this->put(k, btree_put_type::INSERT, true /* expect_success */); + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); } - this->crash_and_recover(operations/*, fmt::format("recover_tree_crash_{}.dot", i + 1)*/); if (elapsed_time - last_progress_time > 30) { last_progress_time = elapsed_time; print_time = true; } if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} iterations completed - Elapsed time: {:.0f} seconds of total " - "{} ({:.2f}%)\n\n\n", - i, elapsed_time, this->m_run_time, elapsed_time * 100.0 / this->m_run_time); + LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time, + elapsed_time * 100.0 / 
this->m_run_time, this->tree_key_count(), num_entries, + this->tree_key_count() * 100.0 / num_entries); } - this->print_keys(fmt::format("reapply: after iteration {}", i)); - + this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { this->reset_btree(); }; } } #endif diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 45681f412..af50c12c2 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -46,6 +46,8 @@ SISL_OPTION_GROUP( ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", + ::cxxopts::value< uint32_t >()->default_value("0"), ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 4e4814ccb..02c3e4c2c 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -20,11 +20,13 @@ def run_test(options, type): raise TestFailedError(f"Test failed for type {type}") print("Test completed") + def run_crash_test(options): - cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --max_keys_in_node={options['max_keys_in_node']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} {options['dev_list']}" + cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} 
--num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " # print(f"Running test with options: {cmd_opts}") try: - subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, shell=True) + subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, + shell=True) except subprocess.CalledProcessError as e: print(f"Test failed: {e}") raise TestFailedError(f"Test failed for type {type}") @@ -49,7 +51,9 @@ def parse_arguments(): parser.add_argument('--dev_list', help='Device list', default='') parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) - parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=20) + parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) + parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60) # Parse the known arguments and ignore any unknown arguments args, unknown = parser.parse_known_args() @@ -73,7 +77,6 @@ def long_runnig_index(options, type=0): def long_running_clean_shutdown(options, type=0): print("Long running clean shutdown started") - options['run_time'] = int(options['run_time']) // 10 # 20 minutes try: run_test(options, type) @@ -87,14 +90,18 @@ def long_running_clean_shutdown(options, type=0): raise print("Long running clean shutdown completed") + def long_running_crash_put(options): print("Long running crash put started") - options['num_entries'] = 
20480 # 20K + options['num_entries'] = 131072 # 128K options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['preload_size'] = 100 print(f"options: {options}") run_crash_test(options) print("Long running crash put completed") + def main(): options = parse_arguments() test_suite_name = options['test_suits'] From 69e621c74c3c3397ae7b2b32af2f65cecb9eeb71 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Mon, 21 Oct 2024 15:04:05 +0800 Subject: [PATCH 010/130] Count in ovf headers. We see a no-space error in the write_to_full ut, which might be due to the case where left space == max_wrt_sz and we take max_wrt_sz; however, two extra blks are needed. Signed-off-by: Xiaoxi Chen --- src/tests/test_meta_blk_mgr.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index 870dd5191..8d47cb24a 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -193,7 +193,8 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t size_written{0}; while (free_size > 0) { - if (free_size >= gp.max_wrt_sz) { + // if it is overflow, 2 extra blocks are needed for ovf blk header and meta blk; + if (free_size - 2 * m_mbm->block_size() >= gp.max_wrt_sz) { size_written = do_sb_write(do_overflow(), 0); } else { size_written = do_sb_write(false, m_mbm->meta_blk_context_sz()); From e6cb8ea41a26e84c37aa4b7ae9452806c2fb2241 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:24:14 -0700 Subject: [PATCH 011/130] Reduce logs (#571) --- src/tests/test_index_crash_recovery.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 6143fd242..9eefa486b 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -403,8 +403,9 @@ struct IndexCrashTest : public
test_common::HSTestHelper, BtreeTestHelper< TestT req.enable_route_tracing(); const auto ret = this->m_bt->get(req); ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; - LOGINFO("{} - Key {} passed sanity check!", count++, key.key()); }); + LOGINFO("Sanity check passed for {} keys!", count); + } void crash_and_recover(OperationList& operations, std::string filename = "") { @@ -438,7 +439,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::string re_filename = filename + "_after_reapply.dot"; LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); - this->print_keys("Post crash and recovery, btree structure: "); +// this->print_keys("Post crash and recovery, btree structure: "); } this->get_all(); From 6756e951d5dbe6e69fa50c78df477e5ee99c8b15 Mon Sep 17 00:00:00 2001 From: Sanal P Date: Mon, 21 Oct 2024 13:51:28 -0700 Subject: [PATCH 012/130] Change replace member api signature. Add replica member info with name, priority and id. Use replica member info for replace member api and listener callbacks. 
--- .../homestore/replication/repl_decls.h | 7 ++ src/include/homestore/replication/repl_dev.h | 2 +- src/include/homestore/replication_service.hpp | 3 +- .../replication/repl_dev/raft_repl_dev.cpp | 81 ++++++++++--------- src/lib/replication/repl_dev/raft_repl_dev.h | 11 +-- .../replication/service/generic_repl_svc.cpp | 4 +- .../replication/service/generic_repl_svc.h | 4 +- .../replication/service/raft_repl_service.cpp | 25 +++--- .../replication/service/raft_repl_service.h | 15 ++-- src/tests/test_common/raft_repl_test_base.hpp | 11 ++- src/tests/test_solo_repl_dev.cpp | 2 +- 11 files changed, 89 insertions(+), 76 deletions(-) diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 558c19517..192a418bc 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -74,6 +74,13 @@ struct peer_info { uint64_t last_succ_resp_us_; }; +struct replica_member_info { + static constexpr uint64_t max_name_len = 128; + replica_id_t id; + char name[max_name_len]; + int32_t priority{0}; +}; + } // namespace homestore // hash function definitions diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 15dc4872a..c2223455f 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -348,7 +348,7 @@ class ReplDevListener { virtual void on_destroy() = 0; /// @brief Called when replace member is performed. 
- virtual void replace_member(replica_id_t member_out, replica_id_t member_in) = 0; + virtual void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index f9b4f2986..c3e56d9a3 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,7 +41,8 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 565bc0d67..f3a4a2461 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -74,7 +74,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); } m_rd_sb.write(); - bind_data_service(); + bind_data_service(); } RD_LOG(INFO, @@ -90,27 +90,30 @@ bool RaftReplDev::bind_data_service() { RD_LOG(INFO, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); bool success = false; #ifdef _PRERELEASE - success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< 
sisl::GenericRpcData >& rpc_data) { - if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { - RD_LOGI("Resuming after slow down data channel flip"); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { + if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { + RD_LOGI("Resuming after slow down data channel flip"); + on_push_data_received(rpc_data); + })) { + RD_LOGI("Slow down data channel flip is enabled, scheduling to call later"); + } else { on_push_data_received(rpc_data); - })) { - RD_LOGI("Slow down data channel flip is enabled, scheduling to call later"); - } else { - on_push_data_received(rpc_data); - } - }); + } + }); #else - success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); #endif if (!success) { RD_LOGE("Failed to bind data service request for PUSH_DATA"); - return false; + return false; } - success = m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + success = + m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); if (!success) { RD_LOGE("Failed to bind data service request for FETCH_DATA"); - return false; + return false; } return true; } @@ -127,10 +130,10 @@ bool RaftReplDev::join_group() { return true; } -AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid, - uint32_t commit_quorum) { +AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum) { LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(), - 
boost::uuids::to_string(member_out_uuid), boost::uuids::to_string(member_in_uuid)); + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { // Two members are down and leader cant form the quorum. Reduce the quorum size. @@ -138,7 +141,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl } // Step 1: Check if leader itself is requested to move out. - if (m_my_repl_id == member_out_uuid && m_my_repl_id == get_leader_id()) { + if (m_my_repl_id == member_out.id && m_my_repl_id == get_leader_id()) { // If leader is the member requested to move out, then give up leadership and return error. // Client will retry replace_member request to the new leader. raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); @@ -148,9 +151,9 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl } // Step 2. Add the new member. - return m_msg_mgr.add_member(m_group_id, member_in_uuid) + return m_msg_mgr.add_member(m_group_id, member_in.id) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_in_uuid, member_out_uuid, commit_quorum](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_in, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> { // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout // when adding member. Member is added to cluster config until member syncs fully // with atleast stop gap. 
This will take a lot of time for block or @@ -168,18 +171,17 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl return make_async_error<>(RaftReplService::to_repl_error(e.error())); } } - auto member_out = boost::uuids::to_string(member_out_uuid); - auto member_in = boost::uuids::to_string(member_in_uuid); - RD_LOGI("Replace member added member={} to group_id={}", member_in, group_id_str()); + RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), + group_id_str()); // Step 3. Append log entry to mark the old member is out and new member is added. auto rreq = repl_req_ptr_t(new repl_req_ctx{}); replace_members_ctx members; - std::copy(member_in_uuid.begin(), member_in_uuid.end(), members.in_replica_id.begin()); - std::copy(member_out_uuid.begin(), member_out_uuid.end(), members.out_replica_id.begin()); - sisl::blob header(r_cast< uint8_t* >(&members), - members.in_replica_id.size() + members.out_replica_id.size()); + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); rreq->init( repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0); @@ -196,7 +198,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl // Step 4. Remove the old member. Even if the old member is temporarily // down and recovers, nuraft mesg see member remove from cluster log // entry and call exit_group() and leave(). 
- return m_msg_mgr.rem_member(m_group_id, member_out_uuid) + return m_msg_mgr.rem_member(m_group_id, member_out.id) .via(&folly::InlineExecutor::instance()) .thenValue([this, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> { if (e.hasError()) { @@ -212,7 +214,8 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl return make_async_error<>(ReplServiceError::RETRY_REQUEST); } } else { - RD_LOGI("Replace member removed member={} from group_id={}", member_out, group_id_str()); + RD_LOGI("Replace member removed member={} from group_id={}", + boost::uuids::to_string(member_out.id), group_id_str()); } // Revert the quorum size back to 0. @@ -957,13 +960,11 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) void RaftReplDev::replace_member(repl_req_ptr_t rreq) { auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes()); - replica_id_t member_in, member_out; - std::copy(members->out_replica_id.begin(), members->out_replica_id.end(), member_out.begin()); - std::copy(members->in_replica_id.begin(), members->in_replica_id.end(), member_in.begin()); - RD_LOGI("Raft repl replace_member member_out={} member_in={}", boost::uuids::to_string(member_out), - boost::uuids::to_string(member_in)); - m_listener->replace_member(member_out, member_in); + RD_LOGI("Raft repl replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_replace_member(members->replica_out, members->replica_in); } static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { @@ -1224,7 +1225,7 @@ void RaftReplDev::flush_durable_commit_lsn() { } /////////////////////////////////// Private metohds //////////////////////////////////// -void RaftReplDev::cp_flush(CP* cp, cshared ctx) { +void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { auto const lsn = ctx->cp_lsn; auto const clsn = 
ctx->compacted_to_lsn; auto const dsn = ctx->last_applied_dsn; @@ -1247,14 +1248,14 @@ void RaftReplDev::cp_flush(CP* cp, cshared ctx) { cp->to_string()); } -cshared RaftReplDev::get_cp_ctx(CP* cp) { +cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { auto const cp_lsn = m_commit_upto_lsn.load(); auto const clsn = m_compact_lsn.load(); auto const dsn = m_next_dsn.load(); - RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", - (void *)this, cp_lsn, clsn, dsn, cp->to_string()); - auto dev_ctx = std::make_shared(); + RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", (void*)this, cp_lsn, + clsn, dsn, cp->to_string()); + auto dev_ctx = std::make_shared< ReplDevCPContext >(); dev_ctx->cp_lsn = cp_lsn; dev_ctx->compacted_to_lsn = clsn; dev_ctx->last_applied_dsn = dsn; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 3b25cb23b..4be98394c 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -36,8 +36,8 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); struct replace_members_ctx { - std::array< uint8_t, 16 > out_replica_id; - std::array< uint8_t, 16 > in_replica_id; + replica_member_info replica_out; + replica_member_info replica_in; }; class RaftReplDevMetrics : public sisl::MetricsGroup { @@ -162,7 +162,8 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum); + AsyncReplResult<> replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + uint32_t commit_quorum); folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation 
/////////////////////// @@ -199,8 +200,8 @@ class RaftReplDev : public ReplDev, sisl::blob const& key, uint32_t data_size, bool is_data_channel); folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs); - void cp_flush(CP* cp, cshared ctx); - cshared get_cp_ctx(CP* cp); + void cp_flush(CP* cp, cshared< ReplDevCPContext > ctx); + cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 8e5c9a7a1..9aa2c044d 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -147,8 +147,8 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 5e0cb84a3..acdff7bd4 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,8 +73,8 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum = 0) const override; + AsyncReplResult<> 
replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index c4aefe1ca..0469d7829 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -85,12 +85,11 @@ void RaftReplService::start() { LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), params.mesg_port_); - //check if ssl cert files are provided, if yes, monitor the changes + // check if ssl cert files are provided, if yes, monitor the changes if (!params.ssl_key_.empty() && !params.ssl_cert_.empty()) { ioenvironment.with_file_watcher(); monitor_cert_changes(); } - // Step 2: Register all RAFT parameters. At the end of this step, raft is ready to be created/join group auto r_params = nuraft::raft_params() @@ -158,7 +157,7 @@ void RaftReplService::start() { auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); rdev->wait_for_logstore_ready(); if (!rdev->join_group()) { - HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); + HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); it = m_rd_map.erase(it); } else { ++it; @@ -191,19 +190,19 @@ void RaftReplService::monitor_cert_changes() { restart_svc.detach(); }; - //monitor ssl cert file + // monitor ssl cert file if (!fw->register_listener(ioenvironment.get_ssl_cert(), "hs_ssl_cert_watcher", cert_change_cb)) { - LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", - "hs_ssl_cert_watcher", ioenvironment.get_ssl_cert()); + LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", "hs_ssl_cert_watcher", + ioenvironment.get_ssl_cert()); } - //monitor ssl key file + // monitor ssl key file if 
(!fw->register_listener(ioenvironment.get_ssl_key(), "hs_ssl_key_watcher", cert_change_cb)) { - LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", - "hs_ssl_key_watcher", ioenvironment.get_ssl_key()); + LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", "hs_ssl_key_watcher", + ioenvironment.get_ssl_key()); } } -void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted){ +void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted) { if (deleted && !wait_for_cert(filepath)) { LOGINFO("file {} deleted, ", filepath) // wait for the deleted file to be added again @@ -215,7 +214,7 @@ void RaftReplService::restart_raft_svc(const std::string filepath, const bool de } bool RaftReplService::wait_for_cert(const std::string& filepath) { - auto attempts = cert_change_timeout/cert_check_sleep; + auto attempts = cert_change_timeout / cert_check_sleep; for (auto i = attempts; i > 0; --i) { if (std::filesystem::exists(filepath)) { return true; } std::this_thread::sleep_for(cert_check_sleep); @@ -394,8 +393,8 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum) const { +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum) const { auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index e0d1e6718..9a53ad07d 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -51,7 +51,7 @@ class 
RaftReplService : public GenericReplService, iomgr::timer_handle_t m_flush_durable_commit_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; std::mutex raft_restart_mutex; - + public: RaftReplService(cshared< ReplApplication >& repl_app); @@ -73,8 +73,8 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum = 0) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); @@ -98,12 +98,13 @@ struct ReplDevCPContext; class ReplSvcCPContext : public CPContext { std::shared_mutex m_cp_map_mtx; - std::map< ReplDev*, cshared > m_cp_ctx_map; + std::map< ReplDev*, cshared< ReplDevCPContext > > m_cp_ctx_map; + public: - ReplSvcCPContext(CP* cp) : CPContext(cp){}; + ReplSvcCPContext(CP* cp) : CPContext(cp) {}; virtual ~ReplSvcCPContext() = default; - int add_repl_dev_ctx(ReplDev* dev, cshared dev_ctx); - cshared get_repl_dev_ctx(ReplDev* dev); + int add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx); + cshared< ReplDevCPContext > get_repl_dev_ctx(ReplDev* dev); }; class RaftReplServiceCPHandler : public CPCallbacks { diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index a3160f13a..e0e2f6487 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -301,7 +301,10 @@ class TestReplicatedDB : public homestore::ReplDevListener { ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& 
header, uint32_t data_size) override { return blk_alloc_hints{}; } - void replace_member(replica_id_t member_out, replica_id_t member_in) override {} + void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override { + LOGINFO("[Replica={}] replace member out {} in {}", g_helper->replica_num(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + } void on_destroy() override { LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), @@ -615,9 +618,9 @@ class RaftReplDevTestBase : public testing::Test { this->run_on_leader(db, [this, db, member_out, member_in, commit_quorum]() { LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); - auto v = hs()->repl_service() - .replace_member(db->repl_dev()->group_id(), member_out, member_in, commit_quorum) - .get(); + replica_member_info out{member_out, ""}; + replica_member_info in{member_in, ""}; + auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); ASSERT_EQ(v.hasError(), false) << "Error in replacing member"; }); } diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index c358f71ce..e446c3cd5 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -135,7 +135,7 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void replace_member(replica_id_t member_out, replica_id_t member_in) override {} + void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {} void on_destroy() override {} }; From d0c4e2b280ead59d22c4d913e8d5bcd2f3bd9f5f Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:20:30 -0700 Subject: [PATCH 
013/130] Add package version and show in log (#575) --- conanfile.py | 2 ++ src/lib/homestore.cpp | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/conanfile.py b/conanfile.py index 445bd4e0a..ab86cc420 100644 --- a/conanfile.py +++ b/conanfile.py @@ -94,6 +94,8 @@ def generate(self): tc.variables['BUILD_COVERAGE'] = 'ON' elif self.options.get_safe("sanitize"): tc.variables['MEMORY_SANITIZER_ON'] = 'ON' + tc.variables["CONAN_PACKAGE_NAME"] = self.name + tc.variables["CONAN_PACKAGE_VERSION"] = self.version tc.generate() # This generates "boost-config.cmake" and "grpc-config.cmake" etc in self.generators_folder diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index e2bbcbc21..d575b1767 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,7 @@ HomeStoreSafePtr HomeStore::s_instance{nullptr}; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; static shared< ReplApplication > s_repl_app{nullptr}; +std::string version = PACKAGE_VERSION; HomeStore* HomeStore::instance() { if (s_instance == nullptr) { s_instance = std::make_shared< HomeStore >(); } @@ -149,6 +151,12 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ static std::once_flag flag1; std::call_once(flag1, [this]() { +#ifndef NDEBUG + LOGINFO("HomeStore DEBUG version: {}", version); +#else + LOGINFO("HomeStore RELEASE version: {}", version); +#endif + sisl::VersionMgr::addVersion(PACKAGE_NAME, version::Semver200_version(PACKAGE_VERSION)); m_periodic_logger = sisl::logging::CreateCustomLogger("homestore", "_periodic", false, true /* tee_to_stdout_stderr */); sisl::logging::SetLogPattern("[%D %T.%f] [%^%L%$] [%t] %v", m_periodic_logger); From 1054b0025a056bc6feb6202e42ed0748c1d814d6 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 29 Oct 2024 09:57:45 +0800 Subject: [PATCH 014/130] add 
chunksize to vchunk interface (#572) --- conanfile.py | 2 +- src/include/homestore/vchunk.h | 1 + src/lib/device/vchunk.cpp | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index ab86cc420..a367d8548 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "5.2.2" + version = "6.5.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index b52832faa..0406d428f 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -35,6 +35,7 @@ class VChunk { uint32_t get_pdev_id() const; uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; + uint64_t size() const; private: shared< Chunk > m_internal_chunk; diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index 1a7aaeac5..26391ac1b 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -33,5 +33,7 @@ uint32_t VChunk::get_pdev_id() const { return m_internal_chunk->physical_dev()-> uint16_t VChunk::get_chunk_id() const { return m_internal_chunk->chunk_id(); } +uint64_t VChunk::size() const { return m_internal_chunk->size(); } + cshared< Chunk > VChunk::get_internal_chunk() const { return m_internal_chunk; } } // namespace homestore From f6cd30f9658851869f4f104c4ec032c0d4150eee Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:10:39 +0800 Subject: [PATCH 015/130] Add index CR UT for basic merge (#556) Signed-off-by: Jilong Kou --- conanfile.py | 2 +- .../homestore/btree/detail/btree_node.hpp | 7 + src/lib/device/virtual_dev.cpp | 9 + src/lib/homestore.cpp | 2 - .../index/inplace_btree/inplace_btree_store.h | 120 ++++-- src/lib/index/inplace_btree/wb_cache.cpp | 166 +++++--- .../test_common/homestore_test_common.hpp | 7 +- src/tests/test_index_crash_recovery.cpp | 375 
+++++++++++++----- src/tests/test_scripts/index_test.py | 1 + 9 files changed, 496 insertions(+), 193 deletions(-) diff --git a/conanfile.py b/conanfile.py index a367d8548..ab86cc420 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.3" + version = "5.2.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index b58174dc3..8bf83966c 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -364,6 +364,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { template < typename K > K get_first_key() const { + if (total_entries() == 0) { return K{}; } return get_nth_key< K >(0, true); } @@ -463,6 +464,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } + + // Should not happen + if (this->is_node_deleted()) { + fmt::format_to(std::back_inserter(str), " **DELETED** "); + } + return str; } diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 591540995..a3f060e4a 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -431,6 +431,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + dev_offset); if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -443,6 +445,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< if (hs()->crash_simulator().is_in_crashing_phase()) { return std::error_code{}; } #endif + HS_LOG(TRACE, device, "Writing 
sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + chunk->start_offset() + offset_in_chunk); + if (sisl_unlikely(!is_chunk_available(chunk))) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -464,6 +469,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); @@ -486,6 +493,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index d575b1767..c04ff23bf 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -354,8 +354,6 @@ void HomeStore::shutdown() { #ifdef _PRERELEASE flip::Flip::instance().stop_rpc_server(); #endif - - HomeStore::reset_instance(); LOGINFO("Homestore is completed its shutdown"); } diff --git a/src/lib/index/inplace_btree/inplace_btree_store.h b/src/lib/index/inplace_btree/inplace_btree_store.h index 484901fa3..befede6da 100644 --- a/src/lib/index/inplace_btree/inplace_btree_store.h +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -101,7 +101,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } void destroy() override { - Btree< K, V >::destroy_btree(nullptr); + auto cpg = cp_mgr().cp_guard(); + 
Btree::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); } @@ -153,13 +154,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { idx_buf->m_dirtied_cp_id = cpg->id(); BtreeNodePtr bn = BtreeNodePtr{n}; - LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); - repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + // Only for interior nodes we need to repair its links + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); + repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); + } if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the // meta_buf with new root as well - on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + on_root_changed(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); } } @@ -246,7 +250,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } - btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { + btree_status_t + on_root_changed(BtreeNodePtr const &new_root, void *context) override { // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ // return btree_status_t::success;} m_sb->root_node = new_root->node_id(); @@ -258,12 +263,12 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf; - wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context)); + wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast(context)); return btree_status_t::success; } btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent 
node {}", parent_node->to_string()); + BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string()); // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs // to be handled. Get the last key in the node auto const last_parent_key = parent_node->get_last_key< K >(); @@ -273,7 +278,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { parent_node->node_id()); return btree_status_t::not_found; } - BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(), + + // Get all original child ids as a support to check if we are beyond the last child node + std::set orig_child_ids; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_ids.insert(link_info.bnode_id()); + } + BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), last_parent_key.to_string()); // Get the first child node and its link info @@ -298,22 +311,45 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cur_parent = parent_node; BtreeNodeList new_parent_nodes; do { - if (child_node->has_valid_edge() || - (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) { - BT_DBG_ASSERT(is_parent_edge_node, - "Child node={} is an edge node but parent_node={} is not an edge node", - child_node->node_id(), cur_parent->node_id()); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + if (child_node->is_node_deleted()) { + // Edge node is merged, we need to set the current last entry as edge + if (cur_parent->total_entries() > 0) { + auto prev_val = V{}; + cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); + cur_parent->remove(cur_parent->total_entries() - 1); + 
cur_parent->set_edge_value(prev_val); + BT_LOG(INFO, "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + } else { + BT_LOG(INFO, "Found an empty interior node {} with maybe all childs deleted", + cur_parent->node_id()); + } + } else { + // Update edge and finish + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(), + child_node->to_string()); + child_node->set_next_bnode(empty_bnodeid); + write_node_impl(child_node, cp_ctx); + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } break; } auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), + BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), child_node->to_string(), child_last_key.to_string()); - if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) { - // We have reached the last key, and the parent node doesn't have edge, so we can stop now - break; + // Check if we are beyond the last child node. + // + // There can be cases where the child level merge is successfully persisted but the parent level is not. + // In this case, you may have your rightmost child node with last key greater than the last_parent_key. + // That's why here we have to check if the child node is one of the original child nodes first. 
+ if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) { + if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + break; + } } if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), @@ -335,20 +371,37 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Insert the last key of the child node into parent node - cur_parent->insert(cur_parent->total_entries(), child_last_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (!child_node->is_node_deleted()) { + cur_parent->insert(cur_parent->total_entries(), + child_node->total_entries() > 0 ? child_last_key : last_parent_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->total_entries() == 0) { + // There should be at most one empty child node per parent - if we find one, we should stop here + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(), + child_node->to_string()); + break; + } + } else { + // Node deleted indicates it's freed & no longer used during recovery + BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + } - BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string()); + BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string()); // Move to the next child node - this->unlock_node(child_node, locktype_t::READ); auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); if (next_node_id == empty_bnodeid) { - BT_LOG_ASSERT(false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " - "repair is partial", - child_node->node_id(), parent_node->node_id()); - ret = btree_status_t::not_found; + // This 
can be a deleted edge node - only check if it is still valid + if (!child_node->is_node_deleted()) { + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " + "repair is partial", + child_node->node_id(), parent_node->node_id()); + ret = btree_status_t::not_found; + } + child_node = nullptr; break; } @@ -356,10 +409,21 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (ret != btree_status_t::success) { BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + child_node = nullptr; break; } } while (true); - this->unlock_node(child_node, locktype_t::READ); + + if (child_node) { + this->unlock_node(child_node, locktype_t::READ); + } + + if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { + // We shouldn't have an empty interior node in the tree, let's delete it. + // The buf will be released by the caller + BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id()); + parent_node->set_node_deleted(); + } if (ret == btree_status_t::success) { ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index f75f4e63a..4966338cd 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -287,13 +287,8 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p } icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } else { - icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal - child_buf->m_up_buffer, // real up buffer - new_node_bufs.empty() ? 
freed_node_bufs[0]->m_up_buffer - : new_node_bufs[0]->m_up_buffer, // real in place child - new_node_bufs, // new node bufs - freed_node_bufs // free_node_bufs - ); + icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf, + new_node_bufs, freed_node_bufs); } #ifdef _PRERELEASE // log new nodes and freed nodes and parent and child @@ -413,6 +408,22 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& } // Now we link the down_buffer to the real up_buffer + if (down_buf->m_up_buffer) { + // release existing up_buffer's wait count + down_buf->m_up_buffer->m_wait_for_down_buffers.decrement(); +#ifndef NDEBUG + bool found{false}; + for (auto it = down_buf->m_up_buffer->m_down_buffers.begin(); it != down_buf->m_up_buffer->m_down_buffers.end(); + ++it) { + if (it->lock() == down_buf) { + down_buf->m_up_buffer->m_down_buffers.erase(it); + found = true; + break; + } + } + HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif + } real_up_buf->m_wait_for_down_buffers.increment(1); down_buf->m_up_buffer = real_up_buf; #ifndef NDEBUG @@ -426,13 +437,13 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { bool done = m_cache.remove(buf->m_blkid, node); HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?"); } - + buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); - m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); + m_vdev->free_blk(buf->m_blkid, s_cast(cp_ctx)); } //////////////////// Recovery Related section ///////////////////////////////// -void IndexWBCache::load_buf(IndexBufferPtr const& buf) { +void IndexWBCache::load_buf(IndexBufferPtr const &buf) { if (buf->m_bytes == nullptr) { buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); @@ -460,17 
+471,17 @@ void IndexWBCache::recover(sisl::byte_view sb) { #ifdef _PRERELEASE auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { + std::vector const &pending_bufs) { std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const& [_, buf] : bufs) { + for (auto const &[_, buf]: bufs) { load_buf(buf); fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } // list of new_bufs - if (!l0_bufs.empty()) { - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { + if (!pending_bufs.empty()) { + fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size()); + for (auto const &buf: pending_bufs) { fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } } @@ -491,57 +502,79 @@ void IndexWBCache::recover(sisl::byte_view sb) { // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate // the same blkid which could clash with the blkid next in the buf list. // - // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed. - std::vector< IndexBufferPtr > l0_bufs; - for (auto const& [_, buf] : bufs) { - if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) { + // On the second pass, we only take part of the parents/siblings and then repair them, if needed. + std::vector pending_bufs; + std::vector deleted_bufs; + for (auto const &[_, buf]: bufs) { + if (buf->m_node_freed) { + // Freed node + load_buf(buf); if (was_node_committed(buf)) { - if (was_node_committed(buf->m_up_buffer)) { - if (buf->m_node_freed) { - // Up buffer was written, so this buffer can be freed and thus can free the blk. 
- m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); - } else { - m_vdev->commit_blk(buf->m_blkid); - } - l0_bufs.push_back(buf); - } else { - buf->m_up_buffer->m_wait_for_down_buffers.decrement(); + // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link + r_cast(buf->m_bytes)->node_deleted = true; + write_buf(nullptr, buf, icp_ctx); + deleted_bufs.push_back(buf); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // (Up) buffer is not committed, node need to be kept and (potentially) repaired later + buf->m_node_freed = false; + if (buf->m_created_cp_id == icp_ctx->id()) { + // New nodes need to be commited first + m_vdev->commit_blk(buf->m_blkid); + } + pending_bufs.push_back(buf); + buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency + } + } else if (buf->m_created_cp_id == icp_ctx->id()) { + // New node + if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) { + // Both current and up buffer is commited, we can safely commit the current block + m_vdev->commit_blk(buf->m_blkid); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // Just ignore it + buf->m_up_buffer->m_wait_for_down_buffers.decrement(); #ifndef NDEBUG - bool found{false}; - for (auto it = buf->m_up_buffer->m_down_buffers.begin(); - it != buf->m_up_buffer->m_down_buffers.end(); ++it) { - auto sp = it->lock(); - if (sp && sp == buf) { - found = true; - buf->m_up_buffer->m_down_buffers.erase(it); - break; - } + bool found{false}; + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); + it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + auto sp = it->lock(); + if (sp && sp == buf) { + found = true; + buf->m_up_buffer->m_down_buffers.erase(it); + break; } - HS_DBG_ASSERT(found, - "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); -#endif } + HS_DBG_ASSERT(found, + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in 
its list"); +#endif } } } #ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", - l0_bufs.size(), bufs.size(), icp_ctx->id()); - LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); + pending_bufs.size(), bufs.size(), icp_ctx->id()); + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); #endif - // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be - // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in - // do_repair flag. - for (auto const& buf : l0_bufs) { - recover_buf(buf->m_up_buffer); + for (auto const &buf: pending_bufs) { + recover_buf(buf); + if (buf->m_bytes != nullptr && r_cast(buf->m_bytes)->node_deleted) { + // This buffer was marked as deleted during repair, so we also need to free it + deleted_bufs.push_back(buf); + } } + + for (auto const &buf: deleted_bufs) { + m_vdev->free_blk(buf->m_blkid, s_cast(icp_ctx)); + } + m_in_recovery = false; m_vdev->recovery_completed(); } -void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { +void IndexWBCache::recover_buf(IndexBufferPtr const &buf) { if (!buf->m_wait_for_down_buffers.decrement_testz()) { // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer return; @@ -636,6 +669,10 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE static std::once_flag flag; + if (hs()->crash_simulator().is_crashed()) { + std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); + return; + } if (buf->m_crash_flag_on) { std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; LOGINFO("Simulating crash while writing buffer {}, 
stored in file {}", buf->to_string(), filename); @@ -643,33 +680,34 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const hs()->crash_simulator().crash(); cp_ctx->complete(true); return; - } else if (hs()->crash_simulator().is_crashed()) { - std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); - return; } #endif buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { - LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), + LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); - auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + auto const &sb = r_cast(buf.get())->m_sb; + if (!sb.is_empty()) { + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); - m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) - .thenValue([buf, cp_ctx](auto) { - try { - auto& pthis = s_cast< IndexWBCache& >(wb_cache()); - pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } - }); + LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); + m_vdev->async_write(r_cast(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) + .thenValue([buf, cp_ctx](auto) { + try { + auto &pthis = s_cast(wb_cache()); + pthis.process_write_completion(cp_ctx, buf); + } catch (const std::runtime_error &e) { + 
LOGERROR("Failed to access write-back cache: {}", e.what()); + } + }); if (!part_of_batch) { m_vdev->submit_batch(); } } @@ -762,7 +800,7 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - if ((*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 4df2a7231..1a690948e 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -194,8 +194,8 @@ class HSTestHelper { } homestore::HomeStore::instance()->shutdown(); + iomanager.stop(); // Stop iomanager first in case any fiber is still referencing homestore resources homestore::HomeStore::reset_instance(); - iomanager.stop(); if (cleanup) { remove_files(m_generated_devs); @@ -247,6 +247,11 @@ class HSTestHelper { m_fc.inject_delay_flip(flip_name, {null_cond}, freq, delay_usec); LOGDEBUG("Flip {} set", flip_name); } + + void remove_flip(const std::string flip_name) { + m_fc.remove_flip(flip_name); + LOGDEBUG("Flip {} removed", flip_name); + } #endif static void fill_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 9eefa486b..cac120a93 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -36,27 +36,29 @@ SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup SISL_OPTION_GROUP( test_index_crash_recovery, (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + ::cxxopts::value()->default_value("500"), "number"), (num_entries, 
"", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + ::cxxopts::value()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value()->default_value("360000"), "seconds"), (num_rounds, "", "num_rounds", "number of rounds to test with", - ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + ::cxxopts::value()->default_value("100"), "number"), (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", - ::cxxopts::value< uint32_t >()->default_value("40"), "number"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), - ""), + ::cxxopts::value()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", + ::cxxopts::value()->default_value("20"), ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", + ::cxxopts::value()->default_value("6"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + ::cxxopts::value()->default_value("1000"), "number"), (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - 
::cxxopts::value< bool >()->default_value("1"), ""), + ::cxxopts::value< bool >()->default_value("1"), ""), (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -98,10 +100,16 @@ class SequenceGenerator { keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); } + void fillRange(uint64_t start, uint64_t end) { + for (uint64_t i = start; i <= end; ++i) { + keyStates[i] = true; + } + } + OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - for (size_t i = 0; i < numOperations; ++i) { + while (operations.size() < numOperations) { uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; @@ -119,6 +127,7 @@ class SequenceGenerator { return operations; } + __attribute__((noinline)) std::string showKeyState(uint64_t key) const { auto it = keyStates.find(key); if (it != keyStates.end()) { return it->second ? 
"Put" : "Remove"; } @@ -133,6 +142,7 @@ class SequenceGenerator { } return occurrences; } + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; auto count = 1; @@ -142,6 +152,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; @@ -155,6 +166,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); @@ -236,6 +248,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ -261,9 +274,11 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); - LOGINFO("Node size {} ", hs()->index_service().node_size()); this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); + LOGINFO("Node size {}, max_keys_in_node {}, min_keys_in_node {}", this->m_cfg.node_size(), + this->m_cfg.m_max_keys_in_node, this->m_cfg.m_min_keys_in_node); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = 
boost::uuids::random_generator()(); @@ -300,7 +315,10 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void reset_btree() { + hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); + this->trigger_cp(true); + auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); @@ -333,14 +351,21 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); - for (const auto& [k, addition] : diff) { + for (const auto &[k, addition]: diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); - if (addition) { this->force_upsert(k.key()); } + if (addition) { + LOGDEBUG("Reapply: Inserting key {}", k.key()); + this->force_upsert(k.key()); + } else { + LOGDEBUG("Reapply: Removing key {}", k.key()); + this->remove_one(k.key(), false); + } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); this->m_shadow_map.save(m_shadow_filename); } + void reapply_after_crash(OperationList& operations) { for (const auto& [key, opType] : operations) { switch (opType) { @@ -354,7 +379,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT break; } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); } void TearDown() override { @@ -376,13 +401,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void crash_and_recover(uint32_t s_key, uint32_t e_key) { this->print_keys("Btree prior to CP and susbsequent simulated crash: "); - test_common::HSTestHelper::trigger_cp(false); + trigger_cp(false); this->wait_for_crash_recovery(); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); 
this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); + this->print_keys("Post reapply, btree structure: "); + this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; @@ -419,7 +446,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->visualize_keys(b_filename); } - test_common::HSTestHelper::trigger_cp(false); + trigger_cp(false); LOGINFO("waiting for crash to recover"); this->wait_for_crash_recovery(); @@ -427,8 +454,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::string rec_filename = filename + "_after_recovery.dot"; LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); - this->print_keys("Post crash and recovery, btree structure: "); } + this->print_keys("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. 
Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -439,8 +466,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::string re_filename = filename + "_after_reapply.dot"; LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); -// this->print_keys("Post crash and recovery, btree structure: "); } + this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -528,82 +555,6 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { this->query_all_paginate(80); } -/* -TYPED_TEST(IndexCrashTest, ManualMergeCrash){ - // Define the lambda function - const uint32_t num_entries = 30; - - auto initTree = [this, num_entries]() { - for (uint64_t k = 0u; k < num_entries; ++k) { - this->force_upsert(k); - } - test_common::HSTestHelper::trigger_cp(true); - this->m_shadow_map.save(this->m_shadow_filename); - }; - - std::vector< OperationList > removing_scenarios = { - {{29, OperationType::Remove}, - {28, OperationType::Remove}, - {27, OperationType::Remove}, - {26, OperationType::Remove}, - {25, OperationType::Remove}, - {24, OperationType::Remove}} - }; - - auto scenario = removing_scenarios[0]; - - LOGINFO("Step 1-1: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init.dot"); - LOGINFO("Step 2-1: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_parent"); - - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash.dot"); - - LOGINFO("Step 3-1: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_1.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-2: Populate some keys and flush"); - initTree(); - 
this->visualize_keys("tree_init_02.dot"); - LOGINFO("Step 2-2: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_left_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash_2.dot"); - - LOGINFO("Step 3-2: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_2.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-3: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_03.dot"); - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_freed_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->visualize_keys("tree_before_crash_3.dot"); - - LOGINFO("Step 3-3: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_3.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); -} -*/ - TYPED_TEST(IndexCrashTest, SplitCrash1) { // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -768,6 +719,236 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { if (renew_btree_after_crash) { this->reset_btree(); }; } } + +// Basic reverse and forward order remove with different flip points +TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { + vector flip_points = { + "crash_flush_on_merge_at_parent", + "crash_flush_on_merge_at_left_child", + // "crash_flush_on_freed_child", + }; + + for (size_t i = 0; i < flip_points.size(); ++i) { + this->reset_btree(); + + auto &flip_point = flip_points[i]; + LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); + + // Populate some keys [1,num_entries) and trigger cp to persist 
+ LOGINFO("Step {}-1: Populate some keys and flush", i+1); + auto const num_entries = SISL_OPTIONS["num_entries"].as(); + for (auto k = 0u; k < num_entries; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(this->m_shadow_filename); + + this->visualize_keys("tree_merge_full.dot"); + + // Split keys into batches and remove the last one in reverse order + LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + int batch_num = 4; { + int n = batch_num; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = r; k >= l; --k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_first_crash.dot"); + + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in forward order + LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { + int n = batch_num - 1; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = l; k <= r; ++k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_second_crash.dot"); + + LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in random order + LOGINFO("Step {}-4: Remove another batch in random order", i + 1) { 
+ int n = batch_num - 2; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + SequenceGenerator generator(0, 100, l, r); + generator.fillRange(l, r); + OperationList ops = generator.generateOperations(r - l + 1, false); + + LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_third_crash.dot"); + + LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + LOGINFO("Step {}-5: Cleanup the tree", i + 1); + for (auto k = 0u; k < num_entries; ++k) { + this->remove_one(k, false); + } + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } +} + +// +// TYPED_TEST(IndexCrashTest, MergeCrash1) { +// auto const num_entries = SISL_OPTIONS["num_entries"].as(); +// vector flips = { +// "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", +// }; +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); +// OperationList operations; +// for (size_t i = 0; i < flips.size(); ++i) { +// this->reset_btree(); +// LOGINFO("Step {}-1: Init btree", i + 1); +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->print_keys("Inited tree"); +// +// LOGINFO("Step {}-2: Set flag {}", i + 1, flips[i]); +// this->set_basic_flip(flips[i], 1, 10); +// generator.reset(); +// generator.fillRange(0, num_entries - 1); +// +// // Randomly remove some keys +// std::random_device rd; +// std::mt19937 gen(rd()); +// std::uniform_int_distribution<> dis(num_entries / 4, num_entries / 2); +// auto num_keys_to_remove = dis(gen); +// LOGINFO("Removing {} keys before crash", num_keys_to_remove); +// operations = 
generator.generateOperations(num_keys_to_remove, false /* reset */); +// for (auto [k, _]: operations) { +// LOGINFO("Removing key {}", k); +// this->remove_one(k, true); +// } +// +// LOGINFO("Step {}-3: Simulate crash and recover", i + 1); +// this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); +// } +// } +// +// TYPED_TEST(IndexCrashTest, MergeManualCrash) { +// std::vector flip_points = { +// "crash_flush_on_merge_at_parent", +// "crash_flush_on_merge_at_left_child", +// }; +// +// constexpr uint32_t num_entries = 28; // with max=5 & min=3 +// +// auto initTree = [this, num_entries]() { +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->m_shadow_map.save(this->m_shadow_filename); +// }; +// +// std::vector removing_scenarios = { +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// }, // Merge 2 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {21, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 2 actions +// { +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {11, OperationType::Remove}, +// {10, OperationType::Remove}, +// {13, OperationType::Remove}, +// }, // Merge from 
level=0 then level=1 +// // { +// // {16, OperationType::Remove}, +// // }, // Merge from level=1 then level=0 - need to set min=4 +// }; +// +// for (int i = 0; i < static_cast(removing_scenarios.size()); i++) { +// auto scenario = removing_scenarios[i]; +// auto s_idx = i + 1; +// LOGINFO("\n\tTesting scenario {}", s_idx); +// for (int j = 0; j < static_cast(flip_points.size()); j++) { +// const auto &flip_point = flip_points[j]; +// auto f_idx = j + 1; +// LOGINFO("\n\t\t\t\tTesting flip point: {}", flip_point); +// +// LOGINFO("Step {}-{}-1: Populate keys and flush", s_idx, f_idx); +// initTree(); +// this->visualize_keys(fmt::format("tree_init.{}_{}.dot", s_idx, f_idx)); +// +// LOGINFO("Step {}-{}-2: Set crash flag, remove keys in reverse order", s_idx, f_idx); +// this->set_basic_flip(flip_point); +// for (auto k: scenario) { +// LOGINFO("Removing entry {}", k.first); +// this->remove_one(k.first); +// } +// this->visualize_keys(fmt::format("tree_before_first_crash.{}_{}.dot", s_idx, f_idx)); +// this->remove_flip(flip_point); +// +// LOGINFO("Step {}-{}-3: Trigger cp to crash", s_idx, f_idx); +// this->crash_and_recover(scenario); +// test_common::HSTestHelper::trigger_cp(true); +// this->get_all(); +// +// this->reset_btree(); +// test_common::HSTestHelper::trigger_cp(true); +// } +// } +// } #endif int main(int argc, char* argv[]) { diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 02c3e4c2c..dd2f8f010 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -52,6 +52,7 @@ def parse_arguments(): parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--min_keys_in_node', help='Minimum num of keys in btree 
nodes', type=int, default=2) parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60) From b4da34e5bb7b1b376fd8996ab1900e791c8ceaef Mon Sep 17 00:00:00 2001 From: Sanal Date: Fri, 1 Nov 2024 10:22:40 -0700 Subject: [PATCH 016/130] Add additional tests for replace member (#574) --- .../repl_dev/raft_state_machine.cpp | 10 + src/tests/test_common/hs_repl_test_common.hpp | 9 + src/tests/test_common/raft_repl_test_base.hpp | 24 ++- src/tests/test_raft_repl_dev_dynamic.cpp | 182 ++++++++++++++++-- 4 files changed, 206 insertions(+), 19 deletions(-) diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 0b932bbe1..e801d9511 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include "repl_dev/raft_repl_dev.h" #include #include "common/homestore_config.hpp" +#include "common/crash_simulator.hpp" namespace homestore { @@ -291,6 +293,14 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, // Update the object offset. 
obj_id = snp_data->offset; + +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { + LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); + hs()->crash_simulator().crash(); + return; + } +#endif } bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 672acffcb..c9ff71567 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -252,6 +252,10 @@ class HSReplTestHelper : public HSTestHelper { start_homestore(); } + void reinit_repl_app() { + m_token.params(HS_SERVICE::REPLICATION).repl_app = std::make_unique< TestReplApplication >(*this); + } + uint16_t replica_num() const { return replica_num_; } homestore::replica_id_t my_replica_id() const { return my_replica_id_; } homestore::replica_id_t replica_id(uint16_t member_id) const { @@ -317,6 +321,11 @@ class HSReplTestHelper : public HSTestHelper { } } + void add_listener(std::shared_ptr< ReplDevListener > listener) { + std::unique_lock lg(groups_mtx_); + pending_listeners_.emplace_back(listener); + } + size_t num_listeners() const { std::unique_lock lg(groups_mtx_); return repl_groups_.size(); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index e0e2f6487..1ab90143a 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -204,7 +204,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { kv_snapshot_data.emplace_back(Key{v.id_}, v); LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 1000) { break; } + if (kv_snapshot_data.size() >= 10) { break; } } if (kv_snapshot_data.size() == 0) { @@ -430,6 +430,7 @@ 
class RaftReplDevTestBase : public testing::Test { for (auto const& db : dbs_) { if (db->is_zombie()) { continue; } auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + if (!repl_dev) continue; int i = 0; bool force_leave = false; do { @@ -511,6 +512,11 @@ class RaftReplDevTestBase : public testing::Test { } void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { + if (!db || !db->repl_dev()) { + // Spare which are not added to group will not have repl dev. + return; + } + do { auto leader_uuid = db->repl_dev()->get_leader_id(); @@ -527,6 +533,8 @@ class RaftReplDevTestBase : public testing::Test { } void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return; + do { auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); @@ -614,14 +622,20 @@ class RaftReplDevTestBase : public testing::Test { void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum = 0) { - this->run_on_leader(db, [this, db, member_out, member_in, commit_quorum]() { + uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { + this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); + replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; - auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); - ASSERT_EQ(v.hasError(), false) << "Error in replacing member"; + auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + if (error == ReplServiceError::OK) { + ASSERT_EQ(result.hasError(), false) << "Error in replacing 
member"; + } else { + ASSERT_EQ(result.hasError(), true) << "Error in replacing member"; + ASSERT_EQ(result.error(), error); + } }); } diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index c29f239e1..5a6095959 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -15,11 +15,17 @@ #include "test_common/raft_repl_test_base.hpp" // Dynamic tests spawn spare replica's also which can be used to add and remove from a repl dev. -class ReplDevDynamicTest : public RaftReplDevTestBase {}; +class ReplDevDynamicTest : public RaftReplDevTestBase { +private: + bool is_replica_num_in(const std::set< uint32_t >& replicas) { + // Check if the current replica process is in this set. + return replicas.count(g_helper->replica_num()) != 0 ? true : false; + } +}; TEST_F(ReplDevDynamicTest, ReplaceMember) { + LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num()); // Write some IO's, replace a member, validate all members data except which is out. - LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); auto db = dbs_.back(); auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); @@ -45,28 +51,28 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_verify_start(num_members); LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); - if (g_helper->replica_num() != member_out) { + if (is_replica_num_in({0, 1, member_in})) { // Skip the member which is going to be replaced. Validate data on all other replica's. LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); - } else { + } else if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); - do { + while (repl_dev && !repl_dev->is_destroyed()) { std::this_thread::sleep_for(std::chrono::seconds(1)); auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); raft_repl_svc.gc_repl_devs(); LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); - } while (!repl_dev->is_destroyed()); + } LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); } g_helper->sync_for_cleanup_start(num_members); - LOGINFO("ReplaceMember test done"); + LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num()); } TEST_F(ReplDevDynamicTest, TwoMemberDown) { - LOGINFO("TwoMemberDown test started"); + LOGINFO("TwoMemberDown test started replica={}", g_helper->replica_num()); // Make two members down in a group and leader cant reach a quorum. // We set the custom quorum size to 1 and call replace member. @@ -110,28 +116,176 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("Member in got all commits"); } - if (g_helper->replica_num() == 0 || g_helper->replica_num() == member_in) { + if (is_replica_num_in({0, member_in})) { // Validate data on leader replica 0 and replica 3 LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); } - g_helper->sync_for_cleanup_start(num_members); - if (g_helper->replica_num() == 1) { LOGINFO("Start replica 1"); + db->set_zombie(); this->start_replica(1); } if (g_helper->replica_num() == 2) { LOGINFO("Start replica 2"); + db->set_zombie(); + this->start_replica(2); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OneMemberDown) { + // replica0(leader) and replica1 up, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). 
+ // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OneMemberDown test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + + std::this_thread::sleep_for(std::chrono::seconds(3)); + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. 
+ LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_cleanup_start(num_members); + + if (g_helper->replica_num() == 2) { + LOGINFO("Start replica 2"); + db->set_zombie(); this->start_replica(2); } - LOGINFO("TwoMemberDown test done"); + LOGINFO("OneMemberDown test done replica={}", g_helper->replica_num()); } -// TODO add more tests with leader and member restart, multiple member replace -// leader replace +TEST_F(ReplDevDynamicTest, LeaderReplace) { + // replica0(leader) and replica1 and replica2 is up. Replace replica0(leader) with replica3. + // replica0 will yield leadership and any other replica will be come leader and leader + // will do baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("LeaderReplace test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the leader in the group with index(0) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = 0; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + if (g_helper->replica_num() != member_in) { + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + // Leader will return error NOT_LEADER and yield leadership, sleep and connect again + // to the new leader. 
+ LOGINFO("Replace old leader"); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + ReplServiceError::NOT_LEADER); + LOGINFO("Replace member leader yield done"); + + std::this_thread::sleep_for(std::chrono::seconds(3)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + LOGINFO("Replace member old leader done"); + } + + if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + if (g_helper->replica_num() == member_out) { db->set_zombie(); } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OneMemberRestart) { + // replica0(leader) is up and replica1 is restated, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OneMemberRestart test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). 
Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() == 1) { + LOGINFO("Restart replica 1"); + this->restart_replica(15); + } + + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); +} int main(int argc, char* argv[]) { int parsed_argc = argc; From 0a47df55b268f7a4d366d5b9e8dc1ef15fd0c458 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Tue, 5 Nov 2024 15:09:31 +0800 Subject: [PATCH 017/130] Add protection for concurrent access to m_down_buffers (#577) Concurrent writes to m_down_buffers may cause data inconsistency. Add a mutex lock to IndexBuffer as well as extracting add/remove operations into member functions to make the vector thread-safe. 
Signed-off-by: Jilong Kou --- src/lib/index/inplace_btree/wb_cache.cpp | 51 +++++------------------- 1 file changed, 10 insertions(+), 41 deletions(-) diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 4966338cd..9ba839edc 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -394,14 +394,8 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& HS_DBG_ASSERT((real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()), "Up buffer is not modified by current cp, but down buffer is linked to it"); #ifndef NDEBUG - bool found{false}; - for (auto const& dbuf : real_up_buf->m_down_buffers) { - if (dbuf.lock() == down_buf) { - found = true; - break; - } - } - HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); + HS_DBG_ASSERT(real_up_buf->is_in_down_buffers(down_buf), + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); #endif return; } @@ -410,25 +404,10 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& // Now we link the down_buffer to the real up_buffer if (down_buf->m_up_buffer) { // release existing up_buffer's wait count - down_buf->m_up_buffer->m_wait_for_down_buffers.decrement(); -#ifndef NDEBUG - bool found{false}; - for (auto it = down_buf->m_up_buffer->m_down_buffers.begin(); it != down_buf->m_up_buffer->m_down_buffers.end(); - ++it) { - if (it->lock() == down_buf) { - down_buf->m_up_buffer->m_down_buffers.erase(it); - found = true; - break; - } - } - HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); -#endif + down_buf->m_up_buffer->remove_down_buffer(down_buf); } - real_up_buf->m_wait_for_down_buffers.increment(1); down_buf->m_up_buffer = real_up_buf; -#ifndef NDEBUG - real_up_buf->m_down_buffers.emplace_back(down_buf); -#endif + 
real_up_buf->add_down_buffer(down_buf); } void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { @@ -533,21 +512,8 @@ void IndexWBCache::recover(sisl::byte_view sb) { pending_bufs.push_back(buf->m_up_buffer); } else { // Just ignore it - buf->m_up_buffer->m_wait_for_down_buffers.decrement(); -#ifndef NDEBUG - bool found{false}; - for (auto it = buf->m_up_buffer->m_down_buffers.begin(); - it != buf->m_up_buffer->m_down_buffers.end(); ++it) { - auto sp = it->lock(); - if (sp && sp == buf) { - found = true; - buf->m_up_buffer->m_down_buffers.erase(it); - break; - } - } - HS_DBG_ASSERT(found, - "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); -#endif + buf->m_up_buffer->remove_down_buffer(buf); + buf->m_up_buffer = nullptr; } } } @@ -752,7 +718,10 @@ std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(Index IndexBufferPtr const& buf) { IndexBufferPtrList buf_list; #ifndef NDEBUG - buf->m_down_buffers.clear(); + { + std::lock_guard lg(buf->m_down_buffers_mtx); + buf->m_down_buffers.clear(); + } #endif buf->set_state(index_buf_state_t::CLEAN); From 9f3220356d568ac0ff4693e7e711f4f5ef527811 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 6 Nov 2024 17:37:42 +0800 Subject: [PATCH 018/130] Implement GC_REPL_REQ Based on DSN to Prevent Resource Leaks (#576) * Implement GC_REPL_REQ Based on DSN to Prevent Resource Leaks This commit introduces a mechanism to garbage collect (GC) replication requests (rreqs) that may hang indefinitely, thereby consuming memory and disk resources unnecessarily. These rreqs can enter a hanging state under several circumstances, as outlined below: 1. Scenario with Delayed Commit: - Follower F1 receives LSN 100 and DSN 104 from Leader L1 and takes longer than the raft timeout to precommit/commit it. - L1 resends LSN 100, causing F1 to fetch the data again. 
Since LSN 100 was committed in a previous attempt, this log entry is skipped, leaving the rreq hanging indefinitely. 2. Scenario with Leader Failure Before Data Completion: - Follower F1 receives LSN 100 from L1, but before all data is fetched/pushed, L1 fails and L2 becomes the new leader. - L2 resends LSN 100 with L2 as the new originator. F1 proceeds with the new rreq and commits it, but the initial rreq from L1 hangs indefinitely as it cannot fetch data from the new leader L2. 3. Scenario with Leader Failure After Data Write: - Follower F1 receives data (DSN 104) from L1 and writes it. Before the log of LSN 100 reaches F1, L1 fails and L2 becomes the new leader. - L2 resends LSN 100 to F1, and F1 fetches DSN 104 from L2, leaving the original rreq hanging. This garbage collection process cleans up based on DSN. Any rreqs in `m_repl_key_req_map`, whose DSN is already committed (`rreq->dsn < repl_dev->m_next_dsn`), will be GC'd. This is safe on the follower side, as the follower updates `m_next_dsn` during commit. Any DSN below `cur_dsn` should already be committed, implying that the rreq should already be removed from `m_repl_key_req_map`. On the leader side, since `m_next_dsn` is updated when sending out the proposal, it is not safe to clean up based on `m_next_dsn`. Therefore, we explicitly skip the leader in this GC process. Skip localizing raft logs that are already committed. The leader may send duplicate raft logs; if we localize them unconditionally, duplicate data will be written to the chunk during fetch_data. It is safe to skip logs that are already committed, because those LSNs can never be overwritten. 
Signed-off-by: Xiaoxi Chen --- src/include/homestore/replication/repl_dev.h | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 97 ++++++++++++++----- .../repl_dev/raft_state_machine.cpp | 11 ++- .../replication/repl_dev/raft_state_machine.h | 2 +- 4 files changed, 79 insertions(+), 33 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index c2223455f..1abf5ea12 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -126,7 +126,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: friend class SoloReplDev; public: - repl_req_ctx() {} + repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); void init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f3a4a2461..b1ff61dbb 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -895,7 +895,7 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { // Remove the request from repl_key map. m_repl_key_req_map.erase(rreq->rkey()); // Remove the request from lsn map. 
- m_state_machine->unlink_lsn_to_req(rreq->lsn()); + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); auto cur_dsn = m_next_dsn.load(std::memory_order_relaxed); while (cur_dsn <= rreq->dsn()) { @@ -1191,9 +1191,22 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu entries.size()); auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); - for (auto& entry : entries) { + auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); + for (unsigned long i = 0; i < entries.size(); i++) { + auto& entry = entries[i]; + auto lsn = start_lsn + i; + auto term = entry->get_term(); if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } if (entry->get_buf_ptr()->size() == 0) { continue; } + // skipping localize for already committed log(dup), they anyway will be discard + // by nuraft before append_log. + if (lsn <= last_commit_lsn) { + RD_LOGT("Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, + last_commit_lsn); + continue; + } + // Those LSNs already in logstore but not yet committed, will be dedup here, + // applier_create_req will return same req as previous one auto req = m_state_machine->localize_journal_entry_prepare(*entry); if (req == nullptr) { sisl::VectorPool< repl_req_ptr_t >::free(reqs); @@ -1265,39 +1278,71 @@ cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { void RaftReplDev::cp_cleanup(CP*) {} void RaftReplDev::gc_repl_reqs() { - std::vector< int64_t > expired_keys; - m_state_machine->iterate_repl_reqs([this, &expired_keys](auto key, auto rreq) { + auto cur_dsn = m_next_dsn.load(); + if (cur_dsn != 0) cur_dsn = cur_dsn - 1; + // On follower, DSN below cur_dsn should very likely be committed. + // It is not guaranteed because DSN and LSN are generated separately, + // DSN in async_alloc_write before pushing data, LSN later when + // proposing to raft. Two simultaneous write requests on leader can have + // (LSN=100, DSN=102) and (LSN=101, DSN=101) during the window. 
+ std::vector< repl_req_ptr_t > expired_rreqs; + + auto req_map_size = m_repl_key_req_map.size(); + RD_LOGI("m_repl_key_req_map size is {};", req_map_size); + for (auto [key, rreq] : m_repl_key_req_map) { + // FIXME: Skipping proposer for now, the DSN in proposer increased in proposing stage, not when commit(). + // Need other mechanism. + if (rreq->is_proposer()) { + // don't clean up proposer's request + continue; + } + if (rreq->dsn() < cur_dsn && rreq->is_expired()) { + // The DSN can be out of order, wait till rreq expired. + RD_LOGD("legacy req with commited DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", + rreq->to_string(), rreq->dsn(), cur_dsn, cur_dsn - rreq->dsn(), + get_elapsed_time_sec(rreq->created_time())); + expired_rreqs.push_back(rreq); + } + } + int sm_req_cnt = 0; + // FIXME: we ensured data written before appending log to log store, in which we add rreq to state_machine + // and during pre-commit/commit we retrieve rreq from state_machine. Removing requests outside of state + // machine is risky. + // Below logs are logging only, can be removed once we get more confidence. + m_state_machine->iterate_repl_reqs([this, cur_dsn, &sm_req_cnt](auto key, auto rreq) { + sm_req_cnt++; if (rreq->is_proposer()) { // don't clean up proposer's request return; } - if (rreq->is_expired()) { - expired_keys.push_back(key); - RD_LOGD("rreq=[{}] is expired, cleaning up; elapsed_time_sec{};", rreq->to_string(), + RD_LOGD("StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), get_elapsed_time_sec(rreq->created_time())); - - // do garbage collection - // 1. 
free the allocated blocks - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { - auto blkid = rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { - HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", - blkid.to_string()); - RD_LOGD("blkid={} freed successfully", blkid.to_string()); - }); - } - - // 2. remove from the m_repl_key_req_map - // handle_error during fetch data response might have already removed the rreq from the this map - if (m_repl_key_req_map.find(rreq->rkey()) != m_repl_key_req_map.end()) { - m_repl_key_req_map.erase(rreq->rkey()); - } } }); + RD_LOGI("state_machine req map size is {};", sm_req_cnt); - for (auto const& l : expired_keys) { - m_state_machine->unlink_lsn_to_req(l); + for (auto removing_rreq : expired_rreqs) { + // once log flushed, the commit progress controlled by raft + if (removing_rreq->has_state(repl_req_state_t::LOG_FLUSHED)) { + RD_LOGI("Skipping GC rreq [{}] because it is in state machine", removing_rreq->to_string()); + continue; + } + // do garbage collection + // 1. free the allocated blocks + RD_LOGI("Removing rreq [{}]", removing_rreq->to_string()); + if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = removing_rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD("GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + }); + } + // 2. 
remove from the m_repl_key_req_map + if (m_repl_key_req_map.find(removing_rreq->rkey()) != m_repl_key_req_map.end()) { + m_repl_key_req_map.erase(removing_rreq->rkey()); + } } } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index e801d9511..8a1581c0d 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -219,11 +219,12 @@ uint64_t RaftStateMachine::last_commit_index() { void RaftStateMachine::become_ready() { m_rd.become_ready(); } -void RaftStateMachine::unlink_lsn_to_req(int64_t lsn) { - auto const it = m_lsn_req_map.find(lsn); - if (it != m_lsn_req_map.cend()) { - RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, it->second->to_string()); - m_lsn_req_map.erase(lsn); +void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { + // it is possible a LSN mapped to different rreq in history + // due to log overwritten. Verify the rreq before removing + auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); + if (deleted) { + RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index b931e42f4..a19d9a0ec 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -126,7 +126,7 @@ class RaftStateMachine : public nuraft::state_machine { repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry); repl_req_ptr_t localize_journal_entry_finish(nuraft::log_entry& lentry); void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn); - void unlink_lsn_to_req(int64_t lsn); + void unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq); repl_req_ptr_t lsn_to_req(int64_t lsn); nuraft_mesg::repl_service_ctx* group_msg_service(); From 79820ad8d151e5587196c4d2e8e0c5db1810208e Mon Sep 17 00:00:00 2001 From: 
Xiaoxi Chen Date: Thu, 7 Nov 2024 12:01:22 +0800 Subject: [PATCH 019/130] Releasing data buf from memory after written to disk. Data buffer persists in memory until rreq is committed or rolled back. This approach poses issues during recovery. As new data arrives via push_data and is written to disk, it remains in memory for an extended period until the replica catches up and commits the rreq. Signed-off-by: Xiaoxi Chen --- src/include/homestore/replication/repl_dev.h | 6 +++++- src/lib/replication/repl_dev/common.cpp | 9 ++++++++- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 ++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 1abf5ea12..cf0e00a0c 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -143,7 +143,10 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: sisl::blob const& key() const { return m_key; } MultiBlkId const& local_blkid() const { return m_local_blkid; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } - const char* data() const { return r_cast< const char* >(m_data); } + const char* data() const { + DEBUG_ASSERT(m_data != nullptr, "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); + return r_cast< const char* >(m_data); + } repl_req_state_t state() const { return repl_req_state_t(m_state.load()); } bool has_state(repl_req_state_t s) const { return m_state.load() & uint32_cast(s); } repl_journal_entry const* journal_entry() const { return m_journal_entry; } @@ -209,6 +212,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool add_state_if_not_already(repl_req_state_t s); void set_lentry(nuraft::ptr< nuraft::log_entry > const& lentry) { m_lentry = lentry; } void clear(); + void release_data(); flatbuffers::FlatBufferBuilder& create_fb_builder() { return 
m_fb_builder; } void release_fb_builder() { m_fb_builder.Release(); } diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index b8800afea..4fcbb0f4e 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -164,12 +164,19 @@ bool repl_req_ctx::add_state_if_not_already(repl_req_state_t s) { void repl_req_ctx::clear() { m_header = sisl::blob{}; m_key = sisl::blob{}; + release_data(); + m_pkts.clear(); +} + +void repl_req_ctx::release_data() { + m_data = nullptr; + // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here + m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { m_pushed_data->send_response(); m_pushed_data = nullptr; } m_fetched_data = sisl::GenericClientResponse{}; - m_pkts.clear(); } static std::string req_state_name(uint32_t state) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index b1ff61dbb..59916d039 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -444,6 +444,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d } else { rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + rreq->release_data(); const auto data_log_diff_us = push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count() ? 
get_elapsed_time_us(rreq->created_time(), push_data_rcv_time) @@ -862,6 +863,7 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons "Error in writing data"); // TODO: Find a way to return error to the Listener rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + rreq->release_data(); RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " "total_write_latency_us={}, write_num_pieces={}", From 28ea01cfe672125b1175629682fb2ece0faea447 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Mon, 11 Nov 2024 15:35:53 +0800 Subject: [PATCH 020/130] add rollback on state machine add open Leader_Restart ut (#585) * add rollback on state machine --------- Signed-off-by: yawzhang --- .../log_store/home_raft_log_store.cpp | 16 ++++++++++++++++ .../replication/repl_dev/raft_repl_dev.cpp | 19 +++++++++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 1 + .../repl_dev/raft_state_machine.cpp | 19 ++++++++++++++++++- .../replication/repl_dev/raft_state_machine.h | 3 ++- src/tests/test_raft_repl_dev.cpp | 2 +- 6 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 4d80987d1..dc878924d 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -182,6 +182,22 @@ void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& e m_log_store->append_async(sisl::io_blob{buf->data_begin(), uint32_cast(buf->size()), false /* is_aligned */}, nullptr /* cookie */, [buf](int64_t, sisl::io_blob&, logdev_key, void*) {}); + + auto position_in_cache = index % m_log_entry_cache.size(); + { + std::unique_lock lk(m_mutex); + m_log_entry_cache[position_in_cache] = std::make_pair(index, entry); + + // remove all cached entries after this index + for (size_t i{0}; i < 
m_log_entry_cache.size(); ++i) { + if (m_log_entry_cache[i].first > index) { + m_log_entry_cache[i] = std::make_pair(0, nullptr); + } + } + } + + // flushing the log before returning to ensure new(over-written) log is persisted to disk. + end_of_append_batch(index, 1); } void HomeRaftLogStore::end_of_append_batch(ulong start, ulong cnt) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 59916d039..dea117736 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -891,6 +891,25 @@ void RaftReplDev::commit_blk(repl_req_ptr_t rreq) { } } +void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { + // 1. call the listener to rollback + m_listener->on_rollback(rreq->lsn(), rreq->header(), rreq->key(), rreq); + + // 2. remove the request from maps + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); + m_repl_key_req_map.erase(rreq->rkey()); + + // 3. free the allocated blocks + if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD("Rollback rreq: Releasing blkid={} freed successfully", blkid.to_string()); + }); + } +} + void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { commit_blk(rreq); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 4be98394c..5cb1516ac 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -195,6 +195,7 @@ class RaftReplDev : public ReplDev, //////////////// Methods needed for other Raft classes to access ///////////////// void use_config(json_superblk raft_config_sb); void handle_commit(repl_req_ptr_t rreq, bool recovery = false); + void 
handle_rollback(repl_req_ptr_t rreq); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, bool is_data_channel); diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 8a1581c0d..39df73a0e 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -206,6 +206,23 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt // TODO:add more logic here if necessary } +void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { + RD_LOGD("Raft channel: Rollback cluster conf , log_idx = {}", log_idx); + // TODO:add more logic here if necessary +} + +void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& params) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + repl_req_ptr_t rreq = lsn_to_req(lsn); + if (rreq == nullptr) { + RD_LOG(ERROR, "Raft channel: Rollback lsn {} rreq not found", lsn); + return; + } + + RD_LOGD("Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); + m_rd.handle_rollback(rreq); +} + void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb) { for (auto [key, rreq] : m_lsn_req_map) { cb(key, rreq); @@ -234,7 +251,7 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { // reset the rreq created_at time to now https://github.com/eBay/HomeStore/issues/506 rreq->set_created_time(); [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); - RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list", lsn); + RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list, exist_term={}", lsn, r.first->second->term()); } repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t 
lsn) { diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index a19d9a0ec..6bf4faf5a 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -109,7 +109,8 @@ class RaftStateMachine : public nuraft::state_machine { raft_buf_ptr_t pre_commit_ext(const nuraft::state_machine::ext_op_params& params) override; raft_buf_ptr_t commit_ext(const nuraft::state_machine::ext_op_params& params) override; void commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) override; - void rollback(uint64_t lsn, nuraft::buffer&) override { LOGCRITICAL("Unimplemented rollback on: [{}]", lsn); } + void rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) override; + void rollback_ext(const nuraft::state_machine::ext_op_params& params) override; void become_ready(); void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override; diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 9ccc40dfc..169fc7f8a 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -152,6 +152,7 @@ TEST_F(RaftReplDevTest, Resync_From_Non_Originator) { } #if 0 + TEST_F(RaftReplDevTest, Leader_Restart) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); @@ -176,7 +177,6 @@ TEST_F(RaftReplDevTest, Leader_Restart) { g_helper->sync_for_cleanup_start(); } - TEST_F(RaftReplDevTest, Drop_Raft_Entry_Switch_Leader) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); From a0b0910b56cee7f073a43d2ce3877b2b89dae779 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 13 Nov 2024 09:36:18 +0800 Subject: [PATCH 021/130] PushData only pushed to active followers. (#584) * PushData only pushed to active followers. 
If a follower is lagging too far, do not flood it with data from new IOs (new rreq, new LSNs) , reserve the capability for catching up, that follower can request data via FetchData. Signed-off-by: Xiaoxi Chen --- src/lib/common/homestore_config.fbs | 4 ++ .../replication/repl_dev/raft_repl_dev.cpp | 58 ++++++++++++++----- src/lib/replication/repl_dev/raft_repl_dev.h | 1 + 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index cd8858863..708f1475b 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -284,6 +284,10 @@ table Consensus { // Log difference to determine if the follower is in resync mode resync_log_idx_threshold: int64 = 100; + + // Log difference from leader's point of view, to determine if the + // follower is laggy and if so, leader will stop pushing data until it drops under this threshold. + laggy_threshold: int64 = 2000; } table HomeStoreSettings { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index dea117736..2b116f896 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -363,23 +363,30 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list flatbuffers::FlatBufferToString(builder.GetBufferPointer() + sizeof(flatbuffers::uoffset_t), PushDataRequestTypeTable()));*/ - RD_LOGD("Data Channel: Pushing data to all followers: rreq=[{}]", rreq->to_string()); - - group_msg_service() - ->data_service_request_unidirectional(nuraft_mesg::role_regex::ALL, PUSH_DATA, rreq->m_pkts) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, rreq = std::move(rreq)](auto e) { - if (e.hasError()) { - RD_LOGE("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", rreq->to_string(), - e.error()); - handle_error(rreq, RaftReplService::to_repl_error(e.error())); - 
return; + auto peers = get_active_peers(); + auto calls = std::vector< nuraft_mesg::NullAsyncResult >(); + for (auto peer : peers) { + RD_LOGD("Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); + calls.push_back(group_msg_service() + ->data_service_request_unidirectional(peer, PUSH_DATA, rreq->m_pkts) + .via(&folly::InlineExecutor::instance())); + } + folly::collectAllUnsafe(calls).thenValue([this, rreq](auto&& v_res) { + for (auto const& res : v_res) { + if (sisl_likely(res.value())) { + auto r = res.value(); + if (r.hasError()) { + // Just logging PushData error, no action is needed as follower can try by fetchData. + RD_LOGW("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", + rreq->to_string(), r.error()); + } } - // Release the buffer which holds the packets - RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); - rreq->release_fb_builder(); - rreq->m_pkts.clear(); - }); + } + RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); + // Release the buffer which holds the packets + rreq->release_fb_builder(); + rreq->m_pkts.clear(); + }); } void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { @@ -1039,6 +1046,25 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { return pi; } +std::set< replica_id_t > RaftReplDev::get_active_peers() const { + auto repl_status = get_replication_status(); + std::set< replica_id_t > res; + auto my_committed_idx = m_commit_upto_lsn.load(); + uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + ? 
my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + : 0; + for (auto p : repl_status) { + if (p.id_ == m_my_repl_id) { continue; } + if (p.replication_idx_ >= least_active_repl_idx) { + res.insert(p.id_); + } else { + RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}", p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_); + } + } + return res; +} + uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); } nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 5cb1516ac..0d5c8b8d8 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -176,6 +176,7 @@ class RaftReplDev : public ReplDev, bool is_leader() const override; replica_id_t get_leader_id() const override; std::vector< peer_info > get_replication_status() const override; + std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } std::string rdev_name() const { return m_rdev_name; } From 6c748e8d92adbdfb9919078efad14730a8eea3eb Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 13 Nov 2024 09:36:29 +0800 Subject: [PATCH 022/130] Set min_log_gap_to_join to max_int32 and enabled new_joiner_type Signed-off-by: Xiaoxi Chen --- src/lib/common/homestore_config.fbs | 2 +- src/lib/replication/service/raft_repl_service.cpp | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 708f1475b..a335281c5 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -262,7 +262,7 @@ table Consensus { stale_log_gap_lo_threshold: int32 = 30; // Minimum log 
gap a replica has to be from leader before joining the replica set. - min_log_gap_to_join: int32 = 30; + min_log_gap_to_join: int32 = 2147483647; // amount of time in millis to wait on data write before fetch data from remote; wait_data_write_timer_ms: uint64 = 1500 (hotswap); diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 0469d7829..1ec45d9d0 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -99,18 +99,17 @@ void RaftReplService::start() { .with_hb_interval(HS_DYNAMIC_CONFIG(consensus.heartbeat_period_ms)) .with_max_append_size(HS_DYNAMIC_CONFIG(consensus.max_append_batch_size)) .with_log_sync_batch_size(HS_DYNAMIC_CONFIG(consensus.log_sync_batch_size)) - // TODO to fix the log_gap thresholds when adding new member. - // When the option is enabled, new member is doing log sync is stuck after the first batch - // where if the option is disabled, new member is going through append entries and it works. -#if 0 .with_log_sync_stopping_gap(HS_DYNAMIC_CONFIG(consensus.min_log_gap_to_join)) -#endif .with_stale_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_hi_threshold)) .with_fresh_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_lo_threshold)) .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance)) .with_leadership_expiry(HS_DYNAMIC_CONFIG(consensus.leadership_expiry_ms)) .with_reserved_log_items(HS_DYNAMIC_CONFIG(consensus.num_reserved_log_items)) .with_auto_forwarding(false); + // new_joiner_type fully disabled log pack behavior. + // There is no callback available for handling and localizing the log entries within the pack, which could + // result in data corruption. 
+ r_params.use_new_joiner_type_ = true; r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); From 7807ccec20a0b5a3b6cd94c44cc720e7119c993f Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 13 Nov 2024 14:32:26 +0800 Subject: [PATCH 023/130] handle negative log batch size returned by follower (#588) when follower hits some error before appending log entries, it will set batch_size_hint_in_bytes to -1 to ask leader not to send more log entries in the next append_log_req. https://github.com/eBay/NuRaft/blob/eabdeeda538a27370943f79a2b08b5738b697ac3/src/handle_append_entries.cxx#L760 in nuobject case, if a new member is added to a raft group and it tries to append create_shard log entry, which will try to allocate block from the chunks of the pg, before the create_pg log is committed, which will allocate chunks to this pg, an error will happen and the log batch containing create_shard log entry will be wholly rejected and set batch_size_hint_in_bytes to -1 in the response to leader. this pr aims to set the log count in the next batch sent to follower to 1, so that: if the create_pg and create_shard are in the same log batch, the pr will first reject this log batch and leader will send only create_pg in the next batch, which will be accepted by follower, since it will only create this pg. if the create_pg and create_shard are not in the same log batch, and create_shard is trying to allocate block before the pg is created (i.e. before the chunks of this pg are allocated), then, with this pr, follower will reject this batch so that it will give more time to creating pg. create_shard log will be resent in the next batch, and at that moment pg has probably already been successfully created. 
--- .../log_store/home_raft_log_store.cpp | 16 +++++++++++--- .../log_store/home_raft_log_store.h | 22 +++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index dc878924d..5bf676849 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -190,9 +190,7 @@ void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& e // remove all cached entries after this index for (size_t i{0}; i < m_log_entry_cache.size(); ++i) { - if (m_log_entry_cache[i].first > index) { - m_log_entry_cache[i] = std::make_pair(0, nullptr); - } + if (m_log_entry_cache[i].first > index) { m_log_entry_cache[i] = std::make_pair(0, nullptr); } } } @@ -221,6 +219,18 @@ nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore: return out_vec; } +nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > +HomeRaftLogStore::log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes) { + // in nuraft, batch_size_hint_in_bytes < 0 indicates that follower is busy now and does not want to receive any more + // log entries ATM. here we just send one log entry if this happens which is helpful for nuobject case and no harm + // to other case. + if (batch_size_hint_in_bytes < 0) end = start + 1; + + // for the case where batch_size_hint_in_bytes >= 0, we do not take any size check here for now. 
+ // TODO: limit the size of the returned entries by batch_size_hint_in_bytes in the future if necessary + return log_entries(start, end); +} + nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { auto positio_in_cache = index % m_log_entry_cache.size(); { diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index ccf46ef92..3c4c021ef 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -99,12 +99,34 @@ class HomeRaftLogStore : public nuraft::log_store { /** * Get log entries with index [start, end). * + * Return nullptr to indicate error if any log entry within the requested range + * could not be retrieved (e.g. due to external log truncation). + * * @param start The start log index number (inclusive). * @param end The end log index number (exclusive). * @return The log entries between [start, end). */ virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > log_entries(ulong start, ulong end) override; + /** + * Get log entries with index [start, end). + * + * The total size of the returned entries is limited by batch_size_hint. + * + * Return nullptr to indicate error if any log entry within the requested range + * could not be retrieved (e.g. due to external log truncation). + * + * @param start The start log index number (inclusive). + * @param end The end log index number (exclusive). + * @param batch_size_hint_in_bytes Total size (in bytes) of the returned entries, + * see the detailed comment at + * `state_machine::get_next_batch_size_hint_in_bytes()`. + * @return The log entries between [start, end) and limited by the total size + * given by the batch_size_hint_in_bytes. 
+ */ + virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > + log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes = 0) override; + /** * Get the log entry at the specified log index number. * From dee5fed4a25d519188b372c81d140ac09d3e7a55 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Mon, 11 Nov 2024 15:14:57 +0800 Subject: [PATCH 024/130] Checking received data size and reject if not match. We dont need to panic in this case, fetchData can handle this. Signed-off-by: Xiaoxi Chen --- src/lib/replication/repl_dev/raft_repl_dev.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2b116f896..6cf4411dd 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -401,8 +401,12 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto const fb_size = flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t); auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes()); - HS_DBG_ASSERT_EQ(fb_size + push_req->data_size(), incoming_buf.size(), "Size mismatch of data size vs buffer size"); - + if (fb_size + push_req->data_size() != incoming_buf.size()) { + RD_LOGW("Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}", + fb_size, push_req->data_size(), incoming_buf.size()); + rpc_data->send_response(); + return; + } sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()}; sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()}; repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()}; From f7adb1e7f2b7b303b4bbee2ddf4c907afc33b5d4 Mon Sep 17 00:00:00 2001 From: Hooper 
<62418134+Hooper9973@users.noreply.github.com> Date: Thu, 14 Nov 2024 17:47:02 +0800 Subject: [PATCH 025/130] Add application_hint into blk_alloc_hints (#591) Add application_hint to the blk_alloc_hints structure. This change addresses the need for certain users of homestore, such as homeobject, to pass additional hints. The application_hint can be used to specify behavior in the select_chunk interface. --- src/include/homestore/blk.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index b9e22740c..5ee5c06ee 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -251,9 +251,10 @@ VENUM(BlkAllocStatus, uint32_t, struct blk_alloc_hints { blk_temp_t desired_temp{0}; // Temperature hint for the device - std::optional< uint32_t > pdev_id_hint{std::nullopt}; // which physical device to pick (hint if any) - std::optional< chunk_num_t > chunk_id_hint{std::nullopt}; // any specific chunk id to pick for this allocation - std::optional< stream_id_t > stream_id_hint{std::nullopt}; // any specific stream to pick + std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care + std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation + std::optional< stream_id_t > stream_id_hint; // any specific stream to pick + std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device bool is_contiguous{true}; // Should the entire allocation be one contiguous block bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? Mutually exclusive with is_contiguous From a5d6d9b1a5348b70c468b004eed841925b27ec03 Mon Sep 17 00:00:00 2001 From: Sanal Date: Mon, 18 Nov 2024 09:57:46 -0800 Subject: [PATCH 026/130] Disable dynamic repl ut temporarily. 
(#593) --- src/tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index ff6e9296a..6332fd294 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -126,7 +126,7 @@ if (${io_tests}) add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) + # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() @@ -139,7 +139,7 @@ if (${io_tests}) add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") - add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") + # add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") if(${epoll_tests}) SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) From aebbe927ffbf98fcb5dc3e0c8a3e817ada279858 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Mon, 25 Nov 2024 15:59:59 +0800 Subject: [PATCH 027/130] handle RemovedFromCluster event (#594) 1 consume nuraft::cb_func::Type::RemovedFromCluster callback 2 add reset function to allocator/vchunk as a preparation for implementing m_listener->on_destroy() --- src/include/homestore/replication/repl_dev.h | 9 +++-- src/include/homestore/vchunk.h | 1 + src/lib/blkalloc/append_blk_allocator.cpp | 7 ++++ src/lib/blkalloc/append_blk_allocator.h | 33 +++++++++++-------- src/lib/blkalloc/bitmap_blk_allocator.h | 1 + src/lib/blkalloc/blk_allocator.h | 1 + 
src/lib/blkalloc/fixed_blk_allocator.h | 1 + src/lib/blkalloc/varsize_blk_allocator.h | 1 + src/lib/device/vchunk.cpp | 2 ++ .../replication/repl_dev/raft_repl_dev.cpp | 32 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 6 ++-- .../repl_dev/raft_state_machine.cpp | 30 ++++++++++++++--- src/lib/replication/repl_dev/solo_repl_dev.h | 2 ++ src/tests/test_common/raft_repl_test_base.hpp | 6 ++-- src/tests/test_solo_repl_dev.cpp | 2 +- 15 files changed, 100 insertions(+), 34 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index cf0e00a0c..20e9a170f 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -144,7 +144,8 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: MultiBlkId const& local_blkid() const { return m_local_blkid; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } const char* data() const { - DEBUG_ASSERT(m_data != nullptr, "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); + DEBUG_ASSERT(m_data != nullptr, + "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); return r_cast< const char* >(m_data); } repl_req_state_t state() const { return repl_req_state_t(m_state.load()); } @@ -349,7 +350,7 @@ class ReplDevListener { /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources. /// However, it is expected that this call be idempotent. It is possible in rare scenarios that this can be called /// after restart in case crash happened during the destroy. - virtual void on_destroy() = 0; + virtual void on_destroy(const group_id_t& group_id) = 0; /// @brief Called when replace member is performed. 
virtual void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) = 0; @@ -450,6 +451,10 @@ class ReplDev { /// @return Block size virtual uint32_t get_blk_size() const = 0; + /// @brief Gets the last commit lsn of this repldev + /// @return last_commit_lsn + virtual repl_lsn_t get_last_commit_lsn() const = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index 0406d428f..4b69b1332 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -36,6 +36,7 @@ class VChunk { uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; uint64_t size() const; + void reset(); private: shared< Chunk > m_internal_chunk; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 4a4c7fd18..1380a5ff6 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -162,6 +162,13 @@ bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const { return in_bid.blk_num() < get_used_blks(); } +void AppendBlkAllocator::reset() { + m_last_append_offset.store(0); + m_freeable_nblks.store(0); + m_commit_offset.store(0); + m_is_dirty.store(true); +} + bool AppendBlkAllocator::is_blk_alloced_on_disk(BlkId const& bid, bool) const { return bid.blk_num() < m_sb->commit_offset; } diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h index 384a4936b..5e745c33a 100644 --- a/src/lib/blkalloc/append_blk_allocator.h +++ b/src/lib/blkalloc/append_blk_allocator.h @@ -38,21 +38,21 @@ struct append_blk_sb_t { }; #pragma pack() -//class AppendBlkAllocMetrics : public sisl::MetricsGroup { -//public: -// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { -// 
REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); -// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); +// class AppendBlkAllocMetrics : public sisl::MetricsGroup { +// public: +// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { +// REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); +// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); // -// register_me_to_farm(); -// } +// register_me_to_farm(); +// } // -// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; -// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; -// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } -//}; +// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; +// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; +// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } +// }; // // The assumption for AppendBlkAllocator: @@ -108,6 +108,11 @@ class AppendBlkAllocator : public BlkAllocator { std::string to_string() const override; + /** + * @brief : reset the allocator to initial state, so all the blks in this chunk are free. 
+ */ + void reset() override; + void cp_flush(CP* cp) override; void recovery_completed() override {} nlohmann::json get_status(int log_level) const override; @@ -121,7 +126,7 @@ class AppendBlkAllocator : public BlkAllocator { std::atomic< blk_num_t > m_freeable_nblks{0}; // count of blks fragmentedly freed (both on-disk and in-memory) std::atomic< blk_num_t > m_commit_offset{0}; // offset in on-disk version std::atomic< bool > m_is_dirty{false}; - //AppendBlkAllocMetrics m_metrics; + // AppendBlkAllocMetrics m_metrics; superblk< append_blk_sb_t > m_sb; // only cp will be writing to this disk }; diff --git a/src/lib/blkalloc/bitmap_blk_allocator.h b/src/lib/blkalloc/bitmap_blk_allocator.h index 381767bef..a86e08757 100644 --- a/src/lib/blkalloc/bitmap_blk_allocator.h +++ b/src/lib/blkalloc/bitmap_blk_allocator.h @@ -77,6 +77,7 @@ class BitmapBlkAllocator : public BlkAllocator { void cp_flush(CP* cp) override; void recovery_completed() override {} + void reset() override {} blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } blk_num_t get_blks_per_portion() const { return m_blks_per_portion; } diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index b381f71c5..8c64fc8e5 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -158,6 +158,7 @@ class BlkAllocator { virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0; virtual bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const = 0; virtual void recovery_completed() = 0; + virtual void reset() = 0; virtual std::string to_string() const = 0; virtual void cp_flush(CP* cp) = 0; diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h index fa28681f2..01f1e1138 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.h +++ b/src/lib/blkalloc/fixed_blk_allocator.h @@ -41,6 +41,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator { blk_num_t 
available_blks() const override; blk_num_t get_used_blks() const override; blk_num_t get_defrag_nblks() const override; + void reset() override{}; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 1a90de8da..03a507b03 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -222,6 +222,7 @@ class VarsizeBlkAllocator : public BitmapBlkAllocator { blk_num_t get_used_blks() const override; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; + void reset() override{}; nlohmann::json get_metrics_in_json(); private: diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index 26391ac1b..a809450d1 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -25,6 +25,8 @@ const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_ blk_num_t VChunk::get_total_blks() const { return m_internal_chunk->blk_allocator()->get_total_blks(); } +void VChunk::reset() { m_internal_chunk->blk_allocator_mutable()->reset(); } + blk_num_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } blk_num_t VChunk::get_defrag_nblks() const { return m_internal_chunk->blk_allocator()->get_defrag_nblks(); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 6cf4411dd..7b4a407cb 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -914,8 +914,7 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { - HS_LOG_ASSERT(!err, "freeing blkid={} 
upon error failed, potential to cause blk leak", - blkid.to_string()); + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string()); RD_LOGD("Rollback rreq: Releasing blkid={} freed successfully", blkid.to_string()); }); } @@ -1212,7 +1211,7 @@ void RaftReplDev::leave() { // We let the listener know right away, so that they can cleanup persistent structures soonest. This will // reduce the time window of leaked resources if any - m_listener->on_destroy(); + m_listener->on_destroy(group_id()); // Persist that destroy pending in superblk, so that in case of crash before cleanup of resources, it can be done // post restart. @@ -1227,7 +1226,8 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu nuraft::cb_func::Param* param) { auto ret = nuraft::cb_func::ReturnCode::Ok; - if (type == nuraft::cb_func::Type::GotAppendEntryReqFromLeader) { + switch (type) { + case nuraft::cb_func::Type::GotAppendEntryReqFromLeader: { auto raft_req = r_cast< nuraft::req_msg* >(param->ctx); auto const& entries = raft_req->log_entries(); @@ -1276,9 +1276,29 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu sisl::VectorPool< repl_req_ptr_t >::free(reqs); } return {true, ret}; - } else { - return {false, ret}; } + + case nuraft::cb_func::Type::RemovedFromCluster: { + // a node will reach here when : + // 1. it is removed from the cluster and the new config(excluding this node) is being committed on this node + // 2. it is removed from the cluster , but the node is down and new config log(excluding this node) is not + // replicated to this removed node. when the node restart, leader will not send any append entry to this node, + // since it is not a member of the raft group. it will become a candidate and send request-vote request to other + // members of this raft group.
a member will send RemovedFromCluster to the node if this member finds the node + // is no longer a member of the raft group. + + // this will lazily cleanup the group + // TODO:cleanup this repl dev ASAP if necessary. + leave(); + + return {true, ret}; + } + + // TODO: Add more type handler if necessary + default: + break; + } + return {false, ret}; } void RaftReplDev::flush_durable_commit_lsn() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0d5c8b8d8..2bf7cc52c 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -182,7 +182,7 @@ class RaftReplDev : public ReplDev, std::string rdev_name() const { return m_rdev_name; } std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; - repl_lsn_t get_last_commit_lsn() const { return m_commit_upto_lsn.load(); } + repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } bool is_destroy_pending() const; bool is_destroyed() const; @@ -229,9 +229,7 @@ class RaftReplDev : public ReplDev, * * @param num_reserved_entries The number of reserved entries of the replication log. 
*/ - void truncate(uint32_t num_reserved_entries) { - m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); - } + void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 39df73a0e..957134187 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -202,8 +202,32 @@ raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params } void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) { + // when reaching here, the config change log has already been committed, and the new config has been applied to the + // cluster + RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx); - // TODO:add more logic here if necessary + +#ifdef _PRERELEASE + auto& servers_in_new_conf = new_conf->get_servers(); + std::vector< int32_t > server_ids_in_new_conf; + for (auto& server : servers_in_new_conf) + server_ids_in_new_conf.emplace_back(server->get_id()); + + auto my_id = m_rd.server_id(); + + std::ostringstream oss; + auto it = server_ids_in_new_conf.begin(); + if (it != server_ids_in_new_conf.end()) { + oss << *it; + ++it; + } + for (; it != server_ids_in_new_conf.end(); ++it) { + oss << "," << *it; + } + + RD_LOG(INFO, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, + m_rd.group_id_str()); +#endif } void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { @@ -240,9 +264,7 @@ void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { // it is possible a LSN mapped to different rreq in history // due to log overwritten. 
Verify the rreq before removing auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); - if (deleted) { - RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); - } + if (deleted) { RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index cddb94856..911f4bd28 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -56,6 +56,8 @@ class SoloReplDev : public ReplDev { uuid_t group_id() const override { return m_group_id; } + repl_lsn_t get_last_commit_lsn() const override { return 0; } + uint32_t get_blk_size() const override; void cp_flush(CP* cp); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 1ab90143a..889ab72bb 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -306,10 +306,10 @@ class TestReplicatedDB : public homestore::ReplDevListener { boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } - void on_destroy() override { + void on_destroy(const group_id_t& group_id) override { LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - g_helper->unregister_listener(repl_dev()->group_id()); + boost::uuids::to_string(group_id)); + g_helper->unregister_listener(group_id); } void db_write(uint64_t data_size, uint32_t max_size_per_iov) { diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index e446c3cd5..f4365f651 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -136,7 +136,7 @@ class SoloReplDevTest : public testing::Test { LOGINFO("Received error={} on repl_dev", 
enum_name(error)); } void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {} - void on_destroy() override {} + void on_destroy(const group_id_t& group_id) override {} }; class Application : public ReplApplication { From 58882a29ac2e179f45f141f2c523722d81d0aae3 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Tue, 26 Nov 2024 10:54:08 +0800 Subject: [PATCH 028/130] Fix grpc crash (#595) * release data before set m_data_written_promise authored-by: yawzhang --- src/lib/replication/repl_dev/common.cpp | 5 ++++- src/lib/replication/repl_dev/raft_repl_dev.cpp | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 4fcbb0f4e..1c2a8c560 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -164,15 +164,18 @@ bool repl_req_ctx::add_state_if_not_already(repl_req_state_t s) { void repl_req_ctx::clear() { m_header = sisl::blob{}; m_key = sisl::blob{}; - release_data(); m_pkts.clear(); } +// FIXME: Use lock to avoid concurrent release of data. 
void repl_req_ctx::release_data() { m_data = nullptr; // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { + LOGTRACEMOD(replication, "m_pushed_data addr={}, m_rkey={}, m_lsn={}", + static_cast(m_pushed_data.get()), + m_rkey.to_string(), m_lsn); m_pushed_data->send_response(); m_pushed_data = nullptr; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 7b4a407cb..2d93c4070 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -453,9 +453,9 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d RD_DBG_ASSERT(false, "Error in writing data, error_code={}", err.value()); handle_error(rreq, ReplServiceError::DRIVE_WRITE_ERROR); } else { + rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); - rreq->release_data(); const auto data_log_diff_us = push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count() ? get_elapsed_time_us(rreq->created_time(), push_data_rcv_time) @@ -872,9 +872,9 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener + rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); - rreq->release_data(); RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " "total_write_latency_us={}, write_num_pieces={}", From e5bb0f7475402544c51a14d5bf0666d10a7d3ca7 Mon Sep 17 00:00:00 2001 From: Hooper Date: Tue, 26 Nov 2024 10:54:41 +0800 Subject: [PATCH 029/130] Support flexible virtual device creation in `homestore::BlkDataService` with num_chunks or chunk_size. 
Prioritize `num_chunks` over `chunk_size` if both are provided. --- src/include/homestore/blkdata_service.hpp | 8 +++++--- src/lib/blkdata_svc/blkdata_service.cpp | 3 ++- src/lib/homestore.cpp | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index b82ec886b..fff670f44 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -56,17 +56,19 @@ class BlkDataService { /** * @brief Creates a new virtual device with the specified size and block size, using the specified - * block allocator and chunk selector types. The virtual device will be composed of the specified - * number of chunks. + * block allocator and chunk selector types. The virtual device will be composed of a number of chunks. + * Either `num_chunks` or `chunk_size` must be specified. + * Prioritize `num_chunks` over `chunk_size` if both are provided. * * @param size The size of the virtual device, in bytes. * @param blk_size The size of each block in the virtual device, in bytes. * @param alloc_type The type of block allocator to use for the virtual device. * @param chunk_sel_type The type of chunk selector to use for the virtual device. * @param num_chunks The number of chunks to use for the virtual device. + * @param chunk_size The size of chunks to use for the virtual device, in bytes. */ void create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type, - chunk_selector_type_t chunk_sel_type, uint32_t num_chunks); + chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size); /** * @brief Opens a virtual device with the specified virtual device information. 
diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 4acd3d846..5e80ac7e0 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -38,7 +38,7 @@ BlkDataService::~BlkDataService() = default; // first-time boot path void BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type, - chunk_selector_type_t chunk_sel_type, uint32_t num_chunks) { + chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size) { hs_vdev_context vdev_ctx; vdev_ctx.type = hs_vdev_type_t::DATA_VDEV; @@ -48,6 +48,7 @@ void BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_ .vdev_size = size, .num_chunks = num_chunks, .blk_size = blk_size, + .chunk_size = chunk_size, .dev_type = devType, .alloc_type = alloc_type, .chunk_sel_type = chunk_sel_type, diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index c04ff23bf..85ca4aa9b 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -251,11 +251,11 @@ void HomeStore::format_and_start(std::map< ServiceId, hs_format_params >&& forma } else if ((svc_id.type == ServiceType::DATA) && has_data_service()) { m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, - fparams.num_chunks); + fparams.num_chunks, fparams.chunk_size); } else if ((svc_id.type == ServiceType::REPLICATION) && has_repl_data_service()) { m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, - fparams.num_chunks); + fparams.num_chunks, fparams.chunk_size); } } From 112c7dd22a2fe77d436074d035ffb3627f1acd8f Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Wed, 4 Dec 2024 10:06:59 +0800 Subject: [PATCH 030/130] Support Baseline Resync (#596) 
* Support Baseline resync For Nuraft baseline resync, we separate the process into two layers: HomeStore layer and Application layer. We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application. In the HomeStore layer, the leader needs to transmit the DSN to the follower; this is intended to handle the following case: 1. Leader sends snapshot at LSN T1 to follower F1. 2. F1 fully receives the snapshot and is now at T1. 3. Leader yields its leadership, and F1 is elected as leader. In this sequence the incremental resync will not kick in to update the m_next_dsn, and as a result, duplication may occur. --- src/include/homestore/replication/repl_dev.h | 27 ++++++--- .../replication/repl_dev/raft_repl_dev.cpp | 36 ++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 2 + .../repl_dev/raft_state_machine.cpp | 29 ++++++++-- .../replication/repl_dev/raft_state_machine.h | 6 ++ src/tests/test_common/raft_repl_test_base.hpp | 58 +++++++++++++------ src/tests/test_solo_repl_dev.cpp | 4 +- 7 files changed, 130 insertions(+), 32 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 20e9a170f..335cda834 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -46,6 +46,10 @@ VENUM(journal_type_t, uint16_t, HS_CTRL_REPLACE = 3, // Control message to replace a member ) +// magic num comes from the first 8 bytes of 'echo homestore_resync_data | md5sum' +static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327; +static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01; + struct repl_key { int32_t server_id{0}; // Server Id which this req is originated from uint64_t term; // RAFT term number @@ -112,14 +116,23 @@ class nuraft_snapshot_context : public snapshot_context { nuraft::ptr< nuraft::snapshot > snapshot_; }; -struct snapshot_data { +struct snapshot_obj { void* user_ctx{nullptr}; - int64_t
offset{0}; + uint64_t offset{0}; sisl::io_blob_safe blob; bool is_first_obj{false}; bool is_last_obj{false}; }; +// HomeStore has some meta information to be transmitted during the baseline resync. +// Although only the dsn needs to be synced for now, this structure is defined as a general message, so we can easily add data if needed in the future. +struct snp_repl_dev_data { + uint64_t magic_num{HOMESTORE_RESYNC_DATA_MAGIC}; + uint32_t protocol_version{HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1}; + uint32_t crc{0}; + uint64_t dsn{0}; +}; + struct repl_journal_entry; struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::thread_safe_counter >, sisl::ObjLifeCounter< repl_req_ctx > { @@ -368,16 +381,16 @@ class ReplDevListener { /// uses offset given by the follower to the know the current state of the follower. /// Leader sends the snapshot data to the follower in batch. This callback is called multiple /// times on the leader till all the data is transferred to the follower. is_last_obj in - /// snapshot_data will be true once all the data has been trasnferred. After this the raft on + /// snapshot_obj will be true once all the data has been transferred. After this the raft on /// the follower side can do the incremental resync. - virtual int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0; + virtual int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0; /// @brief Called on the follower when the leader sends the data during the baseline resyc. - /// is_last_obj in in snapshot_data will be true once all the data has been transfered. + /// is_last_obj in snapshot_obj will be true once all the data has been transferred. /// After this the raft on the follower side can do the incremental resync.
- virtual void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0; + virtual void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0; - /// @brief Free up user-defined context inside the snapshot_data that is allocated during read_snapshot_data. + /// @brief Free up user-defined context inside the snapshot_obj that is allocated during read_snapshot_obj. virtual void free_user_snp_ctx(void*& user_snp_ctx) = 0; private: diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2d93c4070..72a39a27a 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1491,6 +1491,42 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx handle_commit(rreq, true /* recovery */); } +void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { + snp_repl_dev_data msg; + auto msg_size = sizeof(snp_repl_dev_data); + msg.dsn = m_next_dsn; + auto crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(&msg), msg_size); + RD_LOGD("create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); + msg.crc = crc; + data_out = nuraft::buffer::alloc(msg_size); + std::memcpy(data_out->data_begin(), &msg, msg_size); +} + +bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { + auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); + if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != + HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { + RD_LOGE("Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, msg->protocol_version); + return false; + } + auto received_crc = msg->crc; + RD_LOGD("received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, received_crc); + // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. 
+ msg->crc = 0; + auto computed_crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), + sizeof(snp_repl_dev_data)); + if (received_crc != computed_crc) { + RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc); + return false; + } + if (msg->dsn > m_next_dsn) { + m_next_dsn = msg->dsn; + RD_LOGD("Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); + return true; + } + return true; +} + void RaftReplDev::on_restart() { m_listener->on_restart(); } bool RaftReplDev::is_resync_mode() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 2bf7cc52c..0550858cf 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -285,6 +285,8 @@ class RaftReplDev : public ReplDev, void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum); + void create_snp_resync_data(raft_buf_ptr_t& data_out); + bool apply_snp_resync_data(nuraft::buffer& data); }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 957134187..be5503e4c 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -295,14 +295,22 @@ void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, bool& is_last_obj) { + // For Nuraft baseline resync, we separate the process into two layers: HomeStore layer and Application layer. + // We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application. 
+ if (is_hs_snp_obj(obj_id)) { + // This is the preserved msg for homestore to resync data + m_rd.create_snp_resync_data(data_out); + is_last_obj = false; + return 0; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->user_ctx = user_ctx; snp_data->offset = obj_id; snp_data->is_last_obj = is_last_obj; // Listener will read the snapshot data and we pass through the same. - int ret = m_rd.m_listener->read_snapshot_data(snp_ctx, snp_data); + int ret = m_rd.m_listener->read_snapshot_obj(snp_ctx, snp_data); if (ret < 0) return ret; // Update user_ctx and whether is_last_obj @@ -318,8 +326,16 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, nuraft::buffer& data, bool is_first_obj, bool is_last_obj) { + if (is_hs_snp_obj(obj_id)) { + // Homestore preserved msg + if (m_rd.apply_snp_resync_data(data)) { + obj_id = snp_obj_id_type_app; + LOGDEBUG("apply_snp_resync_data success, next obj_id={}", obj_id); + } + return; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->offset = obj_id; snp_data->is_first_obj = is_first_obj; snp_data->is_last_obj = is_last_obj; @@ -329,7 +345,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); - m_rd.m_listener->write_snapshot_data(snp_ctx, snp_data); + m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); // Update the object offset. 
obj_id = snp_data->offset; @@ -347,7 +363,10 @@ bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { m_rd.set_last_commit_lsn(s.get_last_log_idx()); m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - return m_rd.m_listener->apply_snapshot(snp_ctx); + auto res = m_rd.m_listener->apply_snapshot(snp_ctx); + //make sure the changes are flushed. + hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); + return res; } nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 6bf4faf5a..8f00cec43 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -86,6 +86,10 @@ class StateMachineStore; #define RD_LOGE(...) RD_LOG(ERROR, ##__VA_ARGS__) #define RD_LOGC(...) RD_LOG(CRITICAL, ##__VA_ARGS__) +// For the logic snapshot obj_id, we use the highest bit to indicate the type of the snapshot message. +// 0 is for HS, 1 is for Application. 
+static constexpr uint64_t snp_obj_id_type_app = 1ULL << 63; + using AsyncNotify = folly::SemiFuture< folly::Unit >; using AsyncNotifier = folly::Promise< folly::Unit >; @@ -135,6 +139,8 @@ class RaftStateMachine : public nuraft::state_machine { std::string rdev_name() const; + static bool is_hs_snp_obj(uint64_t obj_id) { return (obj_id & snp_obj_id_type_app) == 0; } + private: void after_precommit_in_leader(const nuraft::raft_server::req_ext_cb_params& params); }; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 889ab72bb..7445568b8 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -182,10 +182,26 @@ class TestReplicatedDB : public homestore::ReplDevListener { return make_async_success<>(); } - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + static int64_t get_next_lsn(uint64_t& obj_id) { + return obj_id & ((1ULL << 63) - 1); + } + static void set_resync_msg_type_bit(uint64_t& obj_id) { + obj_id |= 1ULL << 63; + } + + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + if(RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } + if ((snp_data->offset & snp_obj_id_type_app) == 0) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } - if (snp_data->offset == 0) { + int64_t next_lsn = get_next_lsn(snp_data->offset); + if (next_lsn == 0) { snp_data->is_last_obj = false; snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); LOGINFOMOD(replication, @@ -194,38 +210,37 @@ class TestReplicatedDB : public homestore::ReplDevListener { return 0; } - int64_t next_lsn = snp_data->offset; - std::vector< KeyValuePair > 
kv_snapshot_data; + std::vector< KeyValuePair > kv_snapshot_obj; // we can not use find to get the next element, since if the next lsn is a config lsn , it will not be put into // lsn_index_ and as a result, the find will return the end of the map. so here we use lower_bound to get the // first element to be read and transfered. for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { auto& v = iter->second; - kv_snapshot_data.emplace_back(Key{v.id_}, v); + kv_snapshot_obj.emplace_back(Key{v.id_}, v); LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 10) { break; } + if (kv_snapshot_obj.size() >= 10) { break; } } - if (kv_snapshot_data.size() == 0) { + if (kv_snapshot_obj.size() == 0) { snp_data->is_last_obj = true; LOGINFOMOD(replication, "Snapshot is_last_obj is true"); return 0; } - int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); - sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; - std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); + int64_t kv_snapshot_obj_size = sizeof(KeyValuePair) * kv_snapshot_obj.size(); + sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_obj_size)}; + std::memcpy(blob.bytes(), kv_snapshot_obj.data(), kv_snapshot_obj_size); snp_data->blob = std::move(blob); snp_data->is_last_obj = false; LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - kv_snapshot_data.size()); + kv_snapshot_obj.size()); return 0; } - void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { + void snapshot_obj_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { auto block_size = 
SISL_OPTIONS["block_size"].as< uint32_t >(); auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); @@ -235,21 +250,27 @@ class TestReplicatedDB : public homestore::ReplDevListener { } } - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return; + } + int64_t next_lsn = get_next_lsn(snp_data->offset); auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); auto last_committed_idx = std::dynamic_pointer_cast< RaftReplDev >(repl_dev())->raft_server()->get_committed_log_idx(); - if (snp_data->offset == 0) { + if (next_lsn == 0) { snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", g_helper->replica_num(), snp_data->offset); return; } - size_t kv_snapshot_data_size = snp_data->blob.size(); - if (kv_snapshot_data_size == 0) return; + size_t kv_snapshot_obj_size = snp_data->blob.size(); + if (kv_snapshot_obj_size == 0) return; - size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); + size_t num_items = kv_snapshot_obj_size / sizeof(KeyValuePair); std::unique_lock lk(db_mtx_); auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); for (size_t i = 0; i < num_items; i++) { @@ -261,7 +282,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { // Write to data service and inmem map. 
MultiBlkId out_blkids; if (value.data_size_ != 0) { - snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); + snapshot_obj_write(value.data_size_, value.data_pattern_, out_blkids); value.blkid_ = out_blkids; } inmem_db_.insert_or_assign(key, value); @@ -271,6 +292,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { } snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index f4365f651..86efbcc03 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -110,10 +110,10 @@ class SoloReplDevTest : public testing::Test { AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { return 0; } - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override {} + void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override {} bool apply_snapshot(shared< snapshot_context > context) override { return true; } shared< snapshot_context > last_snapshot() override { return nullptr; } void free_user_snp_ctx(void*& user_snp_ctx) override {} From 68c7da0cda65f3d36c291d84510a102ad57b809e Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Tue, 3 Dec 2024 12:37:53 +0800 Subject: [PATCH 031/130] Implement get_next_batch_size_hint_in_bytes() we use the `byte` as `cnt` as of now. 
Also update the log_entries_ext() which will be called on leader, if hint < 0 means follower want nothing, return an empty vector so that an empty append_entries_req will be sent, to carry the commit_index update and trigger the follower to commit. if hint > 0, respect the cnt that the follower want, this is useful when two logs within same batch has dependency, we can exclude the dependent one. if hint = 0 means control by leader. Signed-off-by: Xiaoxi Chen --- .../log_store/home_raft_log_store.cpp | 29 ++++++++++++++----- .../replication/repl_dev/raft_repl_dev.cpp | 8 +++++ .../repl_dev/raft_state_machine.cpp | 19 ++++++++++++ .../replication/repl_dev/raft_state_machine.h | 4 +++ 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 5bf676849..1e402ac94 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -221,14 +221,27 @@ nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore: nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore::log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes) { - // in nuraft , batch_size_hint_in_bytes < 0 indicats that follower is busy now and do not want to receive any more - // log entries ATM. here we just send one log entry if this happens which is helpful for nuobject case and no harm - // to other case. - if (batch_size_hint_in_bytes < 0) end = start + 1; - - // for the case where batch_size_hint_in_bytes >= 0, we do not take any size check here for now. - // TODO: limit the size of the returned entries by batch_size_hint_in_bytes int the future if necessary - return log_entries(start, end); + // WARNING: we interpret batch_size_hint_in_bytes as count as of now. 
+ auto batch_size_hint_cnt = batch_size_hint_in_bytes; + auto new_end = end; + // batch_size_hint_in_bytes < 0 indicats that follower is busy now and do not want to receive any more log entry. + if (batch_size_hint_cnt < 0) + new_end = start; + else if (batch_size_hint_cnt > 0) { + // limit to the hint, also prevent overflow by a huge batch_size_hint_cnt + if (sisl_unlikely(start + (uint64_t)batch_size_hint_cnt < start)) { + new_end = end; + } else { + new_end = start + (uint64_t)batch_size_hint_cnt; + } + // limit to original end + new_end = std::min(new_end, end); + } + DEBUG_ASSERT(new_end <= end, "new end {} should be <= original end {}", new_end, end); + DEBUG_ASSERT(start <= new_end, "start {} should be <= new_end {}", start, new_end); + REPL_STORE_LOG(TRACE, "log_entries_ext, start={} end={}, hint {}, adjusted range {} ~ {}, cnt {}", start, end, + batch_size_hint_cnt, start, new_end, new_end - start); + return log_entries(start, new_end); } nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 72a39a27a..4be1aa78e 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1261,6 +1261,13 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu auto req = m_state_machine->localize_journal_entry_prepare(*entry); if (req == nullptr) { sisl::VectorPool< repl_req_ptr_t >::free(reqs); + // The hint set here will be used by the next after next appendEntry, the next one + // always go with -1 from NuRraft code. + // + // We are rejecting this log entry, meaning we can accept previous log entries. + // If there is nothing we can accept(i==0), that maens we are waiting for commit + // of previous lsn, set it to 1 in this case. 
+ m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); return {true, nuraft::cb_func::ReturnCode::ReturnNull}; } reqs->emplace_back(std::move(req)); @@ -1275,6 +1282,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu } sisl::VectorPool< repl_req_ptr_t >::free(reqs); } + if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } return {true, ret}; } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index be5503e4c..f6d76505d 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -247,6 +247,25 @@ void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& m_rd.handle_rollback(rreq); } +int64_t RaftStateMachine::get_next_batch_size_hint_in_bytes() { return next_batch_size_hint; } + +int64_t RaftStateMachine::inc_next_batch_size_hint() { + constexpr int64_t next_batch_size_hint_limit = 16; + // set to minimal if previous hint is negative (i.e do not want any log) + if (next_batch_size_hint < 0) { + next_batch_size_hint = 1; + return next_batch_size_hint; + } + // Exponential growth till next_batch_size_hint_limit, set to 0 afterward means leader take control. + next_batch_size_hint = next_batch_size_hint * 2 > next_batch_size_hint_limit ? 
0 : next_batch_size_hint * 2; + return next_batch_size_hint; +} + +int64_t RaftStateMachine::reset_next_batch_size_hint(int64_t new_hint) { + next_batch_size_hint = new_hint; + return next_batch_size_hint; +} + void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb) { for (auto [key, rreq] : m_lsn_req_map) { cb(key, rreq); diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 8f00cec43..2b50fea7b 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -101,6 +101,7 @@ class RaftStateMachine : public nuraft::state_machine { nuraft::ptr< nuraft::buffer > m_success_ptr; // Preallocate the success return to raft // iomgr::timer_handle_t m_wait_blkid_write_timer_hdl{iomgr::null_timer_handle}; bool m_resync_mode{false}; + int64_t next_batch_size_hint{0}; public: RaftStateMachine(RaftReplDev& rd); @@ -116,6 +117,7 @@ class RaftStateMachine : public nuraft::state_machine { void rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) override; void rollback_ext(const nuraft::state_machine::ext_op_params& params) override; void become_ready(); + int64_t get_next_batch_size_hint_in_bytes() override; void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override; int read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, @@ -138,6 +140,8 @@ class RaftStateMachine : public nuraft::state_machine { void iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb); std::string rdev_name() const; + int64_t reset_next_batch_size_hint(int64_t new_hint); + int64_t inc_next_batch_size_hint(); static bool is_hs_snp_obj(uint64_t obj_id) { return (obj_id & snp_obj_id_type_app) == 0; } From 8dfc90cd8e822d883e311cfdfe85aa908ad23d4b Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 4 Dec 
2024 16:52:32 -0700 Subject: [PATCH 032/130] Bump up nuraft_mesg to >=3.7 Nuraft_mesg (<3.7) does not ship the batch size hint. cf. https://github.com/eBay/nuraft_mesg/pull/111 Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index ab86cc420..d1aa519eb 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "5.2.2" + version = "6.5.21" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From 512f3035547619426d33052cd0e1e0f1f7478a47 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Sat, 7 Dec 2024 11:43:07 -0700 Subject: [PATCH 033/130] Add on_repl_devs_init_completed cb. A stable callback is needed regardless of whether any repl_dev has been created. This CB is a good place for the upper layer to recover anything that depends on repl_devs. Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/include/homestore/replication_service.hpp | 4 ++++ src/lib/replication/service/raft_repl_service.cpp | 2 ++ src/tests/test_common/hs_repl_test_common.hpp | 1 + src/tests/test_solo_repl_dev.cpp | 1 + 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index d1aa519eb..4ebb1d8f4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.21" + version = "6.5.22" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index c3e56d9a3..bac805dd5 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -75,6 +75,10 @@ class ReplApplication { // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback.
virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; + // Called after all the repl devs are found upon restart of the homestore instance. + // it is a nice place for upper layer to recovery anything depends on repl_devs + virtual void on_repl_devs_init_completed() = 0; + // Given the uuid of the peer, get their address and port virtual std::pair< std::string, uint16_t > lookup_peer(replica_id_t uuid) const = 0; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 1ec45d9d0..0fd8940e3 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -131,6 +131,8 @@ void RaftReplService::start() { rdev->on_restart(); } m_config_sb_bufs.clear(); + LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); + m_repl_app->on_repl_devs_init_completed(); // Step 5: Start the data and logstore service now. This step is essential before we can ask Raft to join groups etc diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index c9ff71567..7b93cccb2 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -115,6 +115,7 @@ class HSReplTestHelper : public HSTestHelper { create_repl_dev_listener(homestore::group_id_t group_id) override { return helper_.get_listener(group_id); } + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override { uint16_t port; diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 86efbcc03..3865cd2f3 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -152,6 +152,7 @@ class SoloReplDevTest : public testing::Test { shared< ReplDevListener > 
create_repl_dev_listener(uuid_t) override { return std::make_shared< Listener >(m_test); } + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); } replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); } }; From 3249b35ecac13ff75e28c98caa92c155d0145619 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 11 Dec 2024 14:01:10 +0800 Subject: [PATCH 034/130] Calling pre_commit for lsn > dc_lsn. driven by nuraft implementation https://github.com/eBay/NuRaft/blob/1adcc6282109c2ddf1121bbc374d48d303145e39/src/handle_append_entries.cxx#L847-L852 the pre_commit is called once the log is appended to the log store, even before it is persisted. For logs recovered from the log store, it should call pre-commit unconditionally. Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/conanfile.py b/conanfile.py index 4ebb1d8f4..ce4dccda8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.22" + version = "6.5.23" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 4be1aa78e..f34d14464 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1261,12 +1261,12 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu auto req = m_state_machine->localize_journal_entry_prepare(*entry); if (req == nullptr) { sisl::VectorPool< repl_req_ptr_t >::free(reqs); - // The hint set here will be used by the next after next appendEntry, the next one - // always go with -1 from NuRraft code.
- // + // The hint set here will be used by the next after next appendEntry, the next one + // always go with -1 from NuRraft code. + // // We are rejecting this log entry, meaning we can accept previous log entries. - // If there is nothing we can accept(i==0), that maens we are waiting for commit - // of previous lsn, set it to 1 in this case. + // If there is nothing we can accept(i==0), that maens we are waiting for commit + // of previous lsn, set it to 1 in this case. m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); return {true, nuraft::cb_func::ReturnCode::ReturnNull}; } @@ -1485,16 +1485,20 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx rreq->add_state(repl_req_state_t::LOG_FLUSHED); RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string()); + // 2. Pre-commit the log entry as in nuraft pre-commit was called once log appended to logstore. + m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); + + // LSN above dc_lsn we forgot their states, they can either + // a. be committed before, but DC_LSN not yet flushed + // b. not yet committed, might be committed or rollback if (repl_lsn > m_rd_sb->durable_commit_lsn) { // In memory state of these blks is lost. Commit them now to avoid usage of same blk twice. commit_blk(rreq); + // add rreq to state machine, state-machine will decide to commit or rollback this rreq. m_state_machine->link_lsn_to_req(rreq, int64_cast(repl_lsn)); return; } - // 2. Pre-commit the log entry - m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); - // 3. 
Commit the log entry handle_commit(rreq, true /* recovery */); } @@ -1512,8 +1516,8 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); - if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != - HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { + if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || + msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { RD_LOGE("Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, msg->protocol_version); return false; } @@ -1521,8 +1525,8 @@ bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { RD_LOGD("received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, received_crc); // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. msg->crc = 0; - auto computed_crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), - sizeof(snp_repl_dev_data)); + auto computed_crc = + crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), sizeof(snp_repl_dev_data)); if (received_crc != computed_crc) { RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc); return false; From 06d0ec85dab25f864b5f62392f89cb0b4569acab Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Sun, 15 Dec 2024 20:02:04 -0700 Subject: [PATCH 035/130] fix potential bug of home raft log store initialization the home raft log store sets the log_found_cb and log_replay_done_cb in the result future of open_log_store, which is an async call. so there is a chance that when repl_dev starts replaying the log, the above two callbacks are not registered yet, which might lead to some error.
this PR move the register out of the future to the initialization phase of log store, which will avoid this case --- conanfile.py | 2 +- src/include/homestore/logstore_service.hpp | 3 ++- src/lib/logstore/log_dev.cpp | 8 +++++++- src/lib/logstore/log_dev.hpp | 6 +++++- src/lib/logstore/log_store_service.cpp | 5 +++-- src/lib/replication/log_store/home_raft_log_store.cpp | 6 ++---- 6 files changed, 20 insertions(+), 10 deletions(-) diff --git a/conanfile.py b/conanfile.py index ce4dccda8..c9a08e722 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.23" + version = "6.5.25" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index 44ba1ab53..18c1e75e3 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -132,7 +132,8 @@ class LogStoreService { * @return std::shared_ptr< HomeLogStore > */ folly::Future< shared< HomeLogStore > > open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode); + bool append_mode, log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /** * @brief Close the log store instance and free-up the resources diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 7cae168f3..eb51b9db5 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -615,7 +615,9 @@ std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { return lstore; } -folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode) { +folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { 
folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto it = m_id_logstore_map.find(store_id); if (it == m_id_logstore_map.end()) { @@ -624,6 +626,8 @@ folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t sto logstore_info{ .log_store = nullptr, .append_mode = append_mode, + .log_found_cb = log_found_cb, + .log_replay_done_cb = log_replay_done_cb, })); HS_REL_ASSERT_EQ(happened, true, "Unable to insert logstore into id_logstore_map"); } @@ -656,6 +660,8 @@ void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& logstore_info& info = it->second; info.log_store = std::make_shared< HomeLogStore >(shared_from_this(), store_id, info.append_mode, sb.m_first_seq_num); + info.log_store->register_log_found_cb(info.log_found_cb); + info.log_store->register_log_replay_done_cb(info.log_replay_done_cb); info.promise.setValue(info.log_store); } diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index cf09e57bc..2875d7823 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -564,6 +564,8 @@ class log_stream_reader { struct logstore_info { std::shared_ptr< HomeLogStore > log_store; bool append_mode; + log_found_cb_t log_found_cb{nullptr}; + log_replay_done_cb_t log_replay_done_cb{nullptr}; folly::SharedPromise< std::shared_ptr< HomeLogStore > > promise{}; }; @@ -708,7 +710,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { /// @param append_mode Is this log store is append mode or not. If append mode, write_async call fails and only /// append_async calls succeed. 
/// @return future< shared< HomeLogStore > > : Future which will be set with the log store once it is opened - folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode); + folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /// @brief Remove the log store and its associated resources /// @param store_id Store id that was created/opened diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index c44291d69..d72268484 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -272,12 +272,13 @@ std::shared_ptr< HomeLogStore > LogStoreService::create_new_log_store(logdev_id_ } folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode) { + bool append_mode, log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { folly::SharedMutexWritePriority::ReadHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); COUNTER_INCREMENT(m_metrics, logstores_count, 1); - return it->second->open_log_store(store_id, append_mode); + return it->second->open_log_store(store_id, append_mode, log_found_cb, log_replay_done_cb); } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 1e402ac94..dfc40662a 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -97,13 +97,11 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore LOGDEBUGMOD(replication, "Opening existing home 
log_dev={} log_store={}", m_logdev_id, logstore_id); logstore_service().open_logdev(m_logdev_id); m_log_store_future = logstore_service() - .open_log_store(m_logdev_id, logstore_id, true) - .thenValue([this, log_found_cb, log_replay_done_cb](auto log_store) { + .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb) + .thenValue([this](auto log_store) { m_log_store = std::move(log_store); DEBUG_ASSERT_EQ(m_logstore_id, m_log_store->get_store_id(), "Mismatch in passed and create logstore id"); - m_log_store->register_log_found_cb(log_found_cb); - m_log_store->register_log_replay_done_cb(log_replay_done_cb); REPL_STORE_LOG(DEBUG, "Home Log store created/opened successfully"); }); } From 56c8f350362e31cd85a5c95cf7b1fca6023ef424 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 17 Dec 2024 08:19:19 +0800 Subject: [PATCH 036/130] disable restart for destroy-pending repl-dev (#605) --- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 5 ++++- src/lib/logstore/log_store_service.cpp | 5 ++++- .../replication/repl_dev/raft_repl_dev.cpp | 4 +++- .../replication/service/raft_repl_service.cpp | 20 +++++++++++++++++-- 5 files changed, 30 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index c9a08e722..d2ae6b8b4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.25" + version = "6.5.26" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index eb51b9db5..6754a72dd 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -639,7 +639,10 @@ void LogDev::remove_log_store(logstore_id_t store_id) { { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto ret = m_id_logstore_map.erase(store_id); - HS_REL_ASSERT((ret == 1), "try to remove invalid store_id {}-{}", m_logdev_id, store_id); + if (ret == 0) { + 
LOGWARN("try to remove invalid store_id {}-{}", m_logdev_id, store_id); + return; + } } unreserve_store_id(store_id); } diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index d72268484..8d62bdf05 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -285,7 +285,10 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); - HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); + if (it == m_id_logdev_map.end()) { + HS_LOG(WARN, logstore, "logdev id {} doesnt exist", logdev_id); + return; + } it->second->remove_log_store(store_id); COUNTER_DECREMENT(m_metrics, logstores_count, 1); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f34d14464..da8535602 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1197,11 +1197,13 @@ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { retu void RaftReplDev::permanent_destroy() { RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str()); - m_rd_sb.destroy(); m_raft_config_sb.destroy(); m_data_journal->remove_store(); logstore_service().destroy_log_dev(m_data_journal->logdev_id()); m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::PERMANENT_DESTROYED; }); + // we should destroy repl_dev superblk only after all the resources are cleaned up, so that is crash recovery + // occurs, we have a chance to find the stale repl_dev and reclaim all the stale resources. 
+ m_rd_sb.destroy(); } void RaftReplDev::leave() { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 0fd8940e3..2ed7a3bc1 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -128,7 +128,8 @@ void RaftReplService::start() { // We need to first load the repl_dev with its config and then attach the raft config to that repl dev. for (auto const& [buf, mblk] : m_config_sb_bufs) { auto rdev = raft_group_config_found(buf, voidptr_cast(mblk)); - rdev->on_restart(); + // if repl_dev is in destroy_pending state, it will not be loaded. + if (rdev) rdev->on_restart(); } m_config_sb_bufs.clear(); LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); @@ -383,7 +384,22 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } if (rd_sb->destroy_pending == 0x1) { - LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, skipping the load", group_id); + LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, reclaim the stale resource", group_id); + // if we do not add the repl_dev to m_rd_map, it will not be permanently destroyed since gc thread finds the + // pending destroy repl_dev only from m_rd_map. so, we should try to reclaim all the repl_dev stale resources + // here. + + // 1 since we permanantly destroy the repl_dev here, it will not join_raft group where raft_server will be + // created. hence , no need to detroy it through nuraft_mesg, where raft_server will be shutdown. + // 2 m_raft_config_sb will be destroyed in raft_group_config_found() method if repl_dev is is not found, so + // skip it. + + // 3 logdev will be destroyed in delete_unopened_logdevs() if we don't open it(create repl_dev) here, so skip + // it. + + // 4 destroy the superblk, and after this, the repl_dev will not be loaded and found again. 
+ rd_sb.destroy(); + return; } From bcf6f6ee177db210c27576f2019d39ba77572b2d Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 17 Dec 2024 19:14:35 -0700 Subject: [PATCH 037/130] fix twice call of leave Removed From cluster will be called twice when committing config_change log and journal_type_t::HS_CTRL_DESTROY in the moved out member, so the destroy future will be setvalue twice , which will lead to an error. this PR fixes this bug --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 31 +++++++++---------- src/lib/replication/repl_dev/raft_repl_dev.h | 4 ++- src/tests/test_common/raft_repl_test_base.hpp | 10 ++---- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/conanfile.py b/conanfile.py index d2ae6b8b4..a6633a756 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.26" + version = "6.5.27" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index da8535602..d92464c8b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1183,7 +1183,7 @@ nuraft::ptr< nuraft::log_store > RaftReplDev::load_log_store() { return m_data_j int32_t RaftReplDev::server_id() { return m_raft_server_id; } -bool RaftReplDev::is_destroy_pending() const { return (m_rd_sb->destroy_pending == 0x1); } +bool RaftReplDev::is_destroy_pending() const { return (*m_stage.access().get() == repl_dev_stage_t::DESTROYED); } bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl_dev_stage_t::PERMANENT_DESTROYED); } /////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// @@ -1207,6 +1207,19 @@ void RaftReplDev::permanent_destroy() { } void RaftReplDev::leave() { + // this will be called in 3 cases : + // 1. 
commit log entry of journal_type_t::HS_CTRL_DESTROY + // 2. it is removed from the cluster and the new config(excluding this node) is being committed on this node + // 3. it is removed from the cluster, but the node is down and new config log(excluding this node) is not + // replicated to this removed node. when the node restarts, leader will not send any append entry to this node, + // since it is not a member of the raft group. it will become a candidate and send request-vote request to other + // members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node + // is no longer a member of the raft group. + + // leave() will never be called concurrently, since config change and journal_type_t::HS_CTRL_DESTROY are all log + // entry, which will be committed sequentially. + if (is_destroy_pending()) return; + // We update that this repl_dev in destroyed state, actual clean up of resources happen in reaper thread later m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYED; }); m_destroyed_time = Clock::now(); @@ -1288,21 +1301,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu return {true, ret}; } - case nuraft::cb_func::Type::RemovedFromCluster: { - // a node will reach here when : - // 1. it is removed from the cluster and the new config(excluding this node) is being committed on this node - // 2. it is removed from the cluster , but the node is down and new config log(excluding this node) is not - // replicated to this removed node. when the node restart, leader will not send any append entry to this node, - // since it is not a member of the raft group. it will become a condidate and send request-vote request to other - // members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node - // is no longer a member of the raft group. - - // this will lazily cleanup the group - // TODO:cleanup this repl dev ASAP if necessary.
- leave(); - - return {true, ret}; - } + // RemovedFromCluster will be handled in nuraft_mesg::generic_raft_event_handler where leave() is called // TODO: Add more type handler if necessary default: diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0550858cf..9e29a5737 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -229,7 +229,9 @@ class RaftReplDev : public ReplDev, * * @param num_reserved_entries The number of reserved entries of the replication log. */ - void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } + void truncate(uint32_t num_reserved_entries) { + m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); + } void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 7445568b8..21d5fa3f2 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -182,16 +182,12 @@ class TestReplicatedDB : public homestore::ReplDevListener { return make_async_success<>(); } - static int64_t get_next_lsn(uint64_t& obj_id) { - return obj_id & ((1ULL << 63) - 1); - } - static void set_resync_msg_type_bit(uint64_t& obj_id) { - obj_id |= 1ULL << 63; - } + static int64_t get_next_lsn(uint64_t& obj_id) { return obj_id & ((1ULL << 63) - 1); } + static void set_resync_msg_type_bit(uint64_t& obj_id) { obj_id |= 1ULL << 63; } int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - if(RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { LOGERRORMOD(replication, "invalid snapshot offset={}", 
snp_data->offset); return -1; } From 85c50e6854027600e9a0ac6faf305db2bd7c39b7 Mon Sep 17 00:00:00 2001 From: Sanal Date: Wed, 18 Dec 2024 08:59:07 -0800 Subject: [PATCH 038/130] Add lock for log dev read api's. (#612) Avoid concurrent access to journal vdev for truncate, write and read api's from different threads. --- src/lib/logstore/log_dev.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 6754a72dd..313622895 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -262,6 +262,7 @@ int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, } log_buffer LogDev::read(const logdev_key& key) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) { @@ -290,6 +291,7 @@ log_buffer LogDev::read(const logdev_key& key) { } void LogDev::read_record_header(const logdev_key& key, serialized_log_record& return_record_header) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) LOGERROR("Failed to read from Journal vdev log_dev={} {} {}", m_logdev_id, ec.value(), ec.message()); From 4ff42feff42fd32e9e0c1a50e8c8a42a12dc550f Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 12 Dec 2024 15:20:01 +0800 Subject: [PATCH 039/130] Ensure Consistent LSN Before Opening for Traffic in Raft Group We identified a gap when majority members in a Raft group are down. To save IO operations, we do not persist the last_commit_idx for every commit but instead at regular intervals. Consequently, upon reboot, we may not reflect the latest commit, leaving some logs in the state machine waiting for re-commitment. 
For instance, if we committed up to LSN 103 but only persisted up to LSN 100, then LSNs 100-103 will remain in the log-store, awaiting re-commitment from the leader. If all members restart after a disaster, they face the following state: - [S1]: commit_idx 100, last_log {idx = 105, term = 1} - S2: commit_idx 100, last_log {idx = 103, term = 1} - S3: commit_idx 100, last_log {idx = 103, term = 1} If S1 opens for traffic at this point, previously committed LSN 102 might return NOT_FOUND to clients due to the uncommitted state. Proposed Solution: - Mark last_log_idx as `traffic_ready_lsn` in the BECOME_LEADER callback. In the example above, it is 105 if S1 becomes the leader. - The leader will not accept IO until it commits up to this `traffic_ready_lsn` (105), ensuring correctness by over-committing. - The HO will call `repl_dev->is_ready_for_traffic()` for each IO. - On followers, the traffic_ready_lsn is zero so it allows all. - On the leader, all requests are rejected until it commits to the `traffic_ready_lsn` (105 in this example).
Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 9 ++++-- src/lib/replication/repl_dev/common.cpp | 3 +- .../replication/repl_dev/raft_repl_dev.cpp | 23 +++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 29 +++++++++++++++++-- .../repl_dev/raft_state_machine.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 1 + src/tests/test_common/raft_repl_test_base.hpp | 11 ++++++- 8 files changed, 66 insertions(+), 14 deletions(-) diff --git a/conanfile.py b/conanfile.py index a6633a756..89c254d4a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.27" + version = "6.5.28" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 335cda834..ec8344be0 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -124,8 +124,9 @@ struct snapshot_obj { bool is_last_obj{false}; }; -//HomeStore has some meta information to be transmitted during the baseline resync, -//Although now only dsn needs to be synced, this structure is defined as a general message, and we can easily add data if needed in the future. +// HomeStore has some meta information to be transmitted during the baseline resync, +// Although now only dsn needs to be synced, this structure is defined as a general message, and we can easily add data +// if needed in the future. struct snp_repl_dev_data { uint64_t magic_num{HOMESTORE_RESYNC_DATA_MAGIC}; uint32_t protocol_version{HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1}; @@ -468,6 +469,10 @@ class ReplDev { /// @return last_commit_lsn virtual repl_lsn_t get_last_commit_lsn() const = 0; + /// @brief if this replica is ready for accepting client IO. 
+ /// @return true if ready, false otherwise + virtual bool is_ready_for_traffic() const = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 1c2a8c560..e5b34dbcd 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -174,8 +174,7 @@ void repl_req_ctx::release_data() { m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { LOGTRACEMOD(replication, "m_pushed_data addr={}, m_rkey={}, m_lsn={}", - static_cast(m_pushed_data.get()), - m_rkey.to_string(), m_lsn); + static_cast< void* >(m_pushed_data.get()), m_rkey.to_string(), m_lsn); m_pushed_data->send_response(); m_pushed_data = nullptr; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index d92464c8b..2449f7833 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1237,8 +1237,7 @@ void RaftReplDev::leave() { m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete } -std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nuraft::cb_func::Type type, - nuraft::cb_func::Param* param) { +nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, nuraft::cb_func::Param* param) { auto ret = nuraft::cb_func::ReturnCode::Ok; switch (type) { @@ -1283,7 +1282,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu // If there is nothing we can accept(i==0), that maens we are waiting for commit // of previous lsn, set it to 1 in this case. 
m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); - return {true, nuraft::cb_func::ReturnCode::ReturnNull}; + return nuraft::cb_func::ReturnCode::ReturnNull; } reqs->emplace_back(std::move(req)); } @@ -1298,7 +1297,21 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu sisl::VectorPool< repl_req_ptr_t >::free(reqs); } if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } - return {true, ret}; + return ret; + } + case nuraft::cb_func::Type::JoinedCluster: + RD_LOGD("Raft channel: Received JoinedCluster, implies become_follower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + case nuraft::cb_func::Type::BecomeFollower: { + RD_LOGD("Raft channel: Received BecomeFollower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::Type::BecomeLeader: { + RD_LOGD("Raft channel: Received BecomeLeader"); + become_leader_cb(); + return nuraft::cb_func::ReturnCode::Ok; } // RemovedFromCluster will be handled in nuraft_mesg::generic_raft_event_handler where leave() is called @@ -1307,7 +1320,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu default: break; } - return {false, ret}; + return nuraft::cb_func::ReturnCode::Ok; } void RaftReplDev::flush_durable_commit_lsn() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 9e29a5737..e9ec2a1ad 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -137,6 +137,10 @@ class RaftReplDev : public ReplDev, std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly written, to track flushes std::atomic< repl_lsn_t > m_compact_lsn{0}; // LSN upto which it was compacted, it is used to track where to + // The `traffic_ready_lsn` variable holds the Log Sequence Number (LSN) up to which + // the state machine should committed to before 
accepting traffic. This threshold ensures that + // all potential committed log be committed before handling incoming requests. + std::atomic< repl_lsn_t > m_traffic_ready_lsn{0}; std::mutex m_sb_mtx; // Lock to protect the repl dev superblock @@ -187,6 +191,13 @@ class RaftReplDev : public ReplDev, bool is_destroy_pending() const; bool is_destroyed() const; Clock::time_point destroyed_time() const { return m_destroyed_time; } + bool is_ready_for_traffic() const { + auto committed_lsn = m_commit_upto_lsn.load(); + auto gate = m_traffic_ready_lsn.load(); + bool ready = committed_lsn >= gate; + if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); } + return ready; + } //////////////// Accessor/shortcut methods /////////////////////// nuraft_mesg::repl_service_ctx* group_msg_service(); @@ -206,6 +217,20 @@ class RaftReplDev : public ReplDev, cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); + void become_leader_cb() { + auto new_gate = raft_server()->get_last_log_idx(); + repl_lsn_t existing_gate = 0; + if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) { + // was a follower, m_traffic_ready_lsn should be zero on follower. + RD_REL_ASSERT(existing_gate == 0, "existing gate should be zero"); + } + RD_LOGD("become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); + }; + void become_follower_cb() { + // m_traffic_ready_lsn should be zero on follower. 
+ m_traffic_ready_lsn.store(0); + RD_LOGD("become_follower_cb setting traffic_ready_lsn to 0"); + } /// @brief This method is called when the data journal is compacted /// @@ -270,8 +295,8 @@ class RaftReplDev : public ReplDev, std::shared_ptr< nuraft::state_machine > get_state_machine() override; void permanent_destroy() override; void leave() override; - std::pair< bool, nuraft::cb_func::ReturnCode > handle_raft_event(nuraft::cb_func::Type, - nuraft::cb_func::Param*) override; + + nuraft::cb_func::ReturnCode raft_event(nuraft::cb_func::Type, nuraft::cb_func::Param*) override; private: shared< nuraft::log_store > data_journal() { return m_data_journal; } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index f6d76505d..8724d1c4b 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -383,7 +383,7 @@ bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); auto res = m_rd.m_listener->apply_snapshot(snp_ctx); - //make sure the changes are flushed. + // make sure the changes are flushed. 
hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); return res; } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 911f4bd28..e5f33fb63 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -53,6 +53,7 @@ class SoloReplDev : public ReplDev { std::vector< peer_info > get_replication_status() const override { return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}}; } + bool is_ready_for_traffic() const override { return true; } uuid_t group_id() const override { return m_group_id; } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 21d5fa3f2..19a346f5a 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -350,6 +350,10 @@ class TestReplicatedDB : public homestore::ReplDevListener { void validate_db_data() { g_helper->runner().set_num_tasks(inmem_db_.size()); + while (!repl_dev()->is_ready_for_traffic()) { + LOGINFO("not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); @@ -554,7 +558,8 @@ class RaftReplDevTestBase : public testing::Test { if (dbs_[0]->repl_dev() == nullptr) return; do { - auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + auto repl_dev = dbs_[0]->repl_dev(); + auto leader_uuid = repl_dev->get_leader_id(); if (leader_uuid.is_nil()) { LOGINFO("Waiting for leader to be elected"); @@ -562,6 +567,10 @@ class RaftReplDevTestBase : public testing::Test { } else if (leader_uuid == g_helper->my_replica_id()) { LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, boost::uuids::to_string(g_helper->my_replica_id())); + if 
(!repl_dev->is_ready_for_traffic()) { + LOGINFO("leader is not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); g_helper->runner().set_num_tasks(num_entries); From 9ef2c633133297f4e52cc57085655710ac8c30a5 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:59:49 +0800 Subject: [PATCH 040/130] read_logical_snp_obj: pass user_ctx to prevent memleak (#617) Ensure user_ctx is passed up to NuRaft regardless of the return value to enable the cleanup of the allocated context object. Signed-off-by: Jilong Kou --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_state_machine.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 89c254d4a..b844d1ec8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.28" + version = "6.5.29" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 8724d1c4b..a02964da3 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -330,10 +330,9 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, // Listener will read the snapshot data and we pass through the same. int ret = m_rd.m_listener->read_snapshot_obj(snp_ctx, snp_data); + user_ctx = snp_data->user_ctx; // Have to pass the user_ctx to NuRaft even if ret<0 to get it freed later if (ret < 0) return ret; - // Update user_ctx and whether is_last_obj - user_ctx = snp_data->user_ctx; is_last_obj = snp_data->is_last_obj; // We are doing a copy here. 
From c2eef300d1db7da16445a094c0cc915a3e2a848d Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 10 Dec 2024 18:15:30 +0800 Subject: [PATCH 041/130] 1. fix handle_error and only trigger handle_error for timeout rreqs in handle_raft_event 2. include concept 'volatile' vs 'non-volatile' for log 3. update replay logic : add BLK_ALLOCATED and DATA_RECEIVED only when data linked and received --- src/include/homestore/replication/repl_dev.h | 3 ++ .../replication/log_store/repl_log_store.cpp | 13 ++++++ .../replication/repl_dev/raft_repl_dev.cpp | 44 ++++++++++++++----- src/lib/replication/repl_dev/raft_repl_dev.h | 9 +++- .../repl_dev/raft_state_machine.cpp | 8 +++- 5 files changed, 62 insertions(+), 15 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index ec8344be0..db79b5f9c 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -152,6 +152,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: int64_t lsn() const { return m_lsn; } bool is_proposer() const { return m_is_proposer; } journal_type_t op_code() const { return m_op_code; } + bool is_volatile() const { return m_is_volatile.load(); } sisl::blob const& header() const { return m_header; } sisl::blob const& key() const { return m_key; } @@ -222,6 +223,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: void set_remote_blkid(RemoteBlkId const& rbid) { m_remote_blkid = rbid; } void set_local_blkid(MultiBlkId const& lbid) { m_local_blkid = lbid; } // Only used during recovery + void set_is_volatile(bool is_volatile) { m_is_volatile.store(is_volatile); } void set_lsn(int64_t lsn); void add_state(repl_req_state_t s); bool add_state_if_not_already(repl_req_state_t s); @@ -248,6 +250,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool m_is_proposer{false}; // Is the repl_req proposed by this 
node Clock::time_point m_start_time; // Start time of the request journal_type_t m_op_code{journal_type_t::HS_DATA_INLINED}; // Operation code for this request + std::atomic< bool > m_is_volatile{true}; // Is the log still in memory and not flushed to disk yet /////////////// Data related section ///////////////// MultiBlkId m_local_blkid; // Local BlkId for the data diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 36cec9370..97d70ff92 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -93,6 +93,19 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } } } + + // Convert volatile logs to non-volatile logs in state machine + for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { + auto rreq = m_sm.lsn_to_req(lsn); + if (rreq != nullptr) { + if (rreq->has_state(repl_req_state_t::ERRORED)) { + RD_LOGE("Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string()); + continue; + } + rreq->set_is_volatile(false); + } + } + sisl::VectorPool< repl_req_ptr_t >::free(reqs); sisl::VectorPool< repl_req_ptr_t >::free(proposer_reqs); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2449f7833..1270ed761 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -594,7 +594,8 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< }); } -bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms) { +bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs) { std::vector< folly::Future< folly::Unit > > futs; std::vector< repl_req_ptr_t > 
only_wait_reqs; only_wait_reqs.reserve(rreqs.size()); @@ -621,14 +622,23 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre // We are yet to support reactive fetch from remote. if (is_resync_mode()) { - check_and_fetch_remote_data(std::move(only_wait_reqs)); + check_and_fetch_remote_data(only_wait_reqs); } else { - m_repl_svc.add_to_fetch_queue(shared_from_this(), std::move(only_wait_reqs)); + m_repl_svc.add_to_fetch_queue(shared_from_this(), only_wait_reqs); } // block waiting here until all the futs are ready (data channel filled in and promises are made); - auto all_futs = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)); - return (all_futs.isReady()); + auto all_futs_ready = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)).isReady(); + if (!all_futs_ready && timeout_rreqs != nullptr) { + timeout_rreqs->clear(); + for (size_t i{0}; i < futs.size(); ++i) { + if (!futs[i].isReady()) { + timeout_rreqs->emplace_back(only_wait_reqs[i]); + } + } + all_futs_ready = timeout_rreqs->empty(); + } + return all_futs_ready; } void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs) { @@ -953,18 +963,25 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } + RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (!rreq->add_state_if_not_already(repl_req_state_t::ERRORED)) { - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE("Raft Channel: Error has been added for rreq=[{}] error={}", rreq->to_string(), err); return; } // Remove from the map and thus its no longer accessible from applier_create_req m_repl_key_req_map.erase(rreq->rkey()); - if (rreq->op_code() == journal_type_t::HS_DATA_INLINED) { + // Ensure non-volatile lsn not exist 
because handle_error should not be called after append entries. + auto exist_rreq = m_state_machine->lsn_to_req(rreq->lsn()); + if (exist_rreq != nullptr && !exist_rreq->is_volatile()) { + HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", + rreq->lsn(), exist_rreq->to_string()); + } + + if (rreq->op_code() == journal_type_t::HS_DATA_LINKED) { // Free the blks which is allocated already - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([blkid](auto&& err) { @@ -1288,8 +1305,9 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, } // Wait till we receive the data from its originator for all the requests - if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms))) { - for (auto const& rreq : *reqs) { + std::vector< repl_req_ptr_t > timeout_rreqs; + if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { + for (auto const& rreq : timeout_rreqs) { handle_error(rreq, ReplServiceError::TIMEOUT); } ret = nuraft::cb_func::ReturnCode::ReturnNull; @@ -1480,11 +1498,15 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RD_DBG_ASSERT(happened, "rreq already exists for rkey={}", rkey.to_string()); uint32_t data_size{0u}; + // If the data is linked and value_size is non-zero, it means blks have been allocated for data. + // Since the log is flushed after data is written, the data has already been received. 
if ((jentry->code == journal_type_t::HS_DATA_LINKED) && (jentry->value_size > 0)) { MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); data_size = entry_blkid.blk_count() * get_blk_size(); rreq->set_local_blkid(entry_blkid); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->add_state(repl_req_state_t::DATA_RECEIVED); } rreq->set_lsn(repl_lsn); @@ -1492,8 +1514,6 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx rreq->set_lentry(lentry); rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size); // we load the log from log device, implies log flushed. We only flush log after data is written to data device. - rreq->add_state(repl_req_state_t::BLK_ALLOCATED); - rreq->add_state(repl_req_state_t::DATA_RECEIVED); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->add_state(repl_req_state_t::LOG_RECEIVED); rreq->add_state(repl_req_state_t::LOG_FLUSHED); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index e9ec2a1ad..28706f716 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -306,8 +306,15 @@ class RaftReplDev : public ReplDev, void fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs); void handle_fetch_data_response(sisl::GenericClientResponse response, std::vector< repl_req_ptr_t > rreqs); bool is_resync_mode(); + + /** + * \brief This method handles errors that occur during append entries or data receiving. + * It should not be called after the append entries phase. 
+ */ void handle_error(repl_req_ptr_t const& rreq, ReplServiceError err); - bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms); + + bool wait_for_data_receive(std::vector < repl_req_ptr_t > const &rreqs, uint64_t timeout_ms, + std::vector < repl_req_ptr_t > *timeout_rreqs = nullptr); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index a02964da3..a91e947ac 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -291,8 +291,12 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { rreq->add_state(repl_req_state_t::LOG_RECEIVED); // reset the rreq created_at time to now https://github.com/eBay/HomeStore/issues/506 rreq->set_created_time(); - [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); - RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list, exist_term={}", lsn, r.first->second->term()); + auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); + if (!r.second) { + RD_LOG(ERROR, "lsn={} already in precommit list, exist_term={}, is_volatile={}", + lsn, r.first->second->term(), r.first->second->is_volatile()); + // TODO: we need to think about the case where volatile is in the map already, is it safe to overwrite it? 
+ } } repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t lsn) { From ff4ae7948e2240280094c19522f3f5bab72b9603 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:38:33 +0800 Subject: [PATCH 042/130] Duplication Handling (#611) * Duplication Handling in Blob Write This commit addresses duplication issues on the follower side caused by resync from the leader, it mainly happens when resend snapshot mesg during baseline resync and apply log after snapshot completion. This helps avoid unnecessary GC due to duplicated data. Key Changes: - Utilize allocation hints to check data existence via the application listener. - Introduce `committed_blk_id` in `blk_alloc_hints` to indicate already allocated and committed blocks and pass it from application to HS, preventing reallocation and recommitment. - In `alloc_local_blks()`, if `committed_blk_id` is returned, also add states `DATA_RECEIVED`, `DATA_WRITTEN`, and `DATA_COMMITTED` to skip async_write() and commit_blk(). On the leader side (`RaftReplDev::async_alloc_write`), duplication is treated as an error, as the leader should not propose duplicate data, which may result from mistakes. * Add UT and bump up to 6.6.0 * Move alloc blk logic into rreq.init This commit addresses the issue encountered during a restart. In the previous commit, the DATA_COMMITTED state was used to skip the commit_blk operation. However, after restart, repl_req state DATA_COMMITTED is lost. In this case, if the lsn of log entry is greater than durable_commit_lsn, the data will be committed directly without the opportunity to find if the data is duplicated, as a result, commit_blk may fail due to duplication. 
--- conanfile.py | 2 +- src/include/homestore/blk.h | 1 + .../homestore/replication/repl_decls.h | 1 + src/include/homestore/replication/repl_dev.h | 7 +- src/lib/replication/repl_dev/common.cpp | 30 ++++++- .../replication/repl_dev/raft_repl_dev.cpp | 63 ++++++++------- .../replication/repl_dev/solo_repl_dev.cpp | 18 ++--- src/tests/test_common/raft_repl_test_base.hpp | 78 ++++++++++++++++++- src/tests/test_raft_repl_dev.cpp | 48 ++++++++++++ 9 files changed, 199 insertions(+), 49 deletions(-) diff --git a/conanfile.py b/conanfile.py index b844d1ec8..3ccfac994 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.29" + version = "6.6.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index 5ee5c06ee..dc8a1f3de 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -253,6 +253,7 @@ struct blk_alloc_hints { blk_temp_t desired_temp{0}; // Temperature hint for the device std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation + std::optional committed_blk_id; // blk id indicates the blk was already allocated and committed, don't allocate and commit again std::optional< stream_id_t > stream_id_hint; // any specific stream to pick std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 192a418bc..1cada6c35 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -30,6 +30,7 @@ VENUM(ReplServiceError, 
int32_t, NOT_IMPLEMENTED = -10001, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, + DATA_DUPLICATED = -20002, FAILED = -32768); // clang-format on diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index db79b5f9c..d05be3fde 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -36,7 +36,8 @@ VENUM(repl_req_state_t, uint32_t, DATA_WRITTEN = 1 << 2, // Data has been written to the storage LOG_RECEIVED = 1 << 3, // Log is received and waiting for data LOG_FLUSHED = 1 << 4, // Log has been flushed - ERRORED = 1 << 5 // Error has happened and cleaned up + ERRORED = 1 << 5, // Error has happened and cleaned up + DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip commit_blk ) VENUM(journal_type_t, uint16_t, @@ -142,8 +143,8 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: public: repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); - void init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size); + ReplServiceError init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); /////////////////////// All getters /////////////////////// repl_key const& rkey() const { return m_rkey; } diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index e5b34dbcd..b2ba6bce4 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -6,11 +6,12 @@ #include #include "replication/repl_dev/common.h" #include +#include namespace homestore { -void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size) { 
+ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener) { m_rkey = std::move(rkey); #ifndef NDEBUG if (data_size > 0) { @@ -24,6 +25,18 @@ void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, m_header = user_header; m_key = key; m_is_jentry_localize_pending = (!is_proposer && (data_size > 0)); // Pending on the applier and with linked data + + // We need to allocate the block if the req has data linked, since entry doesn't exist or if it exist, two threads(data channel and raft channel) are trying to do the same + // thing. So take state mutex and allocate the blk + std::unique_lock< std::mutex > lg(m_state_mtx); + if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERROR("Allocate blk for rreq failed error={}", alloc_status); + } + return alloc_status; + } + return ReplServiceError::OK; } repl_req_ctx::~repl_req_ctx() { @@ -91,6 +104,19 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size); if (hints_result.hasError()) { return hints_result.error(); } + if (hints_result.value().committed_blk_id.has_value()) { + //if the committed_blk_id is already present, use it and skip allocation and commitment + LOGINFO("For Repl_key=[{}] data already exists, skip", rkey().to_string()); + m_local_blkid = hints_result.value().committed_blk_id.value(); + add_state(repl_req_state_t::BLK_ALLOCATED); + add_state(repl_req_state_t::DATA_RECEIVED); + add_state(repl_req_state_t::DATA_WRITTEN); + add_state(repl_req_state_t::DATA_COMMITTED); + m_data_received_promise.setValue(); + m_data_written_promise.setValue(); + return ReplServiceError::OK; + } + auto status = 
data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints_result.value(), m_local_blkid); if (status != BlkAllocStatus::SUCCESS) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 1270ed761..a39d6035b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -184,7 +184,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); rreq->init( repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0); + journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { @@ -251,7 +251,7 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // here, we set the dsn to a new one , which is definitely unique in the follower, so that the new rreq will not // have a conflict with the old rreq. rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0); + journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { @@ -292,25 +292,28 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + auto status = rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, - header, key, data.size); + header, key, data.size, m_listener); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); + if (status != ReplServiceError::OK) { + RD_LOGD("Initializing rreq failed error={}, failing this req", status); + handle_error(rreq, status); + return; + } + // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { - push_data_to_all_followers(rreq, data); - - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, data.size); - if (status != ReplServiceError::OK) { - RD_LOGD("Allocating blks failed error={}, failing this req", status); - handle_error(rreq, status); + if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { + RD_LOGD("data blks has already been allocated and committed, failing this req"); + handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } + push_data_to_all_followers(rreq, data); COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); COUNTER_INCREMENT(m_metrics, outstanding_data_write_cnt, 1); @@ -498,32 +501,24 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } } - // We need to allocate the block, since entry doesn't exist or if it exist, two threads are trying to do the same - // thing. So take state mutex and allocate the blk - std::unique_lock< std::mutex > lg(rreq->m_state_mtx); - rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size); - - // There is no data portion, so there is not need to allocate + // rreq->init will allocate the block if it has linked data. 
+ auto status = rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); if (!rreq->has_linked_data()) { return rreq; } - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { return rreq; } - - auto alloc_status = rreq->alloc_local_blks(m_listener, data_size); #ifdef _PRERELEASE if (is_data_channel) { if (iomgr_flip::instance()->test_flip("fake_reject_append_data_channel")) { LOGINFO("Data Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; + status = ReplServiceError::NO_SPACE_LEFT; } } else { if (iomgr_flip::instance()->test_flip("fake_reject_append_raft_channel")) { LOGINFO("Raft Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; + status = ReplServiceError::NO_SPACE_LEFT; } } #endif - - if (alloc_status != ReplServiceError::OK) { - RD_LOGE("For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), alloc_status); + if (status != ReplServiceError::OK) { + RD_LOGD("For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), status); // Do not call handle_error here, because handle_error is for rreq which needs to be terminated. This one can be // retried. return nullptr; @@ -930,8 +925,8 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { } } -void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { - commit_blk(rreq); + void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { + if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } // Remove the request from repl_key map. 
m_repl_key_req_map.erase(rreq->rkey()); @@ -979,7 +974,12 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", rreq->lsn(), exist_rreq->to_string()); } - + if (err == ReplServiceError::DATA_DUPLICATED) { + RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + m_listener->on_error(err, rreq->header(), rreq->key(), rreq); + rreq->clear(); + return; + } if (rreq->op_code() == journal_type_t::HS_DATA_LINKED) { // Free the blks which is allocated already if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { @@ -1512,7 +1512,12 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx rreq->set_lsn(repl_lsn); // keep lentry in scope for the lyfe cycle of the rreq rreq->set_lentry(lentry); - rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size); + auto status = rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), + data_size, m_listener); + if (status != ReplServiceError::OK) { + RD_LOGE("Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); + } + // we load the log from log device, implies log flushed. We only flush log after data is written to data device. 
rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->add_state(repl_req_state_t::LOG_RECEIVED); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index e5e2cb1a5..4a6a92144 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -30,24 +30,18 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t rreq) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } - rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, - value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, - value.size); - + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, + value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, + header, key, value.size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); // If it is header only entry, directly write to the journal - if (rreq->has_linked_data()) { - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, value.size); - HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); - + if (rreq->has_linked_data() && !rreq->has_state(repl_req_state_t::DATA_WRITTEN)) { // Write the data data_service().async_write(value, rreq->local_blkid()).thenValue([this, rreq = std::move(rreq)](auto&& err) { HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); - } else { - write_journal(std::move(rreq)); - } + } else { write_journal(std::move(rreq)); } } void SoloReplDev::write_journal(repl_req_ptr_t rreq) { diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 
19a346f5a..2f7ab9f1c 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -94,8 +94,8 @@ class TestReplicatedDB : public homestore::ReplDevListener { struct journal_header { uint64_t data_size; uint64_t data_pattern; + uint64_t key_id; //put it in header to test duplication in alloc_local_blks }; - journal_header jheader; uint64_t key_id; sisl::sg_list write_sgs; @@ -108,6 +108,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { write_sgs.size = 0; read_sgs.size = 0; key_id = (uint64_t)rand() << 32 | rand(); + jheader.key_id = key_id; } ~test_req() { @@ -171,6 +172,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), *(r_cast< uint64_t const* >(key.cbytes()))); + g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper(error)); } AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { @@ -316,7 +318,16 @@ class TestReplicatedDB : public homestore::ReplDevListener { void free_user_snp_ctx(void*& user_snp_ctx) override {} - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + ReplResult get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + auto jheader = r_cast(header.cbytes()); + Key k{.id_ = jheader->key_id}; + auto iter = inmem_db_.find(k); + if (iter != inmem_db_.end()) { + LOGDEBUG("data already exists in mem db, key={}", k.id_); + auto hints = blk_alloc_hints{}; + hints.committed_blk_id = iter->second.blkid_; + return hints; + } return blk_alloc_hints{}; } void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override { @@ -335,6 +346,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { auto req = intrusive< test_req >(new test_req()); 
req->jheader.data_size = data_size; req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + req->jheader.key_id = req->key_id; auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", @@ -591,6 +603,68 @@ class RaftReplDevTestBase : public testing::Test { written_entries_ += num_entries; if (wait_for_commit) { this->wait_for_all_commits(); } } + replica_id_t wait_and_get_leader_id() { + do { + auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else { + return leader_uuid; + } + } while (true); + } + + ReplServiceError write_with_id(uint64_t id, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return ReplServiceError::FAILED; + if (db == nullptr) { db = pick_one_db(); } + LOGINFO("Writing data {} since I am the leader my_uuid={}", id, + boost::uuids::to_string(g_helper->my_replica_id())); + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_num_tasks(1); + g_helper->runner().set_task([this, block_size, db, id]() { + static std::normal_distribution<> num_blks_gen{3.0, 1.0}; + auto data_size = std::max(1L, std::abs(std::lround(num_blks_gen(g_re)))) * block_size; + ASSERT_GT(data_size, 0); + LOGINFO("data_size larger than 0, go ahead, data_size= {}.", data_size); + static std::atomic s_uniq_num{0}; + auto req = intrusive(new TestReplicatedDB::test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + //overwrite the key_id with the id passed in + req->jheader.key_id = id; + req->key_id = id; + + LOGINFOMOD(replication, "[Replica={}] Db write key={} 
data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, block_size, req->jheader.data_pattern); + } + + db->repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + }); + + if (!wait_for_commit) { + return ReplServiceError::OK; + } + try { + g_helper->runner().execute().get(); + LOGDEBUG("write data task complete, id={}", id) + } catch (const ReplServiceError& e) { + LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), + id, enum_name(e)); + return e; + } + + written_entries_ += 1; + LOGINFO("wait_for_commit={}", written_entries_); + this->wait_for_all_commits(); + return ReplServiceError::OK; + } void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { this->run_on_leader(db, [this, db]() { diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 169fc7f8a..51ca8e470 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -15,6 +15,54 @@ #include "test_common/raft_repl_test_base.hpp" class RaftReplDevTest : public RaftReplDevTestBase {}; +TEST_F(RaftReplDevTest, Write_Duplicated_Data) { + uint64_t total_writes = 1; + g_helper->runner().qdepth_ = total_writes; + g_helper->runner().total_tasks_ = total_writes; + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + auto leader_uuid = wait_and_get_leader_id(); + + uint64_t id; + TestReplicatedDB::Key stored_key; + TestReplicatedDB::Value stored_val; + if (leader_uuid == g_helper->my_replica_id()) { + id = (uint64_t)rand() << 32 | rand(); + LOGINFO("going to write data with id={}", id); + this->write_with_id(id, true /* wait_for_commit */); + stored_key = dbs_[0]->inmem_db_.cbegin()->first; + ASSERT_EQ(id, stored_key.id_); + } 
else { + LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing", + boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); + } + wait_for_commits(total_writes); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + /* test duplication + if duplication found in leader proposal, reject it; + if duplication found in the followers, skip it. + */ + //1. write the same data again on leader, should fail + if (leader_uuid == g_helper->my_replica_id()) { + auto err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::DATA_DUPLICATED, err); + + //2. delete it from the db to simulate duplication in followers(skip the duplication check in leader side) + dbs_[0]->inmem_db_.erase(stored_key); + LOGINFO("data with id={} has been deleted from db", id); + err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::OK, err); + } + if (leader_uuid != g_helper->my_replica_id()) { + wait_for_commits(total_writes + 1); + ASSERT_EQ(dbs_[0]->inmem_db_.size(), total_writes); + } + + g_helper->sync_for_cleanup_start(); +} TEST_F(RaftReplDevTest, Write_Restart_Write) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); From b2ea924e5f8f85791256bc322c406776a28910ba Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 19 Dec 2024 15:51:28 +0800 Subject: [PATCH 043/130] Only call cp_flush for those consumer paticipated in this cp. If a consumer registered after a cp goes to flushing state, the on_switchover_cp cb will not be called for this consumer. In this CP, the ctx for this consumer is nullptr as the consumer never participant in the cp. Previous code calling cp_flush for every consumer, leaving the duty of properly handle the nullptr returned by cp->context(svc_id) to consumer. However, none of the existing consumer handled the case. 
As a result, we hit an occurrence where Index generated a CP on its own, but before the cp was fully flushed, another consumer registered and was called into cp_flush(). The replication service does not properly handle the nullptr, as shown below: `get_repl_dev_ctx` was called with a null `this` pointer, which is dangerous because invalid memory gets accessed. This is a breaking change for consumers like HO, so bump up the version. HomeObject participates in the CP as CLIENT; the current implementation of HO always returns nullptr for `on_switchover_cp`, which will result in the CLIENT being excluded from cp_flush after this commit is merged. callstack: ``` homestore::ReplSvcCPContext::get_repl_dev_ctx (this=0x0, dev=0x56010ab52b00) at /home/ubuntu/HomeStore/src/lib/replication/service/raft_repl_service.cpp:521 0x0000560106d58f1e in homestore::RaftReplServiceCPHandler::cp_flush (this=, cp=0x56010a467940) at /home/ubuntu/HomeStore/src/lib/replication/service/raft_repl_service.cpp:549 ``` code: ``` auto cp_ctx = s_cast< ReplSvcCPContext* >(cp->context(cp_consumer_t::REPLICATION_SVC)); ... 
auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get()); ``` Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/lib/checkpoint/cp_mgr.cpp | 3 ++- src/lib/replication/service/generic_repl_svc.cpp | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 3ccfac994..368a4dee6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.0" + version = "6.6.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 62a28596c..c14015c06 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -235,7 +235,8 @@ void CPManager::cp_start_flush(CP* cp) { for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { if (svcid == (size_t)cp_consumer_t::REPLICATION_SVC) { continue; } auto& consumer = m_cp_cb_table[svcid]; - if (consumer) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } + bool participated = (cp->m_contexts[svcid] != nullptr); + if (consumer && participated) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 9aa2c044d..f5671cb16 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -152,7 +152,9 @@ AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const rep return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + return std::make_unique< CPContext >(new_cp); +} folly::Future< bool > 
SoloReplServiceCPHandler::cp_flush(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { From e0fedf4c1f4663a7a308bad68f9cff26805584c2 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 23 Dec 2024 18:03:01 -0800 Subject: [PATCH 044/130] Fix Index recovery path for split(put) (#609) --- conanfile.py | 2 +- .../index/inplace_btree/inplace_btree_store.h | 49 ++++-- src/lib/index/inplace_btree/wb_cache.cpp | 157 ++++++++++++++---- src/lib/index/inplace_btree/wb_cache.hpp | 2 + src/tests/test_index_crash_recovery.cpp | 93 ++++++----- src/tests/test_scripts/index_test.py | 10 +- 6 files changed, 222 insertions(+), 91 deletions(-) diff --git a/conanfile.py b/conanfile.py index 368a4dee6..9e861d23b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.1" + version = "6.6.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/index/inplace_btree/inplace_btree_store.h b/src/lib/index/inplace_btree/inplace_btree_store.h index befede6da..2fe2e7c7f 100644 --- a/src/lib/index/inplace_btree/inplace_btree_store.h +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -102,7 +102,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void destroy() override { auto cpg = cp_mgr().cp_guard(); - Btree::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); + Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); } @@ -137,11 +137,40 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return ret; } + void repair_root_node(IndexBufferPtr const& idx_buf) override { + LOGTRACEMOD(wbcache, "check if this was the previous root node {} for buf {} ", m_sb->root_node, + idx_buf->to_string()); + if (m_sb->root_node == idx_buf->blkid().to_integer()) { + // This is the root node, we need to update the root node in 
superblk + LOGTRACEMOD(wbcache, "{} is old root so we need to update the meta node ", idx_buf->to_string()); + BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto edge_id = n->next_bnode(); + + BT_DBG_ASSERT(!n->has_valid_edge(), + "root {} already has a valid edge {}, so we should have found the new root node", + n->to_string(), n->get_edge_value().bnode_id()); + n->set_next_bnode(empty_bnodeid); + n->set_edge_value(BtreeLinkInfo{edge_id, 0}); + LOGTRACEMOD(wbcache, "change root node {}: edge updated to {} and invalidate the next node! ", n->node_id(), + edge_id); + auto cpg = cp_mgr().cp_guard(); + write_node_impl(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + + } else { + LOGTRACEMOD(wbcache, "This is not the root node, so we can ignore this repair call for buf {}", + idx_buf->to_string()); + } + } + void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { // We cannot repair the meta buf on its own, we need to repair the root node which modifies the // meta_buf. It is ok to ignore this call, because repair will be done from root before meta_buf is // attempted to repair, which would have updated the meta_buf already. 
+ LOGTRACEMOD(wbcache, "Ignoring repair on meta buf {} root id {} ", idx_buf->to_string(), + this->root_node_id()); return; } BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, @@ -157,13 +186,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // Only for interior nodes we need to repair its links if (!bn->is_leaf()) { LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); - repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); + repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); } if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the // meta_buf with new root as well - on_root_changed(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); + LOGTRACEMOD(wbcache, "root change for after repairing {}\n\n", idx_buf->to_string()); + on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); } } @@ -250,10 +280,11 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } - btree_status_t - on_root_changed(BtreeNodePtr const &new_root, void *context) override { + btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ // return btree_status_t::success;} + LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, + new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); @@ -263,7 +294,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf; - wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast(context)); + 
wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context)); return btree_status_t::success; } @@ -280,7 +311,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Get all original child ids as a support to check if we are beyond the last child node - std::set orig_child_ids; + std::set< bnodeid_t > orig_child_ids; for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { BtreeLinkInfo link_info; parent_node->get_nth_value(i, &link_info, true); @@ -414,9 +445,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } while (true); - if (child_node) { - this->unlock_node(child_node, locktype_t::READ); - } + if (child_node) { this->unlock_node(child_node, locktype_t::READ); } if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { // We shouldn't have an empty interior node in the tree, let's delete it. diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 9ba839edc..3a8e7b00c 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -418,11 +418,11 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { } buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); - m_vdev->free_blk(buf->m_blkid, s_cast(cp_ctx)); + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); } //////////////////// Recovery Related section ///////////////////////////////// -void IndexWBCache::load_buf(IndexBufferPtr const &buf) { +void IndexWBCache::load_buf(IndexBufferPtr const& buf) { if (buf->m_bytes == nullptr) { buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); @@ -430,6 +430,78 @@ void IndexWBCache::load_buf(IndexBufferPtr const &buf) { } } +struct DagNode { + IndexBufferPtr buffer; + std::vector< shared< DagNode > > children; +}; 
+ +using DagPtr = std::shared_ptr< DagNode >; +using DagMap = std::map< IndexBufferPtr, DagPtr >; + +static DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) { + std::vector< IndexBufferPtr > bufs; + std::ranges::transform(bufmap, std::back_inserter(bufs), [](const auto& pair) { return pair.second; }); + + auto buildReverseMapping = [](const std::vector< IndexBufferPtr >& buffers) { + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > > parentToChildren; + for (const auto& buffer : buffers) { + if (buffer->m_up_buffer) { parentToChildren[buffer->m_up_buffer].push_back(buffer); } + } + return parentToChildren; + }; + + std::function< DagPtr(IndexBufferPtr, std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >&) > + buildDag; + buildDag = + [&buildDag](IndexBufferPtr buffer, + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >& parentToChildren) -> DagPtr { + auto dagNode = std::make_shared< DagNode >(); + dagNode->buffer = buffer; + if (parentToChildren.count(buffer)) { + for (const auto& child : parentToChildren[buffer]) { + dagNode->children.push_back(buildDag(child, parentToChildren)); + } + } + return dagNode; + }; + + auto generateDagMap = [&](const std::vector< IndexBufferPtr >& buffers) { + DagMap dagMap; + auto parentToChildren = buildReverseMapping(buffers); + for (const auto& buffer : buffers) { + if (!buffer->m_up_buffer) { // This is a root buffer + auto dagRoot = buildDag(buffer, parentToChildren); + dagMap[buffer] = dagRoot; + } + } + return dagMap; + }; + + return generateDagMap(bufs); +} + +static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) { + std::string str{fmt::format("#_of_dags={}\n", dags.size())}; + int cnt = 1; + for (const auto& [_, dag] : dags) { + std::vector< std::tuple< std::shared_ptr< DagNode >, int, int > > stack; + stack.emplace_back(dag, 0, cnt++); + while (!stack.empty()) { + auto [node, level, index] = stack.back(); + stack.pop_back(); + auto 
snew = node->buffer->m_created_cp_id == cp_id ? "NEW" : ""; + auto sfree = node->buffer->m_node_freed ? "FREED" : ""; + fmt::format_to(std::back_inserter(str), "{}{}-{} {} {}\n", std::string(level * 4, ' '), index, + node->buffer->to_string(), snew, sfree); + int c = node->children.size(); + for (const auto& d : node->children) { + stack.emplace_back(d, level + 1, c--); + } + } + } + return str; +} + void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. if ((sb.bytes() == nullptr) || (sb.size() == 0)) { @@ -450,9 +522,9 @@ void IndexWBCache::recover(sisl::byte_view sb) { #ifdef _PRERELEASE auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector const &pending_bufs) { + std::vector< IndexBufferPtr > const& pending_bufs) { std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const &[_, buf]: bufs) { + for (auto const& [_, buf] : bufs) { load_buf(buf); fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } @@ -460,7 +532,7 @@ void IndexWBCache::recover(sisl::byte_view sb) { // list of new_bufs if (!pending_bufs.empty()) { fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size()); - for (auto const &buf: pending_bufs) { + for (auto const& buf : pending_bufs) { fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } } @@ -469,6 +541,8 @@ void IndexWBCache::recover(sisl::byte_view sb) { std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); + auto dags = generate_dag_buffers(bufs); + LOGTRACEMOD(wbcache, "Before recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); #endif // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one @@ -482,15 +556,15 @@ void IndexWBCache::recover(sisl::byte_view sb) { // the same blkid 
which could clash with the blkid next in the buf list. // // On the second pass, we only take part of the parents/siblings and then repair them, if needed. - std::vector pending_bufs; - std::vector deleted_bufs; - for (auto const &[_, buf]: bufs) { + std::vector< IndexBufferPtr > pending_bufs; + std::vector< IndexBufferPtr > deleted_bufs; + for (auto const& [_, buf] : bufs) { if (buf->m_node_freed) { // Freed node load_buf(buf); if (was_node_committed(buf)) { // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link - r_cast(buf->m_bytes)->node_deleted = true; + r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true; write_buf(nullptr, buf, icp_ctx); deleted_bufs.push_back(buf); pending_bufs.push_back(buf->m_up_buffer); @@ -511,9 +585,13 @@ void IndexWBCache::recover(sisl::byte_view sb) { m_vdev->commit_blk(buf->m_blkid); pending_bufs.push_back(buf->m_up_buffer); } else { - // Just ignore it + // Up buffer is not committed, we need to repair it first buf->m_up_buffer->remove_down_buffer(buf); - buf->m_up_buffer = nullptr; + // buf->m_up_buffer = nullptr; + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + update_up_buffer_counters(buf->m_up_buffer); + } } } } @@ -522,25 +600,44 @@ void IndexWBCache::recover(sisl::byte_view sb) { LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", pending_bufs.size(), bufs.size(), icp_ctx->id()); LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); + LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); #endif - for (auto const &buf: pending_bufs) { + for (auto const& buf : pending_bufs) { recover_buf(buf); - if (buf->m_bytes != nullptr && r_cast(buf->m_bytes)->node_deleted) { + if (buf->m_bytes != nullptr && r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) 
{ // This buffer was marked as deleted during repair, so we also need to free it deleted_bufs.push_back(buf); } } - for (auto const &buf: deleted_bufs) { - m_vdev->free_blk(buf->m_blkid, s_cast(icp_ctx)); + for (auto const& buf : deleted_bufs) { + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } m_in_recovery = false; m_vdev->recovery_completed(); } -void IndexWBCache::recover_buf(IndexBufferPtr const &buf) { +// if buf->m_wait_for_down_buffers.testz() is true (which means that it has no dependency on any other buffer) then we +// can decrement the wait_for_down_buffers of its up buffer. If the up buffer has up buffer, then we need to decrement +// its wait_for_down_buffers. If the up buffer of up buffer has wait_for_down_buffers as 0, then we need to decrement +// its wait_for_down_buffers. This process continues until we reach the root buffer. If the root buffer has +// wait_for_down_buffers as 0, then we need to decrement its wait_for_down_buffers. +void IndexWBCache::update_up_buffer_counters(IndexBufferPtr const& buf) { + if (buf == nullptr || !buf->m_wait_for_down_buffers.testz() || buf->m_up_buffer == nullptr) { + LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers"); + return; + } + auto grand_buf = buf->m_up_buffer; + grand_buf->remove_down_buffer(buf); + LOGINFOMOD(wbcache, + "Decrementing wait_for_down_buffers for buffer {} due to zero dependency of child {}, Keep going up", + grand_buf->to_string(), buf->to_string()); + update_up_buffer_counters(grand_buf); +} + +void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { if (!buf->m_wait_for_down_buffers.decrement_testz()) { // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer return; @@ -555,6 +652,12 @@ void IndexWBCache::recover_buf(IndexBufferPtr const &buf) { } else { LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed no need to repair that", buf->to_string()); + if (buf->m_up_buffer && buf->m_up_buffer->is_meta_buf()) 
{ + // Our up buffer is a meta buffer, which means old root is dirtied and may need no repair but possible of + // new root on upper level so needs to be retore the edge + LOGTRACEMOD(wbcache, "check root change for without repairing {}", buf->to_string()); + index_service().update_root(buf->m_index_ordinal, buf); + } } if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); } @@ -654,10 +757,8 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const if (buf->is_meta_buf()) { LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); - auto const &sb = r_cast(buf.get())->m_sb; - if (!sb.is_empty()) { - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); - } + auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; + if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), @@ -665,15 +766,13 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const process_write_completion(cp_ctx, buf); } else { LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); - m_vdev->async_write(r_cast(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) - .thenValue([buf, cp_ctx](auto) { - try { - auto &pthis = s_cast(wb_cache()); - pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error &e) { - LOGERROR("Failed to access write-back cache: {}", e.what()); - } - }); + m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) + .thenValue([buf, cp_ctx](auto) { + try { + auto& pthis = s_cast< IndexWBCache& >(wb_cache()); + pthis.process_write_completion(cp_ctx, buf); + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back 
cache: {}", e.what()); } + }); if (!part_of_batch) { m_vdev->submit_batch(); } } diff --git a/src/lib/index/inplace_btree/wb_cache.hpp b/src/lib/index/inplace_btree/wb_cache.hpp index 25a4c8201..7d10d7f54 100644 --- a/src/lib/index/inplace_btree/wb_cache.hpp +++ b/src/lib/index/inplace_btree/wb_cache.hpp @@ -41,6 +41,7 @@ class IndexWBCache : public IndexWBCacheBase { std::mutex m_flush_mtx; void* m_meta_blk; bool m_in_recovery{false}; + public: IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size); @@ -78,5 +79,6 @@ class IndexWBCache : public IndexWBCacheBase { void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); void load_buf(IndexBufferPtr const& buf); + void update_up_buffer_counters(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index cac120a93..2599b5306 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -36,29 +36,29 @@ SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup SISL_OPTION_GROUP( test_index_crash_recovery, (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value()->default_value("500"), "number"), + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value()->default_value("360000"), "seconds"), + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (num_rounds, "", "num_rounds", "number of rounds to test with", - ::cxxopts::value()->default_value("100"), 
"number"), + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", - ::cxxopts::value()->default_value("40"), "number"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value()->default_value("20"), ""), - (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", - ::cxxopts::value()->default_value("6"), ""), + ::cxxopts::value< uint32_t >()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("20"), + ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("6"), + ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value()->default_value("1000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), + ::cxxopts::value< bool >()->default_value("1"), ""), (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -248,7 +248,7 @@ struct 
IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); - m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ -276,7 +276,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); - this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); LOGINFO("Node size {}, max_keys_in_node {}, min_keys_in_node {}", this->m_cfg.node_size(), this->m_cfg.m_max_keys_in_node, this->m_cfg.m_min_keys_in_node); auto uuid = boost::uuids::random_generator()(); @@ -337,7 +337,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); + // LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -345,13 +345,14 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT // this->visualize_keys(recovered_tree_filename); // LOGINFO(" tree after recovered stored in {}", recovered_tree_filename); - std::string dif_str = "KEY \tADDITION\n"; - for (const auto& [k, addition] : diff) { - dif_str += fmt::format(" {} \t{}\n", k.key(), addition); + std::string dif_str 
= "Keys["; + for (const auto& [k, _] : diff) { + dif_str += fmt::format("{} ", k.key()); } + dif_str += "]"; LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); - for (const auto &[k, addition]: diff) { + for (const auto& [k, addition] : diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); if (addition) { @@ -400,15 +401,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void crash_and_recover(uint32_t s_key, uint32_t e_key) { - this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); trigger_cp(false); this->wait_for_crash_recovery(); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); - this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); - this->print_keys("Post reapply, btree structure: "); + // this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); @@ -419,24 +420,28 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::set< uint64_t > new_keys; std::transform(operations.begin(), operations.end(), std::inserter(new_keys, new_keys.end()), [](const Operation& operation) { return operation.first; }); - uint32_t count = 1; + uint32_t count = 0; this->m_shadow_map.foreach ([this, new_keys, &count](K key, V value) { // discard the new keys to check if (new_keys.find(key.key()) != new_keys.end()) { return; } + count++; auto copy_key = std::make_unique< K >(); *copy_key = key; auto out_v = std::make_unique< V >(); auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; req.enable_route_tracing(); const auto ret = 
this->m_bt->get(req); + if (ret != btree_status_t::success) { + this->print_keys(fmt::format("Sanity check: key {}", key.key())); + this->dump_to_file("sanity_fail.txt"); + } ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; }); LOGINFO("Sanity check passed for {} keys!", count); - } void crash_and_recover(OperationList& operations, std::string filename = "") { - this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", this->m_shadow_map.size(), tree_key_count(), operations.size()); @@ -455,7 +460,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); } - this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. 
Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -467,7 +472,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); } - this->print_keys("Post reapply, btree structure: "); + // this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -628,7 +633,7 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { test_common::HSTestHelper::trigger_cp(true); this->get_all(); this->m_shadow_map.save(this->m_shadow_filename); - this->print_keys("reapply: after preload"); + // this->print_keys("reapply: after preload"); this->visualize_keys("tree_after_preload.dot"); for (uint32_t round = 1; @@ -715,28 +720,27 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, this->tree_key_count() * 100.0 / num_entries); } - this->print_keys(fmt::format("reapply: after round {}", round)); + // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; } } // Basic reverse and forward order remove with different flip points TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { - vector flip_points = { - "crash_flush_on_merge_at_parent", - "crash_flush_on_merge_at_left_child", + vector< std::string > flip_points = { + "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", // "crash_flush_on_freed_child", }; for (size_t i = 0; i < flip_points.size(); ++i) { this->reset_btree(); - auto &flip_point = flip_points[i]; + auto& flip_point = flip_points[i]; LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); // Populate some keys [1,num_entries) and trigger cp to persist - LOGINFO("Step {}-1: Populate some keys and flush", i+1); - auto const num_entries = 
SISL_OPTIONS["num_entries"].as(); + LOGINFO("Step {}-1: Populate some keys and flush", i + 1); + auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); for (auto k = 0u; k < num_entries; ++k) { this->put(k, btree_put_type::INSERT, true /* expect_success */); } @@ -747,7 +751,8 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { // Split keys into batches and remove the last one in reverse order LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); - int batch_num = 4; { + int batch_num = 4; + { int n = batch_num; auto r = num_entries * n / batch_num - 1; auto l = num_entries * (n - 1) / batch_num; @@ -758,8 +763,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); this->set_basic_flip(flip_point); - for (auto [k, _]: ops) { - LOGINFO("Removing key {}", k); + for (auto [k, _] : ops) { this->remove_one(k, true); } this->visualize_keys("tree_merge_before_first_crash.dot"); @@ -780,8 +784,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); this->set_basic_flip(flip_point); - for (auto [k, _]: ops) { - LOGINFO("Removing key {}", k); + for (auto [k, _] : ops) { this->remove_one(k, true); } this->visualize_keys("tree_merge_before_second_crash.dot"); @@ -802,8 +805,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); this->set_basic_flip(flip_point); - for (auto [k, _]: ops) { - LOGINFO("Removing key {}", k); + for (auto [k, _] : ops) { this->remove_one(k, true); } this->visualize_keys("tree_merge_before_third_crash.dot"); @@ -827,9 +829,8 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { // vector flips = { // "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", // }; -// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 
/*start_range*/, num_entries - 1 /*end_range*/); -// OperationList operations; -// for (size_t i = 0; i < flips.size(); ++i) { +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 +// /*end_range*/); OperationList operations; for (size_t i = 0; i < flips.size(); ++i) { // this->reset_btree(); // LOGINFO("Step {}-1: Init btree", i + 1); // for (auto k = 0u; k < num_entries; ++k) { diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index dd2f8f010..d4734ac82 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -51,10 +51,10 @@ def parse_arguments(): parser.add_argument('--dev_list', help='Device list', default='') parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) - parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=10) parser.add_argument('--min_keys_in_node', help='Minimum num of keys in btree nodes', type=int, default=2) - parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) - parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60) + parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=1000) + parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=100) # Parse the known arguments and ignore any unknown arguments args, unknown = parser.parse_known_args() @@ -94,10 +94,10 @@ def long_running_clean_shutdown(options, type=0): def long_running_crash_put(options): print("Long running crash put started") - options['num_entries'] = 131072 # 128K + 
options['num_entries'] = 1310720 # 1280K options['init_device'] = True options['run_time'] = 14400 # 4 hours - options['preload_size'] = 100 + options['preload_size'] = 1024 print(f"options: {options}") run_crash_test(options) print("Long running crash put completed") From 6b43edba4bdc39e450bc3eda4908d661fc99e894 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Mon, 23 Dec 2024 14:54:28 +0800 Subject: [PATCH 045/130] reset rreq time every time we reuse a rreq --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 9e861d23b..7bc2b2928 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.2" + version = "6.6.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index a39d6035b..c408f82d1 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -489,8 +489,9 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ auto rreq = it->second; if (!happened) { - // We already have the entry in the map, check if we are already allocated the blk by previous caller, in - // that case we need to return the req. + // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc during use. + rreq->set_created_time(); + // Check if we are already allocated the blk by previous caller, in that case we need to return the req. 
if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { // Do validation if we have the correct mapping // RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}", From 71293589d11c95a5810226d6a3e9ab908910283a Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 25 Dec 2024 06:32:34 +0800 Subject: [PATCH 046/130] fix HomeRaftLogStore::last_entry (#619) 1 If no log entry exists, return a dummy constant entry with value set to null and term set to zero. 2 m_last_durable_lsn is initialized as -1 and is only updated in end_of_batch; we should set it to the tail_lsn of the log store after all the log entries are replayed --- src/include/homestore/logstore/log_store.hpp | 1 + .../log_store/home_raft_log_store.cpp | 7 ++++-- .../replication/repl_dev/raft_repl_dev.cpp | 24 +++++++++++-------- src/lib/replication/repl_dev/raft_repl_dev.h | 5 ++-- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index a2091f114..18a806545 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -231,6 +231,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { bool rollback(logstore_seq_num_t to_lsn); auto start_lsn() const { return m_start_lsn.load(std::memory_order_acquire); } + auto tail_lsn() const { return m_tail_lsn.load(std::memory_order_acquire); } nlohmann::json dump_log_store(const log_dump_req& dump_req = log_dump_req()); diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index dfc40662a..b0a41d3e1 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -145,8 +145,11 @@ nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::last_entry() const { auto log_bytes = m_log_store->read_sync(max_seq); nle = to_nuraft_log_entry(log_bytes); }
catch (const std::exception& e) { - REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}", max_seq); - throw e; + // all the log entries are truncated, so we should return a dummy log entry. + REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}, {}", max_seq, e.what()); + // according to the contract, we should return a dummy log entry if the index is out of range. + // https://github.com/eBay/NuRaft/blob/50e2f949503081262cb21923e633eaa8dacad8fa/include/libnuraft/log_store.hxx#L56 + nle = m_dummy_log_entry; } return nle; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index c408f82d1..7f2e07f3b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -39,7 +39,10 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_data_journal = std::make_shared< ReplLogStore >( *this, *m_state_machine, m_rd_sb->logdev_id, m_rd_sb->logstore_id, [this](logstore_seq_num_t lsn, log_buffer buf, void* key) { on_log_found(lsn, buf, key); }, - [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { m_log_store_replay_done = true; }); + [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { + m_log_store_replay_done = true; + set_log_store_last_durable_lsn(hs->tail_lsn()); + }); m_next_dsn = m_rd_sb->last_applied_dsn + 1; m_commit_upto_lsn = m_rd_sb->durable_commit_lsn; m_last_flushed_commit_lsn = m_commit_upto_lsn; @@ -292,9 +295,10 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - auto status = rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, - header, key, data.size, m_listener); + auto status = rreq->init( + repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, + key, data.size, m_listener); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); @@ -628,9 +632,7 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre if (!all_futs_ready && timeout_rreqs != nullptr) { timeout_rreqs->clear(); for (size_t i{0}; i < futs.size(); ++i) { - if (!futs[i].isReady()) { - timeout_rreqs->emplace_back(only_wait_reqs[i]); - } + if (!futs[i].isReady()) { timeout_rreqs->emplace_back(only_wait_reqs[i]); } } all_futs_ready = timeout_rreqs->empty(); } @@ -926,7 +928,7 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { } } - void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { +void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } // Remove the request from repl_key map. @@ -972,8 +974,8 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) // Ensure non-volatile lsn not exist because handle_error should not be called after append entries. 
auto exist_rreq = m_state_machine->lsn_to_req(rreq->lsn()); if (exist_rreq != nullptr && !exist_rreq->is_volatile()) { - HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", - rreq->lsn(), exist_rreq->to_string()); + HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", rreq->lsn(), + exist_rreq->to_string()); } if (err == ReplServiceError::DATA_DUPLICATED) { RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); @@ -1458,6 +1460,8 @@ void RaftReplDev::gc_repl_reqs() { } } +void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journal->set_last_durable_lsn(lsn); } + void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 28706f716..0521d1aac 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -313,9 +313,10 @@ class RaftReplDev : public ReplDev, */ void handle_error(repl_req_ptr_t const& rreq, ReplServiceError err); - bool wait_for_data_receive(std::vector < repl_req_ptr_t > const &rreqs, uint64_t timeout_ms, - std::vector < repl_req_ptr_t > *timeout_rreqs = nullptr); + bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs = nullptr); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); + void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum); From 9f17189ec6401875683144493eef2652c0418a57 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Mon, 30 Dec 2024 18:38:57 +0800 Subject: [PATCH 047/130] minor 
fix for homeobject's homestore_test --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 7bc2b2928..613d76f5c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.3" + version = "6.6.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0521d1aac..df2668abc 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -191,7 +191,7 @@ class RaftReplDev : public ReplDev, bool is_destroy_pending() const; bool is_destroyed() const; Clock::time_point destroyed_time() const { return m_destroyed_time; } - bool is_ready_for_traffic() const { + bool is_ready_for_traffic() const override { auto committed_lsn = m_commit_upto_lsn.load(); auto gate = m_traffic_ready_lsn.load(); bool ready = committed_lsn >= gate; From 5d4828c868185bb5a82f564fb3f9022512fc1217 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 31 Dec 2024 16:45:34 +0800 Subject: [PATCH 048/130] update last_commit_lsn in commit_config --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 14 ++++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 1 + .../replication/repl_dev/raft_state_machine.cpp | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 613d76f5c..1ccccbae9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.4" + version = "6.6.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 7f2e07f3b..15e01ea6e 100644 --- 
a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -959,6 +959,20 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { if (!rreq->is_proposer()) { rreq->clear(); } } +void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) { + // when reaching here, the new config has already been applied to the cluster. + // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. + + // keep this variable in case it is needed later + (void) new_conf; + auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); + RD_DBG_ASSERT_GT(lsn, prev_lsn, + "Out of order commit of lsns, it is not expected in RaftReplDev. cur_lsns={}, prev_lsns={}", + lsn, prev_lsn); + RD_DBG_ASSERT(m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn), + "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); +} + void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index df2668abc..f45ddc61c 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -207,6 +207,7 @@ class RaftReplDev : public ReplDev, //////////////// Methods needed for other Raft classes to access ///////////////// void use_config(json_superblk raft_config_sb); void handle_commit(repl_req_ptr_t rreq, bool recovery = false); + void handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf); void handle_rollback(repl_req_ptr_t rreq); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, diff --git 
a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index a91e947ac..5e98766df 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -228,6 +228,8 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt RD_LOG(INFO, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, m_rd.group_id_str()); #endif + + m_rd.handle_config_commit(s_cast< repl_lsn_t >(log_idx), new_conf); } void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { From ffe797ef762bfc2bf10aa86a20e4b56e5550f2cc Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 26 Dec 2024 17:31:26 +0800 Subject: [PATCH 049/130] Remove log store truncation from resource mgr. Currently both resource_mgr and raft can call log store's truncate, but resource_mgr will not truncate logs whose lsn is less than the compact lsn. That means resource_mgr just re-truncates logs which will be / have been truncated in compact. But if resource_mgr and raft call truncate concurrently, a crash will happen. So this commit removes it.
--- conanfile.py | 2 +- src/lib/common/resource_mgr.cpp | 18 +++++----- .../log_store/home_raft_log_store.cpp | 4 +++ .../log_store/home_raft_log_store.h | 2 ++ .../replication/repl_dev/raft_repl_dev.cpp | 8 ++--- src/lib/replication/repl_dev/raft_repl_dev.h | 2 ++ src/tests/test_common/raft_repl_test_base.hpp | 2 +- src/tests/test_log_dev.cpp | 34 ++++++++++++++++++- src/tests/test_raft_repl_dev.cpp | 2 +- 9 files changed, 57 insertions(+), 17 deletions(-) diff --git a/conanfile.py b/conanfile.py index 1ccccbae9..aead37f4a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.5" + version = "6.6.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp index 8719089b9..173b8e0a1 100644 --- a/src/lib/common/resource_mgr.cpp +++ b/src/lib/common/resource_mgr.cpp @@ -48,14 +48,16 @@ void ResourceMgr::stop() { // void ResourceMgr::trigger_truncate() { if (hs()->has_repl_data_service()) { - // first make sure all repl dev's underlying raft log store make corresponding reservation during - // truncate -- set the safe truncate boundary for each raft log store; - hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { - // lock is already taken by repl service layer; - std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( - HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); - }); - + /* + * DO NOT NEED : raft will truncate logs. 
+ * // first make sure all repl dev's underlying raft log store make corresponding reservation during + * // truncate -- set the safe truncate boundary for each raft log store; + * hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { + * // lock is already taken by repl service layer; + * std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( + * HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); + * }); + */ // next do device truncate which go through all logdevs and truncate them; hs()->logstore_service().device_truncate(); } diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index b0a41d3e1..f4e492f29 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -47,6 +47,9 @@ static uint64_t extract_term(const log_buffer& log_bytes) { return (*r_cast< uint64_t const* >(raw_ptr)); } +#if 0 +// Since truncate_lsn can not accross compact_lsn passed down by raft server +// and compact will truncate logs upto compact_lsn, we don't need to re-truncate in this function now. 
void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn) { auto const last_lsn = last_index(); auto const start_lsn = start_index(); @@ -77,6 +80,7 @@ void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_ls m_log_store->truncate(truncate_lsn); } } +#endif HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore_id, log_found_cb_t const& log_found_cb, log_replay_done_cb_t const& log_replay_done_cb) : diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index 3c4c021ef..d2c0fd57b 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -204,6 +204,7 @@ class HomeRaftLogStore : public nuraft::log_store { */ ulong last_index() const; +#if 0 /** * Truncates the log store * @@ -212,6 +213,7 @@ class HomeRaftLogStore : public nuraft::log_store { * LSN; */ void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn); +#endif void wait_for_log_store_ready(); void set_last_durable_lsn(repl_lsn_t lsn); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 15e01ea6e..fb8317b02 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -966,11 +966,9 @@ void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config // keep this variable in case it is needed later (void) new_conf; auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); - RD_DBG_ASSERT_GT(lsn, prev_lsn, - "Out of order commit of lsns, it is not expected in RaftReplDev. 
cur_lsns={}, prev_lsns={}", - lsn, prev_lsn); - RD_DBG_ASSERT(m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn), - "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { + RD_LOGE("Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + } } void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index f45ddc61c..5e66e18f8 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -250,6 +250,7 @@ class RaftReplDev : public ReplDev, */ void on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done); +#if 0 /** * Truncates the replication log by providing a specified number of reserved entries. * @@ -258,6 +259,7 @@ class RaftReplDev : public ReplDev, void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } +#endif void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 2f7ab9f1c..11c6d6bc2 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -420,7 +420,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { void truncate(int num_reserved_entries) { auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - raft_repl_dev->truncate(num_reserved_entries); + // raft_repl_dev->truncate(num_reserved_entries); LOGINFO("Manually truncated"); } diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index cbe8ff760..af9173d9f 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -200,8 
+200,12 @@ class LogDevTest : public ::testing::Test { read_all_verify(log_store); } - void truncate_validate(std::shared_ptr< HomeLogStore > log_store) { + void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* last_lsn = nullptr) { auto upto = log_store->get_contiguous_completed_seq_num(-1); + if (last_lsn) { + ASSERT_EQ(upto, *last_lsn); + } + LOGINFO("truncate_validate upto {}", upto); log_store->truncate(upto); read_all_verify(log_store); @@ -304,6 +308,34 @@ TEST_F(LogDevTest, Rollback) { rollback_records_validate(log_store, 0 /* expected_count */); } +TEST_F(LogDevTest, ReTruncate) { + LOGINFO("Step 1: Create a single logstore to start re-truncate test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + LOGINFO("Step 2: Issue sequential inserts with q depth of 10"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Truncate all entries"); + logstore_seq_num_t ls_last_lsn = 499; + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 4: Truncate again"); + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 5: Read and verify all entries again"); + read_all_verify(log_store); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp 
index 51ca8e470..c419e6b1d 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -409,7 +409,7 @@ TEST_F(RaftReplDevTest, BaselineTest) { // Leader does manual snapshot and truncate LOGINFO("Leader create snapshot and truncate"); this->create_snapshot(); - this->truncate(0); + // this->truncate(0); } } From 824392f91e953584ea6bdad40a06ee9bb81cd70d Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 6 Jan 2025 11:06:02 +0800 Subject: [PATCH 050/130] Support async snapshot io config --- conanfile.py | 2 +- src/lib/common/homestore_config.fbs | 4 ++++ src/lib/replication/service/raft_repl_service.cpp | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index aead37f4a..ea0486930 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.6" + version = "6.6.7" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index a335281c5..81b67b5b9 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -288,6 +288,10 @@ table Consensus { // Log difference from leader's point of view, to determine if the // follower is laggy and if so, leader will stop pushing data until it drops under this threshold. 
laggy_threshold: int64 = 2000; + + // Reading snapshot objects will be done by a background thread asynchronously + // instead of synchronous read by Raft worker threads + use_bg_thread_for_snapshot_io_: bool = true; } table HomeStoreSettings { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 2ed7a3bc1..23ff2db89 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -110,6 +110,7 @@ void RaftReplService::start() { // There is no callback available for handling and localizing the log entries within the pack, which could // result in data corruption. r_params.use_new_joiner_type_ = true; + r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io_); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); From 9aa07e91f27f67c080433c759d9098e6ff8174af Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 9 Jan 2025 10:11:36 +0800 Subject: [PATCH 051/130] Avoid replaying the last flushed log entry --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index ea0486930..ee3c44e6f 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.7" + version = "6.6.8" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index fb8317b02..3fd68ee24 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1477,7 +1477,7 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ void RaftReplDev::on_log_found(logstore_seq_num_t 
lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn - if (repl_lsn < m_rd_sb->checkpoint_lsn) { return; } + if (repl_lsn <= m_rd_sb->checkpoint_lsn) { return; } // 1. Get the log entry and prepare rreq auto const lentry = to_nuraft_log_entry(buf); @@ -1489,8 +1489,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, journal_entry=[{}] ", - jentry->server_id, lentry->get_term(), jentry->to_string()); + RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", + jentry->server_id, lentry->get_term(), repl_lsn, jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; From ee9d76556fd360c774d56f65440adc9d66f5c8ee Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 2 Jan 2025 17:52:53 +0800 Subject: [PATCH 052/130] skip appending dummy logs to log dev 1. directly update indices when there are holes in baseline resync 2. update definition of logdev_key::out_of_bound_ld_key and use it to imply log dev can truncate freely. 3. 
remove is_active check in HomeLogStore::flush to unblock flush when there are holes in m_records --- conanfile.py | 2 +- src/include/homestore/logstore/log_store.hpp | 10 + .../homestore/logstore/log_store_internal.hpp | 3 +- src/lib/logstore/log_dev.cpp | 18 +- src/lib/logstore/log_dev.hpp | 5 +- src/lib/logstore/log_store.cpp | 33 +- .../log_store/home_raft_log_store.cpp | 10 +- src/tests/test_log_dev.cpp | 288 +++++++++++++++++- 8 files changed, 338 insertions(+), 31 deletions(-) diff --git a/conanfile.py b/conanfile.py index ee3c44e6f..32cbdc7d6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.8" + version = "6.6.10" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index 18a806545..91735be79 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -173,6 +173,15 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { logdev_key get_trunc_ld_key() const { return m_trunc_ld_key; } + /** + * @brief Get the truncation information for this log store. It is called during log device truncation + * + * @return tuple of (start_lsn, trunc_ld_key, tail_lsn) If the log store is empty, it will return + * an out_of_bound_ld_key as trunc_ld_key. + * + * @note ensure that no new logs are flushed between calling this function and completing the truncation, + * as this could result in an inaccurate out_of_bound_ld_key. 
+ * */ std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > truncate_info() const; sisl::StreamTracker< logstore_record >& log_records() { return m_records; } @@ -232,6 +241,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { auto start_lsn() const { return m_start_lsn.load(std::memory_order_acquire); } auto tail_lsn() const { return m_tail_lsn.load(std::memory_order_acquire); } + auto next_lsn() const { return m_next_lsn.load(std::memory_order_acquire); } nlohmann::json dump_log_store(const log_dump_req& dump_req = log_dump_req()); diff --git a/src/include/homestore/logstore/log_store_internal.hpp b/src/include/homestore/logstore/log_store_internal.hpp index 551f15ea8..9b7019cfb 100644 --- a/src/include/homestore/logstore/log_store_internal.hpp +++ b/src/include/homestore/logstore/log_store_internal.hpp @@ -85,7 +85,8 @@ struct logdev_key { std::string to_string() const { return fmt::format("Logid={} devoffset={}", idx, dev_offset); } static const logdev_key& out_of_bound_ld_key() { - static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), 0}; + static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), + std::numeric_limits< off_t >::max()}; return s_out_of_bound_ld_key; } }; diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 313622895..be5cd976a 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -133,6 +133,7 @@ void LogDev::stop() { m_log_idx.store(0); m_pending_flush_size.store(0); m_last_flush_idx = -1; + m_last_flush_ld_key = logdev_key{0, 0}; m_last_truncate_idx = -1; m_last_crc = INVALID_CRC32_VALUE; @@ -499,6 +500,7 @@ void LogDev::on_flush_completion(LogGroup* lg) { free_log_group(lg); m_log_records->truncate(upto_indx); m_last_flush_idx = upto_indx; + m_last_flush_ld_key = logdev_key{from_indx, dev_offset}; // since we support out-of-order lsn write, so no need to guarantee the order of logstore 
write completion for (auto const& [idx, req] : req_map) { @@ -528,20 +530,18 @@ uint64_t LogDev::truncate() { auto lstore = store.log_store; if (lstore == nullptr) { continue; } auto const [trunc_lsn, trunc_ld_key, tail_lsn] = lstore->truncate_info(); - if (trunc_lsn == tail_lsn) { - THIS_LOGDEV_LOG(DEBUG, "Store_id={} didn't have any writes since last truncation, skipping ", store_id); - m_logdev_meta.remove_all_rollback_records(store_id, m_stopped /* persist_now */); - continue; - } - HS_DBG_ASSERT_GE(trunc_ld_key.idx, m_last_truncate_idx, "Trying to truncate logid which is already truncated"); m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), m_stopped /* persist_now */); - // We found a new minimum logdev_key that we can truncate to - if (trunc_ld_key.idx > 0 && trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + if (trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + } + + // All log stores are empty, we can truncate logs depends on the last flushed logdev_key + if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { + min_safe_ld_key = m_last_flush_ld_key; } // There are no writes or no truncation called for any of the store, so we can't truncate anything - if (min_safe_ld_key == logdev_key::out_of_bound_ld_key() || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; + if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx); diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 2875d7823..5a8fafc2c 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -795,8 +795,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; - logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset 
and truncated log idx - logid_t m_last_truncate_idx{std::numeric_limits< logid_t >::min()}; // logdev truncate up to this idx + logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx + logdev_key m_last_flush_ld_key{0,0}; // Left interval of the last flush, 0 indicates the very beginning of logdev + logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx crc32_t m_last_crc{INVALID_CRC32_VALUE}; // LogDev Info block related fields diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index e2ea0f333..1e3a1bea6 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -188,12 +188,27 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate #endif + // In normal write and compact path, upto_lsn is expected to be no larger than m_tail_lsn after the flush. + // So upto_lsn > m_tail_lsn is expected to exist only in baseline resync path. + // In baseline resync path, we truncate all entries up to upto_lsn, and update m_tail_lsn and m_next_lsn + // to make sure logstore's idx is always = raft's idx - 1. 
if (upto_lsn > m_tail_lsn) { THIS_LOGSTORE_LOG(WARN, - "Truncating issued on lsn={} which is greater than tail_lsn={}, truncating upto tail_lsn", + "Truncating issued on lsn={} which is greater than tail_lsn={}", upto_lsn, m_tail_lsn.load(std::memory_order_relaxed)); - m_trunc_ld_key = m_records.at(m_tail_lsn).m_trunc_key; - upto_lsn = m_tail_lsn; + // update m_tail_lsn if it is less than upto_lsn + auto current_tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + while (current_tail_lsn < upto_lsn && + !m_tail_lsn.compare_exchange_weak(current_tail_lsn, upto_lsn, std::memory_order_relaxed)) {} + + // update m_next_lsn if it is less than upto_lsn + 1 + auto current_next_lsn = m_next_lsn.load(std::memory_order_relaxed); + while (current_next_lsn < upto_lsn + 1 && + !m_next_lsn.compare_exchange_weak(current_next_lsn, upto_lsn + 1, std::memory_order_relaxed)) {} + + // insert an empty record to make sure m_records has enough size to truncate + logdev_key empty_ld_key; + m_records.create_and_complete(upto_lsn, logstore_record(empty_ld_key, empty_ld_key)); } else { m_trunc_ld_key = m_records.at(upto_lsn).m_trunc_key; THIS_LOGSTORE_LOG(TRACE, "Truncating logstore upto lsn={} , m_trunc_ld_key index {} offset {}", upto_lsn, @@ -206,7 +221,12 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::truncate_info() const { auto const trunc_lsn = m_start_lsn.load(std::memory_order_relaxed) - 1; - return std::make_tuple(trunc_lsn, m_trunc_ld_key, m_tail_lsn.load(std::memory_order_relaxed)); + auto const tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + + // If the store is empty, return out_of_bound_ld_key as trunc_ld_key, allowing the caller to truncate freely. + // Otherwise, return the actual trunc_ld_key. + return (trunc_lsn == tail_lsn) ? 
std::make_tuple(trunc_lsn, logdev_key::out_of_bound_ld_key(), tail_lsn) + : std::make_tuple(trunc_lsn, m_trunc_ld_key, tail_lsn); } void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { @@ -276,10 +296,7 @@ void HomeLogStore::flush(logstore_seq_num_t upto_lsn) { return; } - if (upto_lsn == invalid_lsn()) { upto_lsn = m_records.active_upto(); } - - // if we have flushed already, we are done, else issue a flush - if (m_records.status(upto_lsn).is_active) m_logdev->flush_under_guard(); + m_logdev->flush_under_guard(); } bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index f4e492f29..f89745107 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -359,12 +359,10 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { // release this assert if for some use case, we should tolorant this case; // for now, don't expect this case to happen. // RELEASE_ASSERT(false, "compact_lsn={} is beyond the current max_lsn={}", compact_lsn, cur_max_lsn); - REPL_STORE_LOG(DEBUG, "Adding dummy entries during compact from={} upto={}", cur_max_lsn + 1, - to_store_lsn(compact_lsn)); - // We need to fill the remaining entries with dummy data. - for (auto lsn{cur_max_lsn + 1}; lsn <= to_store_lsn(compact_lsn); ++lsn) { - append(m_dummy_log_entry); - } + + // if compact_lsn is beyond the current max_lsn, it indicates a hole from cur_max_lsn to compact_lsn. + // we directly compact and truncate up to compact_lsn assuming there are dummy logs. 
+ REPL_STORE_LOG(DEBUG, "Compact with log holes from {} to={}", cur_max_lsn + 1, to_store_lsn(compact_lsn)); } m_log_store->truncate(to_store_lsn(compact_lsn)); return true; diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index af9173d9f..c678aec43 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -200,10 +200,11 @@ class LogDevTest : public ::testing::Test { read_all_verify(log_store); } - void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* last_lsn = nullptr) { + void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* trunc_lsn = nullptr) { auto upto = log_store->get_contiguous_completed_seq_num(-1); - if (last_lsn) { - ASSERT_EQ(upto, *last_lsn); + if (trunc_lsn && *trunc_lsn != upto) { + LOGWARN("Truncate issued upto {} but real upto lsn in log store is {}", *trunc_lsn, upto); + upto = *trunc_lsn; } LOGINFO("truncate_validate upto {}", upto); @@ -216,6 +217,24 @@ class LogDevTest : public ::testing::Test { auto actual_count = log_store->get_logdev()->log_dev_meta().num_rollback_records(log_store->get_store_id()); ASSERT_EQ(actual_count, expected_count); } + + logid_t get_last_truncate_idx(logdev_id_t logdev_id) { + auto status = logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("last_truncate_log_idx")) { + return s_cast(status["last_truncate_log_idx"]); + } + LOGERROR("Failed to get last_truncate_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast(-1); + } + + logid_t get_current_log_idx(logdev_id_t logdev_id) { + auto status = logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("current_log_idx")) { + return s_cast(status["current_log_idx"]); + } + LOGERROR("Failed to get current_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast(-1); + } }; TEST_F(LogDevTest, WriteSyncThenRead) { @@ -313,7 +332,6 @@ TEST_F(LogDevTest, ReTruncate) { auto 
logdev_id = logstore_service().create_new_logdev(); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); - auto store_id = log_store->get_store_id(); LOGINFO("Step 2: Issue sequential inserts with q depth of 10"); logstore_seq_num_t cur_lsn = 0; @@ -336,6 +354,268 @@ TEST_F(LogDevTest, ReTruncate) { read_all_verify(log_store); } +TEST_F(LogDevTest, TruncateWithExceedingLSN) { + LOGINFO("Step 1: Create a single logstore to start truncate with exceeding LSN test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: Truncate all with exceeding lsn"); + trunc_lsn = 1999999; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), trunc_lsn); + ASSERT_EQ(log_store->next_lsn(), 2000000); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 7 Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 8: Append 500 entries"); + cur_lsn = log_store->next_lsn(); + kickstart_inserts(log_store, cur_lsn, 500); + ASSERT_EQ(log_store->next_lsn(), 2000500); + + 
LOGINFO("Step 9: Read and verify all entries"); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAfterRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate with overlapping LSN test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise< bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id); + logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: Restart and verify all entries"); + restart(); + read_all_verify(log_store); + auto const [last_trunc_lsn, trunc_ld_key, tail_lsn] = log_store->truncate_info(); + ASSERT_EQ(last_trunc_lsn, trunc_lsn); + ASSERT_EQ(trunc_ld_key.idx, 0); + ASSERT_EQ(tail_lsn, log_store->tail_lsn()); + + LOGINFO("Step 7: call log dev truncate again and read verify") + logstore_service().device_truncate(); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAcrossMultipleStores) { + LOGINFO("Step 1: Create 3 log stores 
to start truncate across multiple stores test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto store1 = logstore_service().create_new_log_store(logdev_id, false); + auto store2 = logstore_service().create_new_log_store(logdev_id, false); + auto store3 = logstore_service().create_new_log_store(logdev_id, false); + + + LOGINFO("Step 2: Insert 100 entries to store {}", store1->get_store_id()); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 100); + + LOGINFO("Step 3: Insert 200 entries to store {}", store2->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store2, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 300); + + LOGINFO("Step 4: Insert 200 entries to store {}", store3->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store3, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 500); + + LOGINFO("Step 5: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 0); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), -1); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to no truncate in log stores happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 6: Truncate 100 entries in store {}", store2->get_store_id()); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 7: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + 
read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 8: Truncate 500 entries in store {}", store3->get_store_id()); + trunc_lsn = 499; + truncate_validate(store3, &trunc_lsn); + + LOGINFO("Step 9: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 10: Truncate 100 entries in store {}", store1->get_store_id()); + trunc_lsn = 99; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 11: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate logs upto 199, as store2 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), 199); + + LOGINFO("Step 12: Truncate 300 entries in store {}",
store2->get_store_id()); + trunc_lsn = 299; + truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 13: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate all logs as all stores are empty + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 14: Insert 100 entries in store {}", store1->get_store_id()); + cur_lsn = 100; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 600); + + LOGINFO("Step 15: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 199); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should not truncate since no new truncate happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 16: Truncate 500 entries in store {}", store1->get_store_id()); + trunc_lsn = 499; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 17: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 500); + ASSERT_EQ(store1->tail_lsn(), 499); + ASSERT_EQ(store1->truncated_upto(), 499); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + 
read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // make sure new logs can truncate successfully when there are empty log stores + ASSERT_EQ(get_last_truncate_idx(logdev_id), 599); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; From 98bccc0a9643710d80130d922795e034252abf1c Mon Sep 17 00:00:00 2001 From: Hooper <62418134+Hooper9973@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:33:01 +0800 Subject: [PATCH 053/130] Adjust cp_io num_fiber to Prevent Deadlock (#630) Description: Resolved a potential deadlock issue with sync_io fibers. When multiple sync_io fibers are active, a fiber (e.g., fiber1) may acquire a thread-level mutex and perform synchronous I/O using io_uring. This causes fiber1 to call boost::fibers::promise::get_future(), blocking itself and allowing other fibers in the same thread to be scheduled. If another fiber (e.g., fiber2) is scheduled and attempts to acquire the same mutex, a deadlock occurs. By adjusting the num_fiber in cp_io, we prevent this deadlock scenario. 
--- conanfile.py | 2 +- src/lib/checkpoint/cp_mgr.cpp | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 32cbdc7d6..60fbade11 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.10" + version = "6.6.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index c14015c06..98f0f7cbb 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -309,8 +309,12 @@ void CPManager::start_cp_thread() { }; auto ctx = std::make_shared< Context >(); - // Start a reactor with 9 fibers (8 for sync io) - iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 8u, [this, ctx](bool is_started) { + // Start a reactor with 2 fibers (1 for sync io) + // Prevent deadlock with sync_io fibers. + // Multiple sync_io fibers may acquire a thread-level mutex and perform synchronous I/O using io_uring. + // This can block the fiber and allow other fibers to be scheduled. + // If another fiber tries to acquire the same mutex, a deadlock can occur. + iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 2u, [this, ctx](bool is_started) { if (is_started) { { std::unique_lock< std::mutex > lk{ctx->mtx}; From 9ece3740944f23079b4f6d9e7a38166886f150ae Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 16 Jan 2025 15:53:30 +0800 Subject: [PATCH 054/130] Ensure end_of_append_batch is Called for All Raft Log Types Previously, HomeRaftLogStore::end_of_append_batch was only invoked for app_logs, which required requests in m_lsn_req_map. This behavior caused issues when only non-app raft logs (e.g., conf logs) were appended, as the function would not be called, leaving m_last_durable_lsn outdated. Consequently, next_slot() could return incorrect values based on the stale m_last_durable_lsn. 
This update ensures that HomeRaftLogStore::end_of_append_batch is called for all raft log types, guaranteeing that all logs are flushed and m_last_durable_lsn is consistently updated every time log_store's end_of_append_batch is executed --- conanfile.py | 2 +- .../replication/log_store/repl_log_store.cpp | 44 +++++++++---------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/conanfile.py b/conanfile.py index 60fbade11..98df7178f 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.11" + version = "6.6.12" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 97d70ff92..072d06b99 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -44,11 +44,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { auto proposer_reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { auto rreq = m_sm.lsn_to_req(lsn); - // Skip this call in proposer, since this method will synchronously flush the data, which is not required for - // leader. Proposer will call the flush as part of commit after receiving quorum, upon which time, there is a - // high possibility the log entry is already flushed. Skip it for rreq == nullptr which is the case for raft - // config entries. - if ((rreq == nullptr) /*|| rreq->is_proposer()*/) { + // Skip it for rreq == nullptr which is the case for raft config entries.
+ if ((rreq == nullptr)) { continue; } else if (rreq->is_proposer()) { proposer_reqs->emplace_back(std::move(rreq)); @@ -60,41 +57,40 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { RD_LOGT("Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", start_lsn, count, reqs->size(), proposer_reqs->size()); - // All requests are from proposer for data write, so as mentioned above we can skip the flush for now if (!reqs->empty()) { // Check the map if data corresponding to all of these requsts have been received and written. If not, schedule // a fetch and write. Once all requests are completed and written, these requests are poped out of the map and // the future will be ready. + auto cur_time = std::chrono::steady_clock::now(); auto fut = m_rd.notify_after_data_written(reqs); // Wait for the fetch and write to be completed successfully. // It is essential to complete the data write before appending to the log. If the logs are flushed // before the data is written, a restart and subsequent log replay occurs, as the in-memory state is lost, // it leaves us uncertain about whether the data was actually written, potentially leading to data inconsistency. std::move(fut).wait(); + HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); + } - // Flushing log now. - auto cur_time = std::chrono::steady_clock::now(); - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); + // Flushing logs now. 
+ auto cur_time = std::chrono::steady_clock::now(); + HomeRaftLogStore::end_of_append_batch(start_lsn, count); + HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); - cur_time = std::chrono::steady_clock::now(); - HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); + // Mark all the reqs completely written + for (auto const& rreq : *reqs) { + if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + } - // Mark all the reqs also completely written - for (auto const& rreq : *reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - } - } else if (!proposer_reqs->empty()) { - RD_LOGT("Raft Channel: end_of_append_batch, I am proposer, only flush log s from {} , count {}", start_lsn, - count); - // Mark all the reqs also completely written - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - for (auto const& rreq : *proposer_reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + // Data corresponding to proposer reqs have already been written before propose reqs to raft, + // so skip waiting data written and mark reqs as flushed here. + for (auto const& rreq : *proposer_reqs) { + if (rreq) { + RD_LOGT("Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it", rreq->lsn()); + rreq->add_state(repl_req_state_t::LOG_FLUSHED); } } - // Convert volatile logs to non-volatile logs in state machine + // Convert volatile logs to non-volatile logs in state machine. for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { auto rreq = m_sm.lsn_to_req(lsn); if (rreq != nullptr) { From da97daa2904ddb7ed0cce4edd003edf89273a0cc Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 2 Jan 2025 17:52:53 +0800 Subject: [PATCH 055/130] Fix logic for setting flush_ld_key while loading logs, makes it consistent with the logic in on_write_completion This change prevents truncating more logs than expected. 
--- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 3 +- src/tests/test_log_dev.cpp | 82 ++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 98df7178f..34bdd33e7 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.12" + version = "6.6.13" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index be5cd976a..9a748db56 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -203,8 +203,7 @@ void LogDev::do_load(off_t device_cursor) { // Loop through each record within the log group and do a callback decltype(header->nrecords()) i{0}; HS_REL_ASSERT_GT(header->nrecords(), 0, "nrecords greater then zero"); - const auto flush_ld_key = - logdev_key{header->start_idx() + header->nrecords(), group_dev_offset + header->total_size()}; + const auto flush_ld_key = logdev_key{header->start_idx(), group_dev_offset}; while (i < header->nrecords()) { const auto* rec = header->nth_record(i); const uint32_t data_offset = (rec->offset + (rec->get_inlined() ? 
0 : header->oob_data_offset)); diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index c678aec43..753bb63c9 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -157,6 +157,31 @@ class LogDevTest : public ::testing::Test { } } + void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, uint32_t fixed_size = 0) { + bool io_memory{false}; + std::vector data_vector; + + for (int64_t i = 0; i < batch; ++i) { + auto* d = prepare_data(lsn + i, io_memory, fixed_size); + data_vector.push_back(d); // Store the pointer in the vector + log_store->write_async(lsn + i, {uintptr_cast(d), d->total_size(), false}, nullptr, nullptr); + LOGINFO("Written async data for LSN -> {}:{}", log_store->get_store_id(), lsn + i); + } + + log_store->flush(); + LOGINFO("Flush data from {} to {}", lsn, lsn + batch); + lsn += batch; + + // Free all the allocated memory after the batch insert + for (auto* d : data_vector) { + if (io_memory) { + iomanager.iobuf_free(uintptr_cast(d)); + } else { + std::free(voidptr_cast(d)); + } + } + } + void kickstart_inserts(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& cur_lsn, int64_t batch, uint32_t fixed_size = 0) { auto last = cur_lsn + batch; @@ -616,6 +641,63 @@ TEST_F(LogDevTest, TruncateAcrossMultipleStores) { ASSERT_EQ(get_last_truncate_idx(logdev_id), 599); } +TEST_F(LogDevTest, TruncateLogsAfterFlushAndRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate-logs-after-flush-and-restart test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise < bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id); + 
logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 100 entries"); + logstore_seq_num_t cur_lsn = 0; + insert_batch_sync(log_store, cur_lsn, 100, 0); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 99); + + LOGINFO("Step 4: Append 100 entries"); + insert_batch_sync(log_store, cur_lsn, 100, 0); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 7: Truncate 50 entries"); + logstore_seq_num_t trunc_lsn = 49; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 8: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; From b21921e22fcba9b8a2f826142893237a29a1c4d9 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Wed, 22 Jan 2025 21:38:27 +0800 Subject: [PATCH 056/130] Add flush meta for single log store --- conanfile.py | 2 +- src/lib/replication/log_store/home_raft_log_store.cpp | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 34bdd33e7..76fb40595 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.13" + version = "6.6.14" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index f89745107..d6f477ad5 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -364,7 +364,7 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { // we directly compact and truncate up to compact_lsn assuming there are dummy logs. REPL_STORE_LOG(DEBUG, "Compact with log holes from {} to={}", cur_max_lsn + 1, to_store_lsn(compact_lsn)); } - m_log_store->truncate(to_store_lsn(compact_lsn)); + m_log_store->truncate(to_store_lsn(compact_lsn), false); return true; } From 3634a2fa86bccb7a9b517c5e1a7ca50a68598192 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 23 Jan 2025 14:13:58 +0800 Subject: [PATCH 057/130] Rename apply_snp_resync_data to save_snp_resync_data - apply_snp_resync_data is similar to apply_snapshot in raft --- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- src/lib/replication/repl_dev/raft_state_machine.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 3fd68ee24..b8ea8a8fd 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1570,7 +1570,7 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { std::memcpy(data_out->data_begin(), &msg, msg_size); } -bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { +bool 
RaftReplDev::save_snp_resync_data(nuraft::buffer& data) { auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 5e66e18f8..619da7843 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -324,7 +324,7 @@ class RaftReplDev : public ReplDev, void replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum); void create_snp_resync_data(raft_buf_ptr_t& data_out); - bool apply_snp_resync_data(nuraft::buffer& data); + bool save_snp_resync_data(nuraft::buffer& data); }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 5e98766df..a09afc46c 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -352,9 +352,9 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, bool is_last_obj) { if (is_hs_snp_obj(obj_id)) { // Homestore preserved msg - if (m_rd.apply_snp_resync_data(data)) { + if (m_rd.save_snp_resync_data(data)) { obj_id = snp_obj_id_type_app; - LOGDEBUG("apply_snp_resync_data success, next obj_id={}", obj_id); + LOGDEBUG("save_snp_resync_data success, next obj_id={}", obj_id); } return; } From 2294ac777a2afb5a326c670b4e6cfe29374eda16 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Fri, 24 Jan 2025 16:03:44 +0800 Subject: [PATCH 058/130] Fix bugs in snapshot transmission (#632) * Create a snapshot after adding a new member to prevent transmitting a snapshot with outdated configuration. * Trigger cp_flush on last_obj in case apply_snapshot() is skipped due to crash. 
--- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 9 +++++++++ src/lib/replication/repl_dev/raft_state_machine.cpp | 11 ++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index 76fb40595..9947d0464 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.14" + version = "6.6.15" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index b8ea8a8fd..7508139ee 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -178,6 +178,15 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), group_id_str()); + // If enabled, create a snapshot here to ensure the new member will use the latest snapshot with itself in the config + if (raft_server()->get_current_params().snapshot_distance_ > 0) { + if (auto idx = raft_server()->create_snapshot(); idx > 0) { + RD_LOGI("Created snapshot idx={} after adding member", idx); + } else { + RD_LOGW("Failed to create snapshot after adding member"); + } + } + // Step 3. Append log entry to mark the old member is out and new member is added. auto rreq = repl_req_ptr_t(new repl_req_ctx{}); replace_members_ctx members; diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index a09afc46c..710a56316 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -365,11 +365,14 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, snp_data->is_last_obj = is_last_obj; // We are doing a copy here. 
- sisl::io_blob_safe blob{s_cast< size_t >(data.size())}; + sisl::io_blob_safe blob{static_cast<size_t>(data.size())}; std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); + if (is_last_obj) { + hs()->cp_mgr().trigger_cp_flush(true).wait(); // ensure DSN is flushed to disk + } // Update the object offset. obj_id = snp_data->offset; @@ -378,17 +381,19 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); hs()->crash_simulator().crash(); - return; } #endif } bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { + // NOTE: Currently, NuRaft considers the snapshot applied once compaction and truncation are completed, even if a + // crash occurs before apply_snapshot() is called. Therefore, the LSN must be updated here to ensure it is + // persisted AFTER log truncation. m_rd.set_last_commit_lsn(s.get_last_log_idx()); m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); + auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); auto res = m_rd.m_listener->apply_snapshot(snp_ctx); - // make sure the changes are flushed.
hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); return res; } From 727becdbaaaf75d7e791b5bf4341da9c9c003e3a Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 4 Feb 2025 14:33:00 -0800 Subject: [PATCH 059/130] Change long index setting (#640) --- conanfile.py | 2 +- src/tests/test_scripts/index_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 9947d0464..07674b5c4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.15" + version = "6.6.16" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index d4734ac82..564bd61c5 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -78,7 +78,7 @@ def long_runnig_index(options, type=0): def long_running_clean_shutdown(options, type=0): print("Long running clean shutdown started") - + options['run_time'] = options['run_time'] // 10 try: run_test(options, type) options['init_device'] = False From 3ff623e5edb8e9c641f9020db7cf1e4e88f56f47 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Sat, 8 Feb 2025 10:08:26 +0800 Subject: [PATCH 060/130] Persist logstore superblock when logdev truncation is unnecessary This change ensures that the logstore can get the correct start LSN during recovery. Avoid the following scenario: T1: Follower1 appends logs up to 100, then is stopped by a sigkill. T2: Upon restart, since the leader's log range is 1000-2500, a baseline resync is triggered using snapshot 2000. T3: Follower1 completes the baseline resync (start_lsn=2001, tail_lsn=2000), but m_trunc_ld_key is not updated since we cannot get a valid device offset for LSN 2000. T4: Follower1 appends logs from 2001 to 2500, making tail_lsn greater than 2000. 
T5: During logdev truncation, the truncation info is found at first. Since trunc_lsn < tail_lsn, it returns m_trunc_ld_key (still {0,0}), then exits without persist the logstore sb. T6: Follower1 is killed again, and upon restart, its start index in the store superblock remains 0, incorrectly interpreting the range as [1,2500]. --- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 07674b5c4..27e72adef 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.16" + version = "6.6.17" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 9a748db56..d932d68c5 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -540,7 +540,18 @@ uint64_t LogDev::truncate() { } // There are no writes or no truncation called for any of the store, so we can't truncate anything - if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; + if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) { + // Persist the logstore superblock to ensure correct start LSN during recovery. Avoid such scenario: + // 1. Follower1 appends logs up to 100, then is stopped by a sigkill. + // 2. Upon restart, a baseline resync is triggered using snapshot 2000. + // 3. Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a valid + // device offset for LSN 2000 to update it. + // 4. Follower1 appends logs from 2001 to 2500, making tail_lsn > 2000. + // 5. Get m_trunc_ld_key={0,0}, goto here and return 0 without persist. + // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as [1,2500]. 
+ m_logdev_meta.persist(); + return 0; + } uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx); From e68db54dcd6053e00a877a1607661a359177d4e3 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Sat, 8 Feb 2025 17:59:23 +0800 Subject: [PATCH 061/130] Add timeout cfg for snapshot sync context --- src/lib/common/homestore_config.fbs | 6 +++++- src/lib/replication/service/raft_repl_service.cpp | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 81b67b5b9..984a471e2 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -279,6 +279,10 @@ table Consensus { // ReplDev Reqs timeout in seconds. repl_req_timeout_sec: uint32 = 300; + // Timeout for snapshot sync context in ms. If the follower doesn't response + // within this timeout during snapshot resync, the leader will release snapshot sync context. + snapshot_sync_ctx_timeout_ms: int32 = 60000; + // Frequency to flush durable commit LSN in millis flush_durable_commit_interval_ms: uint64 = 500; @@ -291,7 +295,7 @@ table Consensus { // Reading snapshot objects will be done by a background thread asynchronously // instead of synchronous read by Raft worker threads - use_bg_thread_for_snapshot_io_: bool = true; + use_bg_thread_for_snapshot_io: bool = true; } table HomeStoreSettings { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 23ff2db89..6206c3dde 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -105,12 +105,13 @@ void RaftReplService::start() { .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance)) .with_leadership_expiry(HS_DYNAMIC_CONFIG(consensus.leadership_expiry_ms)) .with_reserved_log_items(HS_DYNAMIC_CONFIG(consensus.num_reserved_log_items)) + 
.with_snapshot_sync_ctx_timeout(HS_DYNAMIC_CONFIG(consensus.snapshot_sync_ctx_timeout_ms)) .with_auto_forwarding(false); // new_joiner_type fully disabled log pack behavior. // There is no callback available for handling and localizing the log entries within the pack, which could // result in data corruption. r_params.use_new_joiner_type_ = true; - r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io_); + r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); From 7ae04cef557347cd5c0395e3b758b9e24ff73e97 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Tue, 11 Feb 2025 21:38:23 +0800 Subject: [PATCH 062/130] Remove snapshot creation when add_member done Nuraft Reconfigure issue has been fixed, we don't need to create snapshot. --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/conanfile.py b/conanfile.py index 27e72adef..5bc2c8576 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.17" + version = "6.6.18" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 7508139ee..b8ea8a8fd 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -178,15 +178,6 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), group_id_str()); - // If enabled, create a snapshot here to ensure the new member will use the latest snapshot with itself in the config - if 
(raft_server()->get_current_params().snapshot_distance_ > 0) { - if (auto idx = raft_server()->create_snapshot(); idx > 0) { - RD_LOGI("Created snapshot idx={} after adding member", idx); - } else { - RD_LOGW("Failed to create snapshot after adding member"); - } - } - // Step 3. Append log entry to mark the old member is out and new member is added. auto rreq = repl_req_ptr_t(new repl_req_ctx{}); replace_members_ctx members; From 02af6cb224bc2518c86443d02a40fde4932c2419 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Wed, 12 Feb 2025 16:54:43 +0800 Subject: [PATCH 063/130] Add function to support purging existing logs. (#643) This change is necessary for baseline resync and can be called by the upper layer to purge existing logs, which resolves the following issue: If a follower restarts during baseline resync, it will replay the remaining logs first. However, shard info has already been cleared at the beginning of resync (from the HO side), making it impossible to retrieve shard info while replaying logs, which results in errors. Co-authored-by: yawzhang --- src/include/homestore/replication/repl_dev.h | 3 +++ src/lib/replication/log_store/home_raft_log_store.cpp | 7 +++++++ src/lib/replication/log_store/home_raft_log_store.h | 6 ++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 4 ++++ src/lib/replication/repl_dev/solo_repl_dev.h | 1 + 5 files changed, 21 insertions(+) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index d05be3fde..937450336 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -477,6 +477,9 @@ class ReplDev { /// @return true if ready, false otherwise virtual bool is_ready_for_traffic() const = 0; + /// @brief Clean up resources on this repl dev. 
+ virtual void purge() = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index d6f477ad5..e44b94463 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -378,6 +378,13 @@ ulong HomeRaftLogStore::last_durable_index() { return to_repl_lsn(m_last_durable_lsn); } +void HomeRaftLogStore::purge_all_logs() { + auto last_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", + m_logstore_id, m_logdev_id, last_lsn); + m_log_store->truncate(last_lsn, false /* in_memory_truncate_only */); +} + void HomeRaftLogStore::wait_for_log_store_ready() { m_log_store_future.wait(); } void HomeRaftLogStore::set_last_durable_lsn(repl_lsn_t lsn) { m_last_durable_lsn = to_store_lsn(lsn); } diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index d2c0fd57b..7fb96a5d4 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -215,6 +215,12 @@ class HomeRaftLogStore : public nuraft::log_store { void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn); #endif + /** + * Purge all logs in the log store + * It is a dangerous operation and is only used in baseline resync now (purge all logs and restore by snapshot). 
+ */ + void purge_all_logs(); + void wait_for_log_store_ready(); void set_last_durable_lsn(repl_lsn_t lsn); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 619da7843..b6cd9d744 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -198,6 +198,10 @@ class RaftReplDev : public ReplDev, if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); } return ready; } + void purge() override { + // clean up existing logs in log store + m_data_journal->purge_all_logs(); + } //////////////// Accessor/shortcut methods /////////////////////// nuraft_mesg::repl_service_ctx* group_msg_service(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index e5f33fb63..f252dd209 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -54,6 +54,7 @@ class SoloReplDev : public ReplDev { return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}}; } bool is_ready_for_traffic() const override { return true; } + void purge() override {} uuid_t group_id() const override { return m_group_id; } From 4447575fb59b7dcca5feb8418589bada3d56215f Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 13 Feb 2025 14:32:26 +0800 Subject: [PATCH 064/130] Improve active peer determination logic Exclude the possibility of a peer performing baseline resync to avoid potential conflicts. 
--- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 5bc2c8576..a7367c5d1 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.18" + version = "6.6.19" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index b8ea8a8fd..ed3a1a4a2 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1088,13 +1088,17 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) ? my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) : 0; + // peer's last log idx should also >= leader's start_index-1(ensure existence), otherwise leader can't append log entries to it + // and baseline resync will be triggerred. Try to avoid conflict between baseline resync and normal replication. 
+ least_active_repl_idx = std::max(least_active_repl_idx, m_data_journal->start_index() - 1); for (auto p : repl_status) { if (p.id_ == m_my_repl_id) { continue; } if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); } else { - RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}", p.id_, - my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_); + RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", + p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx); } } return res; From 895fe55489fe7c89fb3221190f60122c72d62de4 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 11 Feb 2025 19:58:37 +0800 Subject: [PATCH 065/130] Remove optimization on blk free operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimization in blk free may cause the following issue, just remove it and wait for GC handling: - T1: blob1 is written with LSN 1 — [blkid=10, chunk=1, cnt=5]. - T2: blob1 is deleted with LSN 10, causing the last_append_offset to revert to 10. - T3: blob2 is written with LSN 11 — [blkid=10, chunk=1, cnt=5]. - T4: The SM is terminated and restarted. - T5: LSN 1 is replayed, committing block [blkid=10, chunk=1, cnt=5]. - T6: LSN 11 is replayed, committing block [blkid=10, chunk=1, cnt=5]. - T7: LSN 10 is committed, freeing block [blkid=10, chunk=1, cnt=5]. - T8: LSN 11 is committed again, but since the blocks have already been freed, they are not available for LSN 11. 
--- conanfile.py | 2 +- src/lib/blkalloc/append_blk_allocator.cpp | 28 ++--------------------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/conanfile.py b/conanfile.py index a7367c5d1..aa1cda3b4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.19" + version = "6.6.20" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 1380a5ff6..eca445381 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -127,33 +127,9 @@ void AppendBlkAllocator::cp_flush(CP* cp) { } } -// -// free operation does: -// 1. book keeping "total freeable" space -// 2. if the blk being freed happens to be last block, move last_append_offset backwards accordingly; -// +// free operation books keeping "total freeable" space void AppendBlkAllocator::free(const BlkId& bid) { - // If we are freeing the last block, just move the offset back - blk_num_t cur_last_offset = m_last_append_offset.load(); - auto const input_last_offset = bid.blk_num() + bid.blk_count(); - blk_num_t new_last_offset; - bool freeing_in_middle{false}; - do { - if (input_last_offset == cur_last_offset) { - new_last_offset = bid.blk_num(); - freeing_in_middle = false; - } else { - new_last_offset = cur_last_offset; - freeing_in_middle = true; - } - } while (!m_last_append_offset.compare_exchange_weak(cur_last_offset, new_last_offset)); - - if (freeing_in_middle) { - // Freeing something in the middle, increment the count - m_freeable_nblks.fetch_add(bid.blk_count()); - } else { - m_commit_offset.store(m_last_append_offset.load()); - } + m_freeable_nblks.fetch_add(bid.blk_count()); m_is_dirty.store(true); } From ee6d34752092fdc41e4147f87a08a17cd8212b68 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Thu, 13 Feb 2025 16:12:42 -0700 Subject: 
[PATCH 066/130] Return grpc error if a non originator receives fetch data request --- .../replication/repl_dev/raft_repl_dev.cpp | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index ed3a1a4a2..2d7da6c72 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -776,32 +776,38 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ auto const& originator = req->blkid_originator(); auto const& remote_blkid = req->remote_blkid(); - // release this assert if in the future we want to fetch from non-originator; - RD_REL_ASSERT_EQ(originator, server_id(), - "Not expect to receive fetch data from remote when I am not the originator of this request"); + // Edit this check if in the future we want to fetch from non-originator; + if (originator != server_id()) { + auto const error_msg = fmt::format("Did not expect to receive fetch data from " + "remote when I am not the originator of this request, originator={}, my_server_id={}" + , originator, server_id()); + RD_LOGW("{}", error_msg); + auto status = ::grpc::Status(::grpc::INVALID_ARGUMENT, error_msg); + rpc_data->set_status(status); + rpc_data->send_response(); + return; + } // fetch data based on the remote_blkid - if (originator == server_id()) { - // We are the originator of the blkid, read data locally; - MultiBlkId local_blkid; - - // convert remote_blkid serialized data to local blkid - local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); - - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, - local_blkid.to_string()); - - // prepare the sgs data buffer to read into; - auto const total_size = local_blkid.blk_count() * get_blk_size(); - sisl::sg_list sgs; - sgs.size = total_size; - sgs.iovs.emplace_back( - 
iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); - - // accumulate the sgs for later use (send back to the requester)); - sgs_vec.push_back(sgs); - futs.emplace_back(async_read(local_blkid, sgs, total_size)); - } + // We are the originator of the blkid, read data locally; + MultiBlkId local_blkid; + + // convert remote_blkid serialized data to local blkid + local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); + + RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, + local_blkid.to_string()); + + // prepare the sgs data buffer to read into; + auto const total_size = local_blkid.blk_count() * get_blk_size(); + sisl::sg_list sgs; + sgs.size = total_size; + sgs.iovs.emplace_back( + iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); + + // accumulate the sgs for later use (send back to the requester)); + sgs_vec.push_back(sgs); + futs.emplace_back(async_read(local_blkid, sgs, total_size)); } folly::collectAllUnsafe(futs).thenValue( From a3a36f9f2adbb31b8d6a173455c7b939677ebe0d Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Fri, 21 Feb 2025 09:38:11 +0800 Subject: [PATCH 067/130] Modify snapshot_context structure for persistency (#650) Previous snapshot_context interface is insufficient for decoupling homestore from customers on snapshot implementations. This commit replaces deserialize() with a virtual function of ReplDev for constructing particular snapshot_context instance from byte buffer. 
--- conanfile.py | 2 +- src/include/homestore/homestore_decl.hpp | 1 + src/include/homestore/replication/repl_dev.h | 39 +++----------------- src/lib/replication/repl_dev/raft_repl_dev.h | 39 +++++++++++++++++++- src/lib/replication/repl_dev/solo_repl_dev.h | 4 ++ 5 files changed, 49 insertions(+), 36 deletions(-) diff --git a/conanfile.py b/conanfile.py index aa1cda3b4..df765c4d2 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.20" + version = "6.6.22" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 3d1f75135..db5bfd24c 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 937450336..6c1103a2a 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -72,14 +72,13 @@ struct repl_key { using repl_snapshot = nuraft::snapshot; using repl_snapshot_ptr = nuraft::ptr< nuraft::snapshot >; -// Consumers of the ReplDevListener dont have to know what underlying -// snapshot implementation is used. Consumers can export and save the state -// of the snapshot using serialize and load the state using deserialize. +// Consumers of ReplDevListener don't have to know what underlying snapshot context implementation is used by the +// ReplDev. The state of the snapshot can be exported with serialize() and loaded with +// repl_dev.deserialize_snapshot_context(). 
class snapshot_context { public: snapshot_context(int64_t lsn) : lsn_(lsn) {} virtual ~snapshot_context() = default; - virtual void deserialize(const sisl::io_blob_safe& snp_ctx) = 0; virtual sisl::io_blob_safe serialize() = 0; int64_t get_lsn() { return lsn_; } @@ -87,36 +86,6 @@ class snapshot_context { int64_t lsn_; }; -class nuraft_snapshot_context : public snapshot_context { -public: - nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) { - auto snp_buf = snp.serialize(); - snapshot_ = nuraft::snapshot::deserialize(*snp_buf); - } - - void deserialize(const sisl::io_blob_safe& snp_ctx) override { - // Load the context from the io blob to nuraft buffer. - auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); - nuraft::buffer_serializer bs(snp_buf); - bs.put_raw(snp_ctx.cbytes(), snp_ctx.size()); - snapshot_ = nuraft::snapshot::deserialize(bs); - lsn_ = snapshot_->get_last_log_idx(); - } - - sisl::io_blob_safe serialize() override { - // Dump the context from nuraft buffer to the io blob. - auto snp_buf = snapshot_->serialize(); - sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())}; - std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); - return blob; - } - - nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; } - -private: - nuraft::ptr< nuraft::snapshot > snapshot_; -}; - struct snapshot_obj { void* user_ctx{nullptr}; uint64_t offset{0}; @@ -480,6 +449,8 @@ class ReplDev { /// @brief Clean up resources on this repl dev. 
virtual void purge() = 0; + virtual std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index b6cd9d744..1d2e4ea55 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -113,6 +113,38 @@ struct ReplDevCPContext { uint64_t last_applied_dsn; }; +class nuraft_snapshot_context : public snapshot_context { +public: + nuraft_snapshot_context(nuraft::snapshot &snp) : snapshot_context(snp.get_last_log_idx()) { + auto snp_buf = snp.serialize(); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + } + + nuraft_snapshot_context(sisl::io_blob_safe const &snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); } + + sisl::io_blob_safe serialize() override { + // Dump the context from nuraft buffer to the io blob. + auto snp_buf = snapshot_->serialize(); + sisl::io_blob_safe blob{s_cast(snp_buf->size())}; + std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); + return blob; + } + + void deserialize(const sisl::io_blob_safe &snp_ctx) { + // Load the context from the io blob to nuraft buffer. 
+ auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); + snp_buf->put_raw(snp_ctx.cbytes(), snp_ctx.size()); + snp_buf->pos(0); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + lsn_ = snapshot_->get_last_log_idx(); + } + + nuraft::ptr nuraft_snapshot() { return snapshot_; } + +private: + nuraft::ptr snapshot_; +}; + class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr, public std::enable_shared_from_this< RaftReplDev > { @@ -203,8 +235,13 @@ class RaftReplDev : public ReplDev, m_data_journal->purge_all_logs(); } + std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override { + return std::make_shared(snp_ctx); + } + //////////////// Accessor/shortcut methods /////////////////////// - nuraft_mesg::repl_service_ctx* group_msg_service(); + nuraft_mesg::repl_service_ctx *group_msg_service(); + nuraft::raft_server* raft_server(); RaftReplDevMetrics& metrics() { return m_metrics; } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index f252dd209..abe966ffa 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -56,6 +56,10 @@ class SoloReplDev : public ReplDev { bool is_ready_for_traffic() const override { return true; } void purge() override {} + std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override { + return nullptr; + } + uuid_t group_id() const override { return m_group_id; } repl_lsn_t get_last_commit_lsn() const override { return 0; } From 5924c1b832aeaae6586209ca2d62faf4b1e96df8 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Mon, 24 Feb 2025 23:43:53 +0800 Subject: [PATCH 068/130] fix dead lock in graceful shutdown (#652) m_rd_map_mtx will be lock when cp_flush repl_service, so we need to release it ASAP in case of other component triggers cp --- conanfile.py | 2 +- .../replication/service/raft_repl_service.cpp | 24 +++++++++++++++++-- 2 files changed, 23 insertions(+), 3 
deletions(-) diff --git a/conanfile.py b/conanfile.py index df765c4d2..780a96a40 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.22" + version = "6.6.23" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 6206c3dde..9857d4878 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -179,8 +179,28 @@ void RaftReplService::start() { } void RaftReplService::stop() { - stop_reaper_thread(); - GenericReplService::stop(); + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + + // stop all repl_devs + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + rdev->stop(); + } + } + + // this will stop and shutdown all the repl_dev and grpc server(data channel). + // for each raft_repl_dev: + // 1 Cancel snapshot requests if exist. + // 2 Terminate background commit thread. + // 3 Cancel all scheduler tasks. + // after m_msg_mgr is reset , no further data will hit data service and no futher log will hit log store. m_msg_mgr.reset(); hs()->logstore_service().stop(); } From 108a028ebb2443ed8c9b1678ea2e43908670f0a5 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Mon, 24 Feb 2025 15:23:12 +0800 Subject: [PATCH 069/130] Minor fixs 1. fix char* copy logic, add \0 at the end of array 2. 
update nuraft_mesg version which has https://github.com/eBay/nuraft_mesg/pull/117 --- conanfile.py | 2 +- src/lib/device/device.h | 8 ++++++-- src/lib/device/device_manager.cpp | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index 780a96a40..2744a38cb 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.23" + version = "6.6.24" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/device/device.h b/src/lib/device/device.h index beefdfc7f..1c3843534 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -36,6 +36,7 @@ VENUM(vdev_multi_pdev_opts_t, uint8_t, // Indicates the style of vdev when multi struct vdev_info { static constexpr size_t size = 512; static constexpr size_t user_private_size = 256; + static constexpr size_t max_name_len = 64; uint64_t vdev_size{0}; // 0: Size of the vdev uint32_t vdev_id{0}; // 8: Id for this vdev. 
It is unique per homestore instance @@ -48,7 +49,7 @@ struct vdev_info { uint8_t failed{0}; // 30: set to true if disk is replaced uint8_t hs_dev_type{0}; // 31: PDev dev type (as in fast or data) uint8_t multi_pdev_choice{0}; // 32: Choice when multiple pdevs are present (vdev_multi_pdev_opts_t) - char name[64]; // 33: Name of the vdev + char name[max_name_len]; // 33: Name of the vdev uint16_t checksum{0}; // 97: Checksum of this entire Block uint8_t alloc_type; // 98: Allocator type of this vdev uint8_t chunk_sel_type; // 99: Chunk Selector type of this vdev_id @@ -59,7 +60,10 @@ struct vdev_info { uint32_t get_vdev_id() const { return vdev_id; } uint64_t get_size() const { return vdev_size; } - void set_name(const std::string& n) { std::strncpy(charptr_cast(name), n.c_str(), 63); } + void set_name(const std::string& n) { + std::strncpy(charptr_cast(name), n.c_str(), max_name_len - 1); + name[max_name_len - 1] = '\0'; + } std::string get_name() const { return std::string{c_charptr_cast(name)}; } void set_allocated() { slot_allocated = s_cast< uint8_t >(0x01); }; diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index cac91237f..28eb37e33 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -99,7 +99,8 @@ void DeviceManager::format_devices() { ++m_first_blk_hdr.gen_number; m_first_blk_hdr.version = first_block_header::CURRENT_SUPERBLOCK_VERSION; std::strncpy(m_first_blk_hdr.product_name, first_block_header::PRODUCT_NAME, - first_block_header::s_product_name_size); + first_block_header::s_product_name_size - 1); + m_first_blk_hdr.product_name[first_block_header::s_product_name_size - 1] = '\0'; m_first_blk_hdr.num_pdevs = uint32_cast(m_dev_infos.size()); m_first_blk_hdr.max_vdevs = hs_super_blk::MAX_VDEVS_IN_SYSTEM; m_first_blk_hdr.max_system_chunks = hs_super_blk::MAX_CHUNKS_IN_SYSTEM; From 29b0c5f7af1ce8480cbae37be320bf43be292306 Mon Sep 17 00:00:00 2001 From: koujl 
<108138320+koujl@users.noreply.github.com> Date: Thu, 27 Feb 2025 11:50:28 +0800 Subject: [PATCH 070/130] Fix missing fields in RaftReplDev::save_state (#654) Newly added fields in nuraft::srv_state should also be persisted/loaded. Signed-off-by: Jilong Kou --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 2744a38cb..214bbafdf 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.24" + version = "6.6.25" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2d7da6c72..30e5682ab 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1199,7 +1199,10 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { void RaftReplDev::save_state(const nuraft::srv_state& state) { std::unique_lock lg{m_config_mtx}; - (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}}; + (*m_raft_config_sb)["state"] = nlohmann::json{ + {"term", state.get_term()}, {"voted_for", state.get_voted_for()}, + {"election_timer_allowed", state.is_election_timer_allowed()}, {"catching_up", state.is_catching_up()} + }; m_raft_config_sb.write(); RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump()); } @@ -1209,11 +1212,17 @@ nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { auto& js = *m_raft_config_sb; auto state = nuraft::cs_new< nuraft::srv_state >(); if (js["state"].empty()) { - js["state"] = nlohmann::json{{"term", state->get_term()}, {"voted_for", state->get_voted_for()}}; + js["state"] = nlohmann::json{ + {"term", state->get_term()}, {"voted_for", state->get_voted_for()}, + 
{"election_timer_allowed", state->is_election_timer_allowed()}, + {"catching_up", state->is_catching_up()} + }; } else { try { state->set_term(uint64_cast(js["state"]["term"])); state->set_voted_for(static_cast< int >(js["state"]["voted_for"])); + state->allow_election_timer(static_cast(js["state"]["election_timer_allowed"])); + state->set_catching_up(static_cast(js["state"]["catching_up"])); } catch (std::out_of_range const&) { LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id) } From 4ca6f5b8f9de550368029e81e5ced11d3f5454dc Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 12 Mar 2025 08:33:22 +0800 Subject: [PATCH 071/130] Add event callbacks into listener for upper layer (#657) some events need to be handled by upper layer. This PR add three event: 1 fetch_data: upper layer can decide which data to be returned 2 no_space_left: this error should be handled by upper layer if necessary 3 on_log_replay_done: after log replay is done and before joining raft group, upper layer might do something --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 24 ++++++- .../replication/repl_dev/raft_repl_dev.cpp | 63 +++++++++---------- src/lib/replication/repl_dev/raft_repl_dev.h | 18 +++--- .../replication/service/raft_repl_service.cpp | 25 +++++--- src/tests/test_raft_repl_dev.cpp | 26 ++++++-- 6 files changed, 100 insertions(+), 58 deletions(-) diff --git a/conanfile.py b/conanfile.py index 214bbafdf..270d02f61 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.25" + version = "6.7.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 6c1103a2a..93eca48c4 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -11,6 +11,7 @@ #include #include #include 
+#include #include namespace nuraft { @@ -367,6 +368,25 @@ class ReplDevListener { /// @brief Free up user-defined context inside the snapshot_obj that is allocated during read_snapshot_obj. virtual void free_user_snp_ctx(void*& user_snp_ctx) = 0; + /// @brief ask upper layer to decide which data should be returned. + // @param header - header of the log entry. + // @param blkid - original blkid of the log entry + // @param sgs - sgs to be filled with data + // @param lsn - lsn of the log entry + virtual folly::Future< std::error_code > on_fetch_data(const int64_t lsn, const sisl::blob& header, + const MultiBlkId& blkid, sisl::sg_list& sgs) { + // default implementation is reading by blkid directly + return data_service().async_read(blkid, sgs, sgs.size); + } + + /// @brief ask upper layer to handle no_space_left event + virtual folly::Future< std::error_code > on_no_space_left(uint32_t pdev_id, chunk_num_t chunk_id) { + return folly::makeFuture< std::error_code >(std::error_code{}); + } + + /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer + virtual void on_log_replay_done(const group_id_t& group_id){}; + private: std::weak_ptr< ReplDev > m_repl_dev; }; @@ -449,7 +469,7 @@ class ReplDev { /// @brief Clean up resources on this repl dev. 
virtual void purge() = 0; - virtual std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) = 0; + virtual std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) = 0; virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } @@ -460,6 +480,8 @@ class ReplDev { } } + virtual shared< ReplDevListener > get_listener() { return m_listener; } + protected: shared< ReplDevListener > m_listener; }; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 30e5682ab..d6f22a079 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -317,7 +317,15 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } + +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("disable_leader_push_data")) { + RD_LOGD("Simulating push data failure, so that all the follower will have to fetch data"); + } else + push_data_to_all_followers(rreq, data); +#else push_data_to_all_followers(rreq, data); +#endif COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); COUNTER_INCREMENT(m_metrics, outstanding_data_write_cnt, 1); @@ -775,29 +783,8 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ auto const& lsn = req->lsn(); auto const& originator = req->blkid_originator(); auto const& remote_blkid = req->remote_blkid(); - - // Edit this check if in the future we want to fetch from non-originator; - if (originator != server_id()) { - auto const error_msg = fmt::format("Did not expect to receive fetch data from " - "remote when I am not the originator of this request, originator={}, my_server_id={}" - , originator, server_id()); - RD_LOGW("{}", error_msg); - auto status = ::grpc::Status(::grpc::INVALID_ARGUMENT, error_msg); - rpc_data->set_status(status); - 
rpc_data->send_response(); - return; - } - - // fetch data based on the remote_blkid - // We are the originator of the blkid, read data locally; MultiBlkId local_blkid; - - // convert remote_blkid serialized data to local blkid local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); - - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, - local_blkid.to_string()); - // prepare the sgs data buffer to read into; auto const total_size = local_blkid.blk_count() * get_blk_size(); sisl::sg_list sgs; @@ -807,7 +794,18 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ // accumulate the sgs for later use (send back to the requester)); sgs_vec.push_back(sgs); - futs.emplace_back(async_read(local_blkid, sgs, total_size)); + + if (originator != server_id()) { + RD_LOGD("non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", req->dsn(), lsn, + originator, server_id()); + } else { + RD_LOGD("Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); + } + + auto const& header = req->user_header(); + sisl::blob user_header = sisl::blob{header->Data(), header->size()}; + RD_LOGD("Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); + futs.emplace_back(std::move(m_listener->on_fetch_data(lsn, user_header, local_blkid, sgs))); } folly::collectAllUnsafe(futs).thenValue( @@ -1199,10 +1197,10 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { void RaftReplDev::save_state(const nuraft::srv_state& state) { std::unique_lock lg{m_config_mtx}; - (*m_raft_config_sb)["state"] = nlohmann::json{ - {"term", state.get_term()}, {"voted_for", state.get_voted_for()}, - {"election_timer_allowed", state.is_election_timer_allowed()}, {"catching_up", state.is_catching_up()} - }; + (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, + {"voted_for", state.get_voted_for()}, + 
{"election_timer_allowed", state.is_election_timer_allowed()}, + {"catching_up", state.is_catching_up()}}; m_raft_config_sb.write(); RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump()); } @@ -1212,17 +1210,16 @@ nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { auto& js = *m_raft_config_sb; auto state = nuraft::cs_new< nuraft::srv_state >(); if (js["state"].empty()) { - js["state"] = nlohmann::json{ - {"term", state->get_term()}, {"voted_for", state->get_voted_for()}, - {"election_timer_allowed", state->is_election_timer_allowed()}, - {"catching_up", state->is_catching_up()} - }; + js["state"] = nlohmann::json{{"term", state->get_term()}, + {"voted_for", state->get_voted_for()}, + {"election_timer_allowed", state->is_election_timer_allowed()}, + {"catching_up", state->is_catching_up()}}; } else { try { state->set_term(uint64_cast(js["state"]["term"])); state->set_voted_for(static_cast< int >(js["state"]["voted_for"])); - state->allow_election_timer(static_cast(js["state"]["election_timer_allowed"])); - state->set_catching_up(static_cast(js["state"]["catching_up"])); + state->allow_election_timer(static_cast< bool >(js["state"]["election_timer_allowed"])); + state->set_catching_up(static_cast< bool >(js["state"]["catching_up"])); } catch (std::out_of_range const&) { LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id) } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 1d2e4ea55..01f5b1926 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -115,22 +115,22 @@ struct ReplDevCPContext { class nuraft_snapshot_context : public snapshot_context { public: - nuraft_snapshot_context(nuraft::snapshot &snp) : snapshot_context(snp.get_last_log_idx()) { + nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) { auto snp_buf = snp.serialize(); snapshot_ = 
nuraft::snapshot::deserialize(*snp_buf); } - nuraft_snapshot_context(sisl::io_blob_safe const &snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); } + nuraft_snapshot_context(sisl::io_blob_safe const& snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); } sisl::io_blob_safe serialize() override { // Dump the context from nuraft buffer to the io blob. auto snp_buf = snapshot_->serialize(); - sisl::io_blob_safe blob{s_cast(snp_buf->size())}; + sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())}; std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); return blob; } - void deserialize(const sisl::io_blob_safe &snp_ctx) { + void deserialize(const sisl::io_blob_safe& snp_ctx) { // Load the context from the io blob to nuraft buffer. auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); snp_buf->put_raw(snp_ctx.cbytes(), snp_ctx.size()); @@ -139,10 +139,10 @@ class nuraft_snapshot_context : public snapshot_context { lsn_ = snapshot_->get_last_log_idx(); } - nuraft::ptr nuraft_snapshot() { return snapshot_; } + nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; } private: - nuraft::ptr snapshot_; + nuraft::ptr< nuraft::snapshot > snapshot_; }; class RaftReplDev : public ReplDev, @@ -235,12 +235,12 @@ class RaftReplDev : public ReplDev, m_data_journal->purge_all_logs(); } - std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override { - return std::make_shared(snp_ctx); + std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { + return std::make_shared< nuraft_snapshot_context >(snp_ctx); } //////////////// Accessor/shortcut methods /////////////////////// - nuraft_mesg::repl_service_ctx *group_msg_service(); + nuraft_mesg::repl_service_ctx* group_msg_service(); nuraft::raft_server* raft_server(); RaftReplDevMetrics& metrics() { return m_metrics; } diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp 
index 9857d4878..d4b6d962a 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -156,16 +156,21 @@ void RaftReplService::start() { LOGINFO("Starting DataService"); hs()->data_service().start(); - // Step 6: Iterate all the repl dev and ask each one of the join the raft group. - for (auto it = m_rd_map.begin(); it != m_rd_map.end();) { - auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); - rdev->wait_for_logstore_ready(); - if (!rdev->join_group()) { - HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); - it = m_rd_map.erase(it); - } else { - ++it; - } + // Step 6: Iterate all the repl devs and ask each one of them to join the raft group concurrently. + std::vector< std::future< bool > > join_group_futures; + for (const auto& [_, repl_dev] : m_rd_map) { + join_group_futures.emplace_back(std::async(std::launch::async, [&repl_dev]() { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev); + rdev->wait_for_logstore_ready(); + + // upper layer can register a callback to be notified when log replay is done. + if (auto listener = rdev->get_listener(); listener) listener->on_log_replay_done(rdev->group_id()); + return rdev->join_group(); + })); + } + + for (auto& future : join_group_futures) { + if (!future.get()) HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); } // Step 7: Register to CPManager to ensure we can flush the superblk. 
diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index c419e6b1d..cdcfa9b1e 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -33,8 +33,8 @@ TEST_F(RaftReplDevTest, Write_Duplicated_Data) { stored_key = dbs_[0]->inmem_db_.cbegin()->first; ASSERT_EQ(id, stored_key.id_); } else { - LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing", - boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); + LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing", boost::uuids::to_string(leader_uuid), + boost::uuids::to_string(g_helper->my_replica_id())); } wait_for_commits(total_writes); @@ -45,12 +45,12 @@ TEST_F(RaftReplDevTest, Write_Duplicated_Data) { if duplication found in leader proposal, reject it; if duplication found in the followers, skip it. */ - //1. write the same data again on leader, should fail + // 1. write the same data again on leader, should fail if (leader_uuid == g_helper->my_replica_id()) { auto err = this->write_with_id(id, true /* wait_for_commit */); ASSERT_EQ(ReplServiceError::DATA_DUPLICATED, err); - //2. delete it from the db to simulate duplication in followers(skip the duplication check in leader side) + // 2. 
delete it from the db to simulate duplication in followers(skip the duplication check in leader side) dbs_[0]->inmem_db_.erase(stored_key); LOGINFO("data with id={} has been deleted from db", id); err = this->write_with_id(id, true /* wait_for_commit */); @@ -109,6 +109,24 @@ TEST_F(RaftReplDevTest, Follower_Fetch_OnActive_ReplicaGroup) { g_helper->sync_for_cleanup_start(); } + +TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { + g_helper->set_basic_flip("disable_leader_push_data"); + LOGINFO("Homestore replica={} setup completed, all the push_data from leader are disabled", + g_helper->replica_num()); + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + this->write_on_leader(100, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + + g_helper->sync_for_cleanup_start(); +} + #endif // do some io before restart; From b85d5a7b7316a4a694c3eb2248640d7638dff8f3 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Wed, 12 Mar 2025 14:16:02 +0800 Subject: [PATCH 072/130] Destroy upper resources after Raft server shutdown (#658) Previously, upper layer resources were destroyed when a member was removed from the cluster, while resources on the repl dev were garbage collected in the reaper thread. During this period, the commit thread could still be active, potentially leading to new commits accessing already destroyed resources. This change moves the destroy of upper layer resources into the garbage collection thread. The steps are now: 1. Shutdown Raft server 2. Destroy upper resources 3. 
Destroy other resources on the repl dev authored-by: yawzhang --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 6 +-- .../replication/service/raft_repl_service.cpp | 39 ++++++++++++------- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/conanfile.py b/conanfile.py index 270d02f61..cd63e3272 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.0" + version = "6.7.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index d6f22a079..bb509b47d 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1245,6 +1245,8 @@ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { retu void RaftReplDev::permanent_destroy() { RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str()); + // let the listener know at first, so that they can cleanup persistent structures before raft repl dev is destroyed + m_listener->on_destroy(group_id()); m_raft_config_sb.destroy(); m_data_journal->remove_store(); logstore_service().destroy_log_dev(m_data_journal->logdev_id()); @@ -1272,10 +1274,6 @@ void RaftReplDev::leave() { m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYED; }); m_destroyed_time = Clock::now(); - // We let the listener know right away, so that they can cleanup persistent structures soonest. This will - // reduce the time window of leaked resources if any - m_listener->on_destroy(group_id()); - // Persist that destroy pending in superblk, so that in case of crash before cleanup of resources, it can be done // post restart. 
m_rd_sb->destroy_pending = 0x1; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index d4b6d962a..0884272b1 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -528,21 +528,34 @@ void RaftReplService::gc_repl_reqs() { } void RaftReplService::gc_repl_devs() { - std::unique_lock lg(m_rd_map_mtx); - for (auto it = m_rd_map.begin(); it != m_rd_map.end();) { - auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); - if (rdev->is_destroy_pending() && - (get_elapsed_time_sec(rdev->destroyed_time()) >= - HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) { - LOGINFOMOD(replication, - "ReplDev group_id={} was destroyed, shutting down the raft group in delayed fashion now", - rdev->group_id()); - m_msg_mgr->leave_group(rdev->group_id()); - it = m_rd_map.erase(it); - } else { - ++it; + incr_pending_request_num(); + std::vector< group_id_t > groups_to_leave; + { + std::shared_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + if (rdev->is_destroy_pending() && + (get_elapsed_time_sec(rdev->destroyed_time()) >= + HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) { + LOGINFOMOD(replication, + "ReplDev group_id={} was destroyed, shutting down the raft group in delayed fashion now", + rdev->group_id()); + groups_to_leave.push_back(rdev->group_id()); + } + } + } + + // Call leave_group to shut down the raft server and destroy all resources on the repl dev. + // This operation may require acquiring the m_rd_map_mtx lock for some steps (e.g., trigger cp flush). + // Therefore, we perform it outside the lock scope and then remove group from m_rd_map. 
+ for (const auto& group_id : groups_to_leave) { + m_msg_mgr->leave_group(group_id); + { + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.erase(group_id); } } + decr_pending_request_num(); } void RaftReplService::flush_durable_commit_lsn() { From 6201f99e64409ac954119a21a27e143689d11ae0 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 13 Mar 2025 14:47:38 +0800 Subject: [PATCH 073/130] Fixes on gc repl devs. 1. skip gc repl devs when raft repl service is stopping to avoid concurrency issues between repl_dev's stop and destroy ops. 2. skip flush ops on repl dev if repl dev is destroyed --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 10 ++++++++++ src/lib/replication/service/raft_repl_service.cpp | 7 +++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index cd63e3272..55d6b71c3 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.1" + version = "6.7.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index bb509b47d..2aede4cdd 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1371,6 +1371,11 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, } void RaftReplDev::flush_durable_commit_lsn() { + if (is_destroyed()) { + RD_LOGI("Raft repl dev is destroyed, ignore flush durable commmit lsn"); + return; + } + auto const lsn = m_commit_upto_lsn.load(); std::unique_lock lg{m_sb_mtx}; m_rd_sb->durable_commit_lsn = lsn; @@ -1379,6 +1384,11 @@ void RaftReplDev::flush_durable_commit_lsn() { /////////////////////////////////// Private metohds //////////////////////////////////// void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { + if (is_destroyed()) { + RD_LOGI("Raft repl 
dev is destroyed, ignore cp flush"); + return; + } + auto const lsn = ctx->cp_lsn; auto const clsn = ctx->compacted_to_lsn; auto const dsn = ctx->last_applied_dsn; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 0884272b1..2b355cebd 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -529,6 +529,13 @@ void RaftReplService::gc_repl_reqs() { void RaftReplService::gc_repl_devs() { incr_pending_request_num(); + // Skip gc when raft repl service is stopping to avoid concurrency issues between repl_dev's stop and destroy ops. + if (is_stopping()) { + LOGINFOMOD(replication, "ReplSvc is stopping, skipping GC"); + decr_pending_request_num(); + return; + } + std::vector< group_id_t > groups_to_leave; { std::shared_lock lg(m_rd_map_mtx); From e3d35d6b771a08d7de2689b2b30d001a61fe9420 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Mon, 17 Mar 2025 10:03:37 +0800 Subject: [PATCH 074/130] Send response if failed to save pushed_data The previous implementation missed sending a response when storing pushed_data failed. As a result, a large amount of RPC data was held in memory and only released when the connection timed out, leading to increased memory usage. This change fixes the issue. The issue occurs in test case `full pg recovery in 5 replicas env, blob_size=512KB`, when a batch of old shards sealed and a batch of new shards created, a lot of 'pushed_data' failed to save due to shards not committed. Then the memory increased and the pod is OOM in the test. 
--- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 55d6b71c3..dc530b2f4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.2" + version = "6.7.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2aede4cdd..71389cbae 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -434,6 +434,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d LOGINFO("Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " "server_id={}, term={}, dsn={}", push_req->issuer_replica_id(), push_req->raft_term(), push_req->dsn()); + rpc_data->send_response(); return; } #endif @@ -445,11 +446,13 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " "trigger a fetch explicitly if needed. 
rkey={}", rkey.to_string()); + rpc_data->send_response(); return; } if (!rreq->save_pushed_data(rpc_data, incoming_buf.cbytes() + fb_size, push_req->data_size())) { RD_LOGD("Data Channel: Data already received for rreq=[{}], ignoring this data", rreq->to_string()); + rpc_data->send_response(); return; } From 8f5a92f7ad91f3fb7579bbc65077c1998fd496cd Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Wed, 19 Mar 2025 09:44:06 -0700 Subject: [PATCH 075/130] issue: 667 Enable on_repl_devs_init_completed callback on SoloReplService (#668) --- src/lib/replication/service/generic_repl_svc.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index f5671cb16..067043185 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -87,6 +87,9 @@ void SoloReplService::start() { } m_sb_bufs.clear(); + LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); + m_repl_app->on_repl_devs_init_completed(); + hs()->data_service().start(); hs()->logstore_service().start(hs()->is_first_time_boot()); From 22483e5f3abcd077b1672b16f6efd7d4b72b616f Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Thu, 20 Mar 2025 18:04:01 -0700 Subject: [PATCH 076/130] Add Disk Layout Diagram for device layer --- conanfile.py | 2 +- docs/imgs/HomeStore_Disk_Layout2.png | Bin 0 -> 345352 bytes src/lib/device/README.md | 7 +++++++ 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 docs/imgs/HomeStore_Disk_Layout2.png create mode 100644 src/lib/device/README.md diff --git a/conanfile.py b/conanfile.py index dc530b2f4..6a6b60aa4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.3" + version = "6.7.4" homepage = 
"https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/docs/imgs/HomeStore_Disk_Layout2.png b/docs/imgs/HomeStore_Disk_Layout2.png new file mode 100644 index 0000000000000000000000000000000000000000..8775927ee41926ca4ebdfa97c5cb34afbf3c7bd1 GIT binary patch literal 345352 zcmb@u2Uru^`Ys+ih=PEENZk;MB3*hH5D)nrc~u7p3*RJn5L<|PmabVW^7Ne2WX zfrCHA_DMm6csD^rxJ!VKEFQz( z`}gs#k9cnw?!QqWaYQv1fu1@eel$Dz6#_S&J zyXtE^l(cY!@;$S3d~U_(33bBV2PEw&3G70xT%WOeLLm+?NlzKJiz_67ecWMwHr9(v zT^l z&(Xz(Ur<6qf?q(0Ur2}-xPlkv<>30vlh*;p{^w2px{s0-%)-Ua$<@x$ffaY(XU`qo zTxHnUa1Z+X@6Uc(dD{K!Ne-~zuLZmyKkf{_AfEvL-}eTZO5=`7KC<()f*2^-K>?Wo z&ybZ65R$&Q{(n33uP6SarT)KK3Q7oz|Fh{oPW@-oCon4)MMo&`Ojp@|jo0tS|2+A- zp)^14)&C)iKjnOJ6cDuRC29V@2Tk^pCO%pj7)M4sC9TK67SK4{AKq8sANQYIU>`r1 zg2ti)&ErR36BP$uFaN)O*=xN7NzZ@${p2_Rd%QDu zzCG7q5hGEEv0ER*p6_CJP9`cWI)^)x2wp#5G{YX6HDWOc?3VlXdzZ z6IX4Iz0V~4eh>+BZqXL(+3srXOg+rz*5q&*l^g0arT^d%lu~jnr=Yc!@#?chv-4#$ zmnmkq@7mwCfGd{6$`$0eYHZP3*aNMZTIf^*dyM#VJEk7IL;?m9W>0g}$Z(lr#joaF zzp|+oKOc{ig;0KzP?xp40|}#({O>Dbu_v+hCX_MSX$M{KO z>uF=tk@(K1#iz^(>>HnNhDp3t)`FCuKf7Zra<)Bp4(*hl65>}%wBQy!UvWI=EE}N@ zeyUNtdc%hgmfBnVB9|c|y7hdsbw^y!5N)SqqE93Ay5hBgi1PT!fJe(nrsvOa4Jdu9khPfz>=kWDu^QN5x4YfLbeTLPzN^YyDNazO$(3YmXeoheZ{S}?*`1WkG z&yNq5a(H8WykoK_@vqGZSSZF^xH6o+aF|k}CC?L*>;*7aqFxotJ=LIEz7cT+&Ao`O z=wxpEz&L>rrfJvy0viq9>=Ju+aJt?>$r z@VtXO#R@=1+t2?shJfmB^j#tDuJL2{9is9-d>T-z-~t^H0p9&h$hSWtt1_iLHsq2# zJC(bX=}f*e`R;vIXDnce9+zeR(gBR4@#T05$u-vNKH(mKK!I0S2o+jTPGtXbf3rQZ z0T|^@KzD&T@`T*wV#@tFJ^qy3ze~Ao)p_9+ojYK#Mg#8rqR1v0V+v*VUZYl7R|$Ro z6&Sv6z%VD(xmJQO5Y>%#eW$M{| z`_K~?S`114%KcbZ%hzm=e)0dpG8vos*l+$zTX6P~9bN-v$8-VH0T}rUk7IE(0Ei&J zD>V%Io*i-nY~L7VhZ6^-o z-a!7fPm3`|6U1(Iyq%9AgQZcMfOFXD@xE-O*AwOfbXd$hCa;Xe?_dZW%Q36GRZjmb z*t;5CWv~m|aq(+>k;_N7S@Q`|0{ zpq**u^}*pf&8IK6;n)FhTZa4c=D-=h&Tpc0$>QCHpe+tPJYI>ITMm~XL2WO7!Ou$9 zJ}c9capBt;1a>&NgPb(z7b2W||!iLAni$Jvtb9iaS2gVzwz#Ks4(;6G52RSygL zojO4I+=v@AlgZU@WDzE3D$HJ%5|-J`LY>Z;S5y~5VX^xoDm~G5)1*&2>a8XzjkJ7! 
z(lnO&F7r&R@H%?+?!2Y+Tl*qg9>HvT9z^MzBZce}hSlF%B{$uB`9@*iI@-xOTEmxe z)iE;9p{l_cey$}|hrg|^<0r<$TBg{9d-DGCCR+ zID~tx0e?5B+-dw1%G3C-L9AS z7vIm)qPOF1cCSl$Jv5cSzY0%|#QOOiUmqaJP>4~@%DQSX`7#*U;Op3H1;_BXx7{~;8vO-GGkk6 zE@MRPEzVZ|Zpf-@Y%I89UIO8zNqp>kiR64F`~2XFN9BzNO8dH5`-zNK`rIC^^JjR- zTbA0qBOjwts@tBZDw5(*zJ}}(q044>PH1CWQ=9Yop2z&7`RDbru2pav<|qI8&#h0s zPH1H_BMvtD(_R1LM&S%=Og*aKaklxWrfi$fXQG#OM7>#u0*%fkH7@-)i_AJ++WpGC z*-RpL@_FV>sVfI6CK}R3BIc>sd_!)~>&G=*9RI!GIzB;%c&DcHuXYhXn~&&6F;7n< zuQ6m4(V2QatMfVDDR~;uzq*Dn_NZ>_csp*s_0!P7Oa-j6reM~Y{heC+t9Kh!cTMjI zf_{uCUiwFf^HmK)Ojncwrh2Qq+e@%l)BgKyyEwL<`K*= zZGI~b6J;aK9(;}MMz{dsXkwV^*=Wr!8>>@DbR;>LV1D*jjEC#4@2XT|>q%s5mjzdh z2z44>u&3EPE9l^ZUh@l&#;KXo#@EXT)ZVh_(ERZtOKX z-$zM8Wa2PuDsl~{?Od|)r*kHNx?N5p59FIuW@!coF3Ft)%XL9}Ev`7Y<}tXHdH?u$ z8LrM=ik!rUuO~pF&nFQ6qH|5Z^5oEYGlyjXXKpQai*l4i_$rCPf}KF{P=^Zv8IeG| z{GsG_;q~UDF^38h zjl`wtuXbH-U&mUGYIbl~fK=`q=O&u|`_(7wMcK1V0hnLZ>ne$)VxXplD~?@lQQP6+ z`*&5dj=UJ59Biqi7~ui&ywl!ROs}uZ%f$^iDICwZO`FYUO{LAWltVwgJjNQ@mU&#P zh}SD`l{1^U<65-CDOTV8h1(n?*hqdb@}|t!YTX`#V688El83lJ2 zG~mJ>NB(af8d_?WkHB9)XXky*xvu#=>BB>Ko(24--8IqS5HYn(_v|qa-f6CNLdb2E zkZ4@9Dx2(N_FTTTi4S)8MG1G;Mb;JQ-)X7e1keg%zkCiscY)W{`ESQn2ZqQjUZ~#$ z@Snw$B13AJdc(UHg7K@9qb^F{IlkO~S=0~X@aF6^ODI$;I$o@#+!y?X}%|4S!VZrKTjW|jWt=r zYu{yn<3#FzlsJf>FspK{kBurgAA*YCL)jr20q3axGuJ$qN1as&Ss#~;_eGk+ulB%D z8ELgCQZ3n|ir8fl^C6H%Y8id+>~YJn#;|)+LiIO`Q8B2SjM`Q2eu;j`g+D00y1)wG zx&doG*FDyHerT?u*6YD&4^ud*nJ_P+7DScmN9pJ zRF}Rv+)%6BWri*=O9KmJoDx8@9PXw(;`irX)N4im=*G>*?%lK5U5()#P{Oq*(rRMY zmt;Gg*I&QtUSz?ZU{FsOUW+a_CTScv_qYKF)ABy%1i;#s(&sQMmDbkTMT`fr%HgP= z9s$Q5NDu8@LIjr?dN_I1*Oi^YUbbm~D7j>}*5Yw4jsy z;W=c~tTF5{- zF*sq2wf-m;XLs+S?9?ux2P(iCqyx}HSh(0lvMhwN82^l`J6rI(+bH~<_K{zeS|di;{>`FyGzUa@|D+#2w@S>7kmF(Sf* zV5TCd1P%%YVirZNb^0iJ@4|OWm+{mMJiiS<9et+MA{sKM0T-B_Z@U7i^DP|TXidT% z)^K~B*HNkRbtdU;CHc;E_?poAm14aheMQQmD{@s6K9G4|PgJj2*5~IDJ6ZW%dzYIG zTFjCh9PHPYx+22$f4Qk^1^QQcJ4s{8C>1#K_-3cVDK@d-)6CfGQ8aUP|AwBt3yP(z z7u++nv4u+}Eumh%YPg))mk9VzSy~1-xWCZ}5@gL^w%p;NzuO8ikbAaJwXFRK*%%p( 
zaU(_Qt3h2xk=v6s_Wjsebf_y8yzY(#dDWAVN059X`Pr3jglbl;Wy7NZC=f>#dE<`V zcN{`a)lMFdfD@qAkXtg>80_%k>Nun^dtd7e&qT7->E2HePqE<8R`%!QF>k+f@PZna z+6Ux5)~XwVmP&YOEcJi1n`b#Pz74IbEuCSNc)mZ$t}I)MnMPNO_aP=9&VRgVD%5#% zEhm^_!w?EU%#ou{bX~qLtEg7@k$*3!VN|Sec6Nkj^7&aCi6SB9wJ)vCz8d%XX6IgO zL4%A?mCBCz*HEb$s|CC{BheyWGRoODy|!6HYCgXv zmYQ~2Fw6X_HVVilqmFb%N}1}@1(x%Lrn46>DZEbCl+O|?bq5@M7a36X9k*@@%ZPM_st=WtpW5MnnPF!^b?qp!pdZazr`jCgf~>yf&11W}x26P=TXD4m|5=Lk z-RF^;+35G#8>K!oo9--UYbvn%lj(%4gZv$wBk`zfoGEGi#_JGYCsa5KgjoG)Nw82M zn5i!F4`k9x^^|q^>S)WB760ZeW<_k(J!cMSyq0}hpPkP1K<0dp z5-U(g7Lf0|PW;NiXirR5QOX~)$bwkqUnCuK1EVjm%131E`mcrht4_vab*;0OGaaJV z#A8G~x9!au%g=$Zh{u;M{9N9w3H_riE1PCuj+=b0+19V`*APN=<@AQwIBg~@S}>~R z4ai}K#P$KUfCUz9gm%L3XEKu24WJIN=`)YIZKrMW$zfIeT;+LlSABN{_HEu>e+qL;=kDfgYN2?5M7_*$2pf(>Cv8Z->izSEbJsH_aTp*|o@c4$&96@{M>KG2K4i{32H_kKdy2 zgB`c%+L&>#;M3x?*E)0qegk zYYbE#ngHC^_h$W`lB?-aGyn++?mRE@Gty~Lh0gwTC&j^s%|K@Gq4OJkdr-_+#*$0hC)edRKajLwLJ$$c~ zSo4^hJ)K;u$6yHwm->-cmbD5YHuzx)Tym&xuhZ$JY9n3u+}Dn<^r@x& ziUJ|3ipY-JD?Tk%*)4mo_#X~P?KUyG$k&UK0A zDk1*iyt4kM`(&;fJFx82rAqBOLUk^jy@DyZxI9?m?MRIkdjMHB-q4Zl^!-{pe(;B@ z%||=7z*V`<)dT|fef!oUdoO#Ktj|}5S%Kqv^eM?lao=UQ<+!o$DwclS=4#x%@>IH~ooB47El}rbS?{OJXai>T5)fhNHO?isa7L>-{0Ivn(X<0$hwWfpqNS zvh1&JEq|9nrcO>!`l)||vpGBvxg_SlD29I0oxMHws=;|6KV8FR?q`Z#)6eTwNc{Bm zj*uolRP5P!>~bf$6CaE@xq^2lN4kFFNm<&|T^R;ncL?fv2b|CkhE-mannMksq)wN) zvzJe)i?cB8S{NlhN_U#20sC=64ARr(!K7&BQIX?YqFo6wYJu#g9k()`!rFreW}nRl zfdD<&O%@4NzlS9pGkmyo#PagWin|JUtgu~gkR^08pf<~*|FfG4E({-4?knL1vfC;N zKjBlw3Wdg6bUgdU7q6)gw?t`6e8lV{&$0Ah_lRop!(1+ZBSs$TW4emfZcrM*foUV7 z0u)_lCk+q7_CD63tLl*!NdhY+B{l%Yym?!B06?%=Eet^b)-`n#xX@U-e9u9RipEq) zf(@5#t=YyEH+TP4^uu9m#u=S;_>w2#;oEUC_9-p#S zzZ)1Q>?4%@HF52_l$v(5{XB*J7nIO724q>_{lod5B3q8owLB*O^VI-@&a#F+lu{tg1JTSM2sJ_7zcQ%Au zmm~JbQ7qW9B{ZGcf1fKrsO}bPCIhY0-dDg`_q=hq3&HD7j9lfc(vVyw z5wpth*{QF}%@t!!#r{f_RV~!2v_c0nALOV-==UGyv}9;WhtiXFd-Z8tJ|IWquZqgo zfVUip;i$`E0_K$5JrX@A57UPD=d#+I=+0b1&q)04L^9i6T4dDqjMhNQfifuoeW*Wg zgLMc@(rpidm4grw_p|f12VV&Wrt6^x=1>iTPjCH{QPdxJlJ%>vUNu*GdismT<)}~$ 
zOXer4VCV4N6|X~S-R6wm1MRKDPTJAMUGK5R{b=qemh>RY4v`LnNwMuV0Wa+7T=uE= zvKbx259`?IzoU<4808^ld3g~?ypvbuT4z5}O zl$wul^-ElR`=UfeMFhvQ348{+w1=-Bo=^uyFW|B=dTv5inrL$U#Q@MvwzJv2*J4Xy zW~}4_^s4vhPv`RjEptfbli!KYYffs4s65!u(8*22y9HwCA#5TlpI+DWNW+Xeo{i=! z9pir-7X3`$zE{6#yegT1!%Ok;oabeZ>k<8j1Tmnza8+<65-=f}d@OMpszy1~gT#Y` z0?$h~l9@t<%_%^=$9RT=L`?cdNCW>QyduWl<7}^$RN+Qx9~6+Tnh|GUV!7{421nZD zDurKLIt>q77KaEpxDW^+EI4(N-3foa$byBa?tuX0!V@VhSTnwObkAcOgJ{f=Y&G?wE}omN-E`PKqF*7c0Mmc?^p!|xIM z<8=%Lx2gnL2GSu|iFs!Yq2CDbMoSOt%~f;V3yO zr1Wa~;w=`#A$~fJyL`2)SS_zSd7&|t44!FE1-V=~!PkPL&JIrJAl>lqt4;S<%%$S7 z#Yok_h3X&f%!=hf{&qYk1{JHDU!3j8k-mI`j%dlBdY(WuEyhE+$SOwa#Ef@pN9hpw z%$U$snC~suP(P*>+>u!gJqJS-vm9F{E~&0qCwRp@tQN7jV!^-2ZJwS<9d>(l7bCF4 zm#{_-WJ!W(;q6(R754`Mok@6KeDNzo9l}9oTspN)_PeV4`KIk1|0PnhYM=lR@VhYn zCj;S6MZOGIj`4lgr8W&`3v|a-VQ>XEp#2XVQ-RPoI%SWohw7pPZvq^(z$IsYmo;VL z`a}y3m^zaB{Rh;wUxM1A<%RY_-rb$YXQeft_6Q>&4v{=3W1R z^bh(IBI7K7^tuF^j_Z=BY7@S25PP6{4)<01^6s>f>_7%cHhBs$wYF%-5ghbV34<>d zE=pMZR*i*v-(ZV6NIg;5ili_RY8J_Big&xN^=w=CJKhprde}AJb7XdQ-f5t;(rzx% z;k?gE>qo~}>1+aGTKOxg#BA4PdRBvas|l`>rjTFW8LRQ7u-V6%qRaP(DCHk89OXfV z$gWZduveyaJ)ivGNNM{SP5bFd?v3!DgfYCs<$GZ6(1CsoxCzjQGTrL-jb~9e1PBHA8ZYQh}68abOy08g{O&h=@Gyx*j=_{LO+kv8gF>-> zi$8{bW*qy*w2KGdhbT0^3A1$g@(F;SGJD%U$%nwy63GQ-NxJR{I5K#UR{U=BZo+YWziid&-;qCpG~1w_ zQVKys#X`xQ$k81Y~YG*jmP1Ry^KeQ4PY_vzrY$@3! 
z={rR~_+U@`vt)l2+sN<`NNs7~KYywNR_);QTXgVfG}?g$N(3_bk;Zk^w_hQ~oNZMh zvVx{`lB`UP6ohV0Z<-$WoW-=rV^d~|T4ZJIykw3WEq_8tTE69lTC4__(gm?ry~@gio0 z6R$%XBv97GAAYU@78dh!a&S&i^xA>7@_{b1t=In|=Dp!FULVx%Ml=ayClL+&k)fO4 z5{(HxAou%Z5Qq$Csr}A3HlG-p?%_FlXw8}`>0&`FHv33SUP#P4(K60BQU4V1;%QDy zwtsnX3t1d3>~BICP++g(d?pP6aHbDgzo|k1AM8*3pYsC$?~2KAq56oOKsS=+FbNtc zkKY)<5A1Gr!xJbZqvTy<=j{eK3*P@HpI$)IK{hO4n)rty{GA6-^K|a}J@~eNT%z1FFgL(9re+7-Q6eTlkE81Y zZ>^Z-kXx-5$wSTiz2zVro2aCIHDjU(vzWjS1e0Q794rh^GZ0%mAqI11Tqk*$6nYM8c4Z^%z zfovk(ZJ`iUtHtG#d>*wE>-{}1&GC){QFpO=y+JVY&5?!ui3?5Jeb&?#6{F!g3B7Tl zR8wt?LCIl3UrQMW+I2 zULP-qwi5J8ka))IX<)_hQ&vf{fe`hZl5CdEi>h8cNz>)qda25XN1OM2oX&@AJc)&* z{s6J7VPc`Bb_8t#2<9TFsviT2a-GhC(Jzzfx;UL@jI>c)!U;>G*dCJ+UUwTL%SU4( zXqvd&phF`7$Vij`9U-x}w0W4+KCavG>&Z5d)z|}sp2>)6dwY3mi$|$kAC{M0GU}*A zMGbu%3UJpC17_^cq(`ChgBWM4qy-63yZ~3}0L_4ruML1W zIf2AS00p4Y9d{SrDsP4WRuggE<)Y%p7iOpv0$8-@G2?|R?IhV?biA_^4B6n}`%9@^ zZSTDp0^w7$N3{}ZvAnP5u;<78jL1}yFd@Q}eLp3epCPPx8i7`cOtud~ZpD$Z*3VLE zhuD%XJ?niTTgUu(aWXJWJ@*4%@v$aV8xBK8BJJO^$GvN_+Q(Hegu~1Ti^K@JmF_%e zh$Ad!&vmJi*b^!9(m21f4R~4h@Ke^*Bd!fsZJGCSkK%ThC;Ac-j*PYB&If9nAyk?( z6D@Q%8oS&b^4Fy#yJZf6}SA49mr8+e)e$jH9ZCsQ|3uR*z)f@oi?8CYc8aXM*tY1M5f*msl z0YEdQJ*@vveFyz##?L}QAjvVZ0zO+M!n;QENcw@(7G`5%p`#(tOapm3Td00Pj z4{n0No1ngvc37oa_36Z|ds%DNE%tdfU|l|QiPbj>E@lh;NI-$87DE7xiZde=7iFz* z|1(wi&EMpODSl+}!eQXAakSW)*gq*9K!xH@`vs7adg6BrNoZ^HapenQ8fKu1^1GO9 z;G^3f(s|EPn5)5mbJF92*!jQ;$Yw15NuQz(TCz9UG?ZYwcD3;h3)@SeV4IAX-;q|e zHqw(PWuvZk(bAt7eD!Fa*)FwoF_Ir%57;yJHu&hW=q@Cs|c95BVp8&BiM z1D z_jMz{b*2ZyOkLQOhv@V_F{>6%N6{;NO)BXbq99envU+DQu!WstzZc1hJlmEm8;&paS@9y6P zo4teIo#Anj**dRL3h-$ke}9G5bwE=rQg08nqSDU$HEw74WCX974715_i*0|tawRbw z_ECwtDrWgL6?)B@m40eVz&W{b+R8fAIUn58iBRQxvBm@KsULx_Ax;tuhacf6Q9M6! 
zyy-LnJ|G*$@h<-+GzW|5Y#K`SaLrzl0cdd-Jj&`9fQwyKzhE%`oj*_jl+>@RUYfCj z8lD%_;ceFcqlaWxU8!#}Y91mcA!f~0B0~gQLhy`0;SgaApg>O1+g_xXGeB*iB+z3& zQe$}`ECD)c`+NloWRcQQJ0AmsTPfjq5kz1D&7jUU<`&AF%aDn+P(&hvicH;=H_$3I zq)?tAV*Z%P#|CFjqH|+$#U-lIV##@0S-s z=MYRG2!iVEUT!Db8MSd963xR1KvQ3`VFMB_Er2x8#E9{gjJMfuwp)8)!Rqy+J_XVl z{E^gtoR`4D%+zju6hwiro}Q}6`p+R15rR86$3ETn=aX{#IlU?zDep3LH3ebw$zTzv zBO2}ae1BE3l^lRdhE44;p5QnbIx^_1+o7`{d$hA!-7L0V()Jf;EiNJJ2(N`bU_&=*?e6=2v!CekE#NMl-LVqpn06} zl;{^aCS1MMCYBalH9oV&cQ~052?MMfIbHWzXT!<6iAUtL7bvY=9 zxx_8J9AKJK$^eDeLq;p3Z4W9M$HeFo{D{L1T7YHwu&Iw6cMZUkl_6SH+|1}#mWpXj zx@Kh|JYJAmcGfP+KuH4!B;zW@9m{bpHHSru>3)1k(@L5W-=2PZFsfku>d= zjfoO;gG>UMItfDYeAA))>n7!L<7x8n&CtQ^{4w^rg zi~R*zE6IW>$ydF)OO*tH4$!tJc_BSaPvzR2*zE1h(A0i~7;F=v(dv{fpnT4+(;355 z3(NK&%-Q&CD{D6tWY(bpuFU%p64|{V#%)8kX|Gb4T6)mZz66>=EFAPA`?zU!xui0D z*2d>Wj#E%ihTN`arb$ijjCC63Q`6iJaT-4 zJglk6_kga0!+`<>8tJ@Yc4Nv@RLTo2oozx0ABq#)!!b=tr5ZV<&ws@t4(HLKC3E58 zEB8qqSpT@xkp0Oz-!V_{>?38`kcTOyYtBPkja1N>UtHhlIHaPPUbRDXm1ru}A~eiy z-dpV;T<|`4b!btd9X2QTf@sl|wK&79h#A%mT#g zH7eHn8piJhhu%V8h!45NG)gBCY&c@1?Oam3S1kWK4(()sE;4R@2f|AN^9L+hfJQ|` z?RP6uQ6Q~8v@KCjyy6N~Gxo?)Q_tlqS$!J}ss%QO z+b0U7B73v>ew5k9)6UMsFiG+KkWB0;3a?YA!Ra`J9`zcEg8j94#&lE>W_uWuMk`$JnA2S# z!?CJp>I068N2IDZGTXh)7<)$c!ep|b>rjPqbe$~3Rm-bKw}DWgfv~B85ZOS;RCl+q z++EFx)dS%@TGg<0vPpe)BMa4M<4*R}eJvv^ep&|At+g`@b7)M^l9ia!jI;WRx}y-o zhd03|gZ-qw$3CTLG-vHY=HO0-abNc;o;lkM!1Cdx@CE%4*+v1FvrK@acdczQ+uKGP zxzmZ6CjduFWEF3Whu^DPy@Ak8vLohN{4~eAapwVW`8bE?v>#9v-NmtgG0p3!67{kl zDXnU@PfY7M^{pxLTF8vu$nLw7t+|ptc8~Qx$b(5#q}JAL57T}#ak=42wzcAGmQ0w} zK-k?#xZFs%(Xc9eTyNsN^XsOJ3m*1Z4?l21GveNsPKe#{OS+~U_4pPqnR^npdOdI2 zbR5PAtr`jI(3}l~Sq;_KU6OMw_;~!-WrV%d{XROrvmM7y<@!eC?qmT~ttFwt?bpc7 zT*RB!0vjohzPGpb{f-Pa_P)@yl&e2_ypd1}Ya~4Gc}s4#u9ZPa7P5j2s#SF=QAdny zQ)7D53c{Q!8V3i710K$c*0GFf9|OzUkf_=po9d-v!%yex{fld8Fxy#xt?;{Yh~-!aR!YL*6uF>&RO}q88B=fq*`!0t=8T21a)|> znqIhF+_);eDp8-6K_2VXXVGxYOPYavkXNCFPuX+TIT0(oQota%@4=bFw{pvA!(kZ}LQo#Qqf^I|Qo=7t{cgaI z4q&UR6jWGLStVnkOEhf*t1b;{`HpO>c2ekH7t{b8t3X!nhMumg>$#kTVZ$@_W0scC 
z`jD0$V;4KEWyZGw;%;6P7SlD%BeQLQAUYeqzTX5mxTb;37>HZeSYGEbj)tpy`r`>@ zhYtL#OV?yDxaJeEmD$qE-gx9s`K2t2FW35(3FaXj_6)~GYiBZ6tiW52{giM= zK#UrV1`>enML*@$9uB+@bIA{?K{@1+GEA$9h5#Ji2Bm4g<*=oRTj|0F(I`@av-XwU z8&cvnxM1lr2b;t_%9RHz?~~f6hrP$#E{fZY<3_6w4j$5N^Y+81EX}5F7y-4)T9KOY zy4t6xTsFAaE^JY!ib)o+KncJc@RX0Sfm-f3-WTgmy z{7f-Q^Z{8p0>A@Y#h?Zo_p4pPRjs9r(Zjx}6`5_i{_cF*8A{Ted)4bF(vCLq%$sl0 z&z+~8JD0t6#`^u&RFu?@?v59*1}#qOh3jzUX0 zQ$pvp;FZ*{uqBj#9VB&)OR-!b8xpO`{J>=GxMa*cBh(^XHD)l_UVPO3KN)PxhOHdW z@(yoi3PSDx++c43v*R+7r*_$-EPIb2SEswsA2_-RZx@mZhe(nnn3>$(izRFei4G5rszUdS(?x`|ssc&imySCk8s zo&H`{`AJ+}2r01_RidkVS9zAgeRg3ev@I(9*1o9Y=A{Er7Ononb={@y%R)uAeOpn) zSqIR)dVF|L8z>b=AX!L3+yrVyY%xfrfv)RH-%t#a;vKNK9)jnHwp#F6p)u+b;#_=4 zB%erl$HIPES4&3Z>jbXuU$uKH`&L;o*?V0%yzL#JXlFeh%xA?jy`KsM#$$gl5|9$4 zRen2Q(|^3>{B}Teu9|*u?O;R;v3aD!mzCXD+aM3kVjx%I!SaZD5>aHun{&7aCR2L0 z5s3bXEAc(O`SY5gWg}LGBRe;U>(++E+|@P>yk(LUqy1$OWha|=6iX}g-3P(@ayclB zhXXNOm+%a%+->8ho4>sT;0gxrp84-EmzCEQAv_Q1*MW7oPvnLC;#*SL^pK-{)eQ3S z3^&o?h!PUGfw1#VR&uRa$EY_e{mc!c@%;=@n@DSOj)2Z{Y1MOpg`XPPhMl;S#lXA< zJ9si6;cCL~+~<8JehFMP%HxJN?Xt7qe5TYaPI5z7XUV@**iD}j47+1NnZwXV-oD&% zLw>8o(QmV9`H+C+r0tc!t9=1YK|aA2y!uF8zj$WfbDYb)Oq=IuCp7@ex4*ErxdkeC*TK;War4An@(_&w&KLO%Djp56z2#@+NMf(w_^*`o@rBPtf80 za}~6#U#}^9^52}(Z_Ei|5Vl{4ffdx32zjPZ|msP`hQL$%arEj};U_U7-_Ele=2*L2J!c-@yP5x@2 z{iLiU0X19fNKz&BHi0U!CFo1wb0TwEhHus^;38-RQaFe%8U+--w}{Uds@b)(aCj5_ zIsPkLf!|7MvwwYj^`!dx-qJUI|6fbNEEse5ookn4* zgwnh)0kv(53l#1zmKuG+}!2)T=G=HCuMl&zq zwZf+F^srK6y^cg+WG3d((;VqGxXn~jn0xMUQ2arw|4(G3yikDF)R=>o5Qg9W^tAEvH>O7g8WQ^UwmvZLUrc6p9wkF>B0Ll>)U8<*ixM6p0`UFiA}5K!G;dnCo%_62Gm|}J=B3$(>E;Le zIRP49=I3LS1>Rsf2l+D1~Gagg|8_^KWNKV8v znl>J+`+RH>dfqqw)IB68quNR>gZ(V*0%Jn?p|F)@zFdpmIEslYusW324yQ4lC>(m8THcd4sP9GMDX*{G$Vg0 z9C~laB@I-VByr$syz(Q3R=dUD7}y%H6r1axV4JgZpk>RiNHfgy+1a#rk3mnpYj>6a zUh@m)VCMtaP;4I4BZbG@dtqxDfkNi>lu|)&eU7%eAB94Tg1|(ydxij>P}P6OsF5!% z5;aR5f7wz3Sd3(~DLX$1c%w2#ZS?~dc{0+j>4D=nEFCVjC7KBG{&+w6q=PMO14=xf zVc~ov_WfKmzbmrakL^pWt_PnN|q`W_ilaiYF&@0D}uNp<@QRHi_Rj 
zcqQj6V<^+g!=7;7+ENa5d;@-ME5JGHpSOKaK4nhOTTb%H+Q@z~sPYY=Dlzvjzd&Wz zpC_o0U~(U&ja-(Y)@e{&){l&-JYTLd+e^M&F6j;UBLlLhThg6)-GjK@7FJ5c!Gxw`$v5LT9vZmnMe)Te=17+4(go~s+013+Hxr=uXNnLh7$ zZl}p$Xc*ondDMaNim~XV{pvs_Jt^(r43=-Bh7_zzckpvMkFs!8X0>)<8em*aQ579G z&K{O_1E_KTv|Ql()eM;C`Au1X`@EufaEhZ!p&AE;Wu~O(28YI)5>s9J6%qS#JtCBF ziLo2UXt~wwfOD`6rOElz^Vp;)2Bm7)>|lTd_QuGAOcb-LIV9JSAY>@C5=NlmJ$kZL zbtbDIQPJPeRZZ4*wm=!+Ih&T5euaB8H{?o>3Rq}a4N;Hq%B`nItdS>1yHqegK(uF- zH;ZQ%IA*1{Iji}`RNOexn47;Jc5r8H;!_WnUQs->UK`(2qy4U%5h0wOA!rDb&0zK^ zJVX*Gj8kMj3_mRq>V_rYaNj<^3)~l|ZT^cQf?GL{tJ*K4<*^OZ zjlb9PDUa>1yWx2X$*nUxg*@&`Ar&hz7DSF(c?ScWF zEa_KgKUS(!jXS5B%oo|b z;c(&_!y_ATw`*MI`<%@O9h$^3YPPAJ`oF5cH-w%lXD+75+3Yh%?@(umMi<~ zWUSQZ^VO-^y<}3d7N$6yq)#ae6)$h#x_N3J14JIM?mDK3B<;ho)&&8CT zJfiFm{92@-$1@x$?Ar;f?xQ@Pe>S;4h@YFODf8Qo*;S>QHi@`7V5RklXN;~fgM3_K zyLv=&5^sA@R$*;tX4l`t0YSI`n^Q$7Bq3jW=kWWHGLSuWBYWDvVK|_TCRud$9<@K8 zB@{zWMtSGI+e&>MC5&4W(m=@Acz_u##>(t^AuuzH2z02(%!l`hd5*#U>46T(^Y+Q} zcFAlaHrujT8L`HR{)TJsQxUO60JYTQ`)TfYxl8F9U?0lHt7dOlL?!<)jvEOY+Uo?a;a8X=bXRsmr!%+pLFGnGA)5m8EmY)*TYy%pJeMN0N| zAMca8y5{Hmz3%&a-;c-d@%yLiO6Pcc&Et3;&*S;D@!5>FtgasCa69LYHDC-Ia7_{0 z@cVY(W|GVySGN6*z`P4v7(a$8b`3qO++`^qM{KJ6wMV0U&g1-*;-GK7cD@;XLo_)P z6EEjwQsP=ki8_UcHGC(VhFLbmPx+2FMdS;(CPZJ#pMNtNXd{Y0#M>(#x<))|91vy6 zUNc>LOIk{~fM`V6Kxb5kr3E|w*x^b9w)rcibmK79wBOlz!eXzR-KAhLvjP_?CL(bhvpO^xRw-(h6L$X*&^?|C~Yiy*mbaFyD*?BVdqRuK=kV_B_&@H*-obJVV&f1#9+O zCl6T$(c_YEy%&%3B=w8>GqBH5I%=4Ta~W4Up(_J&qR)h9IX`nJ?B(mHkXU=WFY1J- z^2O499J7z}!V-=EU0m#Cu0i3E1bj>bjbU9ENcWr=T%oTis#K=xGr_VedB)2P%qQlJ zevbSS^J`F6aEEO#txND8s#Qhh(}e3Rtk%#yWHWvd;i4Xw;;+hL9}B;s+NRIil$|{4 z{Yv>RDNw>OT9i&N{Jzjo<*6)Yld__UAF>M5dx6}JRvM968v#R&N4lBNtWBVz`ii&N zaN)UNJ_Yhw|N33boXnduUOD1=$3@bo`CFdtWgj&WVjUjq_I321jXy#CPQDd< zN=2>v4ByDWpN;jwQRBb_rcd#r-S3DV&3L+e{#J6MEGFkZM@E>UMmrQ-y=7uKm7m+B zE5uG%Vk>y34z%}MD4#K8?f+;+9M9>aIC7gRvfky+9!$I8-s)-%j$m=WvqvsOU;@`x zBgo%NiPAjwxo|}5k!=FWOb>a-iFH2ODp_Ydt+0Pfwe8Y1&@#UT^c?7e(02XW0hxVszhNg(_}lHZKlZz%^IK# 
z(VWONwbGt#beH_rQDL0GkhTJQszZUy>TNANXdZugZu>t9hw4!wJ?{<9IGjzWNkI#o zD>y&|sR3V}0N4Ke&9Cdnr3hLpvrzxL$L3s2;j#hvO&FYxAm%>Rd;z*U# zLDL%46#C2abJhkmY*STU`p@LpsZ3YkBIQzecm6ah<@?1q+WGIp7f3=P<%A+^9(pUW z`i=_?t#eEU9=*ouu(EmMTgX=wXRU9wemsNE)dGmk3yx17#=MSUyPJOMgjyG`sDk5D zE()2wR}7!}C~^oqT6rLMaVY-i)QISGtKen3EJpM9yG|9XJvSLzd6l}0JwL0>B9KF+ zJLl$Jn(IZO29gS%lxSIMVbxMo?QZABhZ!RT^&6*~ZZ=XRBE3olS;>~|ce2UH&F?68 za;V;uOVJm>&_)Xl>Rf5Eyd#sxBMBGa$Wf(Lblgixhr(e-;|)o4iR}k|)pEHS%Iwo( z2R`}Smpe|CeERtbiqyTl4G)X!D7>uP*1G4kVu!thRq$@9d^o=0Sx~&Pf8n$U{m7!i z+qnz#toHKStY$vgr~L<#&#<#i*V(u*nD5J|kmAmeDLL|M^w)C($zo90qG^Aas;K>F zd{Lf+)#<&;zC1?CSDz;yd!n@XRR+wmgh-#ny?jnRqL?EpshrWTY~Ran%0e@RAR7! zO-4#*w|a6KRaD>l+jVfXibxm(Ee0)#Ubd6)=^E2v?}}=|B%C|$${$Wy({%aQlK9kT z2j{4#8zPmUFmAXX9o-c^mv?rnR5D~5N>NSk zi&4ragtea1ccs54z*M2ty8@f6!qO^ov)Y>YGK65GQMr^;)Vx=ZZxUn8Tf98xQb{#7 zb%|>HxSiPeXQZX-va&#m7^`DfsK2L4`61J`cXaQX;um_rd5SD zQ1Q!SdRcEpVZ|>()z0C2gFK!X;g^o|p8SS`ffj?ICw5}|dHB8R?_x8td{EWLXX3gb z7r^(8w_JXgXoYIkZcV>AmU!r61a&a6^zX!WRcTvI&-pFCVK=5X^ILkeDD4>HQLDnC zfhgjd(Jj%6WlPjl#o>3Wbc$uDiI?3C8Z9jMLLmLY3=5v0v zcMV8i62>K*5#_ZROVPL~+2nXiZzwl|_vQL-X9CI7=+*v`qfO%Fr+9{G7H&_&hNpRZG$4I`FV z?GL!`Y}|67%FrX$nXfTzbDXyG=wi-lel?NN1m}F8iHduQ1%4l&_SSOpi0N>66?TS^ zYxTA+O9vm}IA4B-WF+65{OA+6C8~{Uo3~IFa5wcIov!X*=54&T+Tc`muy;j#Tx!EW z!oB>^ql`r=qcY*W48X43tfI@*?5ZYOn9?f{Jo|*a+d0CvMbD#H$Kbux>3BUgjTBS5 zbD}R-KCkE2=0(Ilh{*}R&V5_U>5R0~=C|-BvDzlOGh)_~OCRr!>h#sF9uWKB6HPcV z1(a;h)>wa;n1azU)Fe9f+-ord5|P*EH9ohFr@uvG`#Q9sUs=|plwwp`#B-!I<9#$I z(%@_qqiuzH6Q?63LcIs2O4cp|h5h6xw&v$jv~63KJXLh>&K=wJ!bKYsW%=d@zMX>!#nb|Cjcw+O(fD~PksS-s0 zf;mlQmM9rO#vvrXMR;(7AzB=L-#H`_dOx>dyS#k*OZI{Tj@66G5q+Wk<>pxY<5QD` zZylz5Z_?L)Hb*p=^hLY!nl1D2u1P5LcO?qyC?9tbp=OXKmS&sLZ+qvILT;^*E=bK# zs)5Sm8F!J>rIBeevJXg;`7pS(Td|MU-ly5%C<{HO<}rWEY>x+ln=L`YSFpM(&T?qc z@piwg-qTU9l#@uWvJERRvs;Z8*lcFJ$8V|r{Gn7Z)R9+EjNwga{A4+*EuJgQW98;C z)wZcNL4tv8R8>Gk7f@G~nS;k^4zZsT8eDzEpVxEz;=?W{UV@nImx#?#J!yjBtiy>G zMx=jW$%_^Iu3iTEfv3J}Rb;Dt9EqH{%#MYURhY?;w{4oJp-p9PCr&mJR2GFnToG8- 
zOhC{99U+{i62gqOCk4LPA2~k12|VqzTPL|21|B~BFu4&)>c*z@vD4Sr#xEUCk(uqf ziC=DQH<-vpAP-80tH9_$)T_!Ce)MJ|a*cEkpdCNeG8mL=;BPbS?YI~5-xdiIs{yq$ zP>6u{Rc-4Gf_&IdEe7!VHHEuK&$k)?Ab+dz2YvOn9)+LrWZv<18_eyW$ZR~BG~C%K zF1xIB&gp_HZ=P2FYQZdH(w&J5wCglHNRuY!`VNS)^T^*Zm46i6!s)W;-VX$36i)L? zr~9D}NU{L^_jfHl+g%mE=S}@-Oc8=<8`Mp5!~n>e#l|4S!u%%ZF+oSa{<me!B){^y5p0;x&i5C@_fR+yXGmH?bu zSpcY&fv|}R+kaC|u}58I7Sx4`zaZquC8$RnV_-SCa#|z*!ghN%P-0pT-9eQQ5Kz%< z?@Yz1q=~XRwzH!R{ygOW7X%>Ti3KQXpeJrhthDs77pnEK)8y;k0ML>7lJ)*T4rqQ_qp?=^^itL?yWsHi-0D7sp>|cp@@=IH{9Cfrz{_4g(&Op6vE} z_bf=#2~=fya!ImutW)&&NZh~bxf{<2J^AXH`fGW~`+f&_Z)(qbXO!K$UH6!u8}C(J zhyqjRa|Ls>wMDSjB;_^F-KC(e_9%u9QTjz$m)1Tat01DNS)0z$bLhC{m8?Ts$;{D5 zBN1<5(O+C-AXajT${|!WDBRc?VdorgYT@MpvR4^_J33`lKg#wz`DCWqA>RNJzQW^6 z0pX5uyD41WoC-S1;9oW0c8T)*<6>8+Oec${s0D=!oVRS3u5U|fXR9S*aRj7UWtheO zsTF@LqT?`phlIC!YCfFbe*tB1&W}k|*n7sHPfrNH9Svf>T*2OXuM-eahRv062N1lp zG}E4J&A9oL$4j3ut=PCJ_ELd{WZ;4;$&2FQT3>Km1FqJwuIE_f%BnA?xlUZNT`$@bW~ zXP!44B<0$sZ|Fud)g7MSe9r4Cp&Maw5RLU|-L&_%Jc?2Y7>a+WsW zMY8Z^HUm$F1$C8qzSiPJAB8Z&;Cdmax@_W6lZ7O~k7Q-E0eXxj3U&Fx`B|$W@LPjE z@VPoIUvxnQXEBd`E4|`%>+38gT+NA`n+JkqNr;xi?{yxS$~KE?_N*x|;?JCPApa&x zWXw?*e(D|W=X95q+6@v}JO!z&>h!OhnW9Qs;FY_5gJgY7d2(|NWj+viRz4@DjhrW) zYB5sSZ%$5A$8+A9d&AQY(|(QTi8Z&@dGh(IkNXHbs-6=!60@8o*~RpLNfc`%emTfA zX~^F+Z_w_Hi)a4jpsskGxBRYIqP$7j%C2M*k$PIFhzRF{W~6w#->7I?Qod^ET|qlb z>lDBnwKEWnx1o;WktJU(!ue8>>r%59sn#dgJA%a9-nuS0r34S_TnkLsb4m2usP2+# z_M8%33wfE}5%gxmGn-Db+~t$TQhPhlt^*4uMSZltu=}}#mUUUh5t8_Du^QG%N_=ZJ zkYgE$PaHj)fgA?jk%cyA`3JK_4OuxFo_5R#h8l@ZvtkFlFU!F{x|c$O^co@~;sjmM zaQz`_i=e^T1_E%~&C_s~ULu!1ykYRQuV^9Y*hsGCQ=?>Nd~W!bcSh?bK9_@VGCX(s z<98Df-ifr*X3b(70N8zCj)gUfkx5-;d2j9)6ORDdbPsYb^Stmq(;#R<;51})mD^Au z?7L7yU~1ZtU&RQDVY`YveWC>7ItUm5!q86`2TP1x%Z)oT-Wwyd7vLs9SK$c*F7v11 z3{r^b_Kq5Ka@i$U`HU2SKj;gOR2(Kb)17m4&tL$%_4s!j}^;${U2=y9bLM*hL{ z1Ev9H+l{iI;rfaGy~bbFL2y^NZ5tuy&(6fKMc|>GK3l=1hA;Oiv 
z_Xwd!e<#5HtlfzeZvHZAF>L!MSz3TZuvT~Zph`c5V8I%;a8lR&JW`5_vik>*cBjSP&=tAbh$OERIO6Le&`4vm-i7fqxdrI4G8LNZaOnKMFG&M2#DNOwk=Zr(7bB*RzL#`(_?X7vkMvf0Q^4?`p`c`4)u; zDV~kfXx{vZ_%8#VP~`abGLvy8Hv8=9q@hy*X@ z03x)@UjUVVC~-A6De*iCG%aVKRJ442i;W{`4+G-sP(Vl~`b2ELSvErQTrEB#zFj7s z*YCPEx*C?jgEZax>fRc>muHn#Pg>tOpT4owuZKW{wUr*kOp$BVlzI|tN6$azpXe|M z(sexZf{DlI$#Du{Yt->n>*6~7 z<00GH*yVoggp5=IH_mQD_!Y6;TbnAz95en5CfQdcE{tvlelO<9}`cN2E;%m6zMba6IF(2E*J*=ak|yz!v(Ldf$En`mqKcg5X;nUWST z5O`e?$7tj^#kqbwiDP|IhXnhE=e4pq%;Iq}>}uvWw&5ic&G)Kg>rBi0G*ocSFxu;# z-3Z02Nm}5J0A02ynL37=*?7lKW}r3ReOwD;I;FZGO%nQ1UrnV)sZ%yZ586;f>>6I1N73}frH3jQUR<{+vR|<`>0m;*ibfaiVKAJK|C2; z3yAEFI8By{&Z65(`;2Sm53KJ35nwsM z?1+llQ${NfJO!JM=~fD-X3+laR568`H-fjx$+3gh>|FRFrZYjXJjh>X0i0~v_tr#}M?;_C@947d` z$>U&y|B~egu?5H>{1)TFM-Ug^Nw>ftop}uS)`-^tKmoW&-QD~rNbXPi9}$O1j)4P$ zJetQa7k>)wOd zp&-qSh$9S$qW|cih1Y%ub7&FRteuQfmPtun-n&mZax^==2E%Rjjvq=25#zYN1bRs45m9T3za&MZtg zWnWN44f!4qbN8%(QIyxZOOyx&kku~P;1(335QFLd&8S9yC;;iq!!SHWw2c6`{!8y3 z9Mtufi{u}4mwxDmSs~BJ3YHnCzyl!}(GEjPK%RDr8`(jga$pJpp*v!3A`tk$Z&rf@ z$H+JTQR?_-7nXO3^xwAAN@4awRsd<}OE@lNqkahHKmyxZfH)fcO>X!XG{2p^1VD9! 
z5coaQL~M-Um-_A@gdBRv;Q=N_+qM)tXWs*f^Rowq1wgnj%)!Uz7ax$DLxv%jj@8k8%9Mct4z;5tKRMr| z5Y;3&sObHyF8e(n-cE@AX0C8~Jls3KlLxyA)*L&IN{|xapxOGqQx?jM*PtDG2L%6r zWf+3gFW_dZ1=A25GW{2a894BOFV;^@AwNy{4`})?_UrEw-Zwau3h7iCBZBei1~6f} z{r9(v4sLb8GVn_)4uE>fA8mF3TDc*l_&?_25+vh3a)UyMtD+p;*p#AMfY4NpS{m+E zmk~1;&Zm#M1D_4QRN5K|qT_is4qb}yI}Rmk_wFWA6id_5#m(C zx$&CktJ#9M+q737`b6A-q5$uo{zQp!Arb7t8ts3^4x`^eunRgYj)&#Tg?W;kS(PV2 z`5mn~U=il0+^(NA$$!^L?y=QGnRvJDv+~G!JjHZgOrFx!Gfo4}c#7GjEBv24<~$fy zwZ>|<|4%Zx>ITn!CZ52$S2iwtT>J%&yJqzq5a*DvZWipv;{I>OF-_gdW+YIF{0Z?# zY`drWw7{Z)buC0Svrcj$eL3XyIdiFOX8MEXP4RC)IyZJ9g2wC2PiZ6x+5A;^E00*wDNsw~&DE5<`fRr_S%v&N z8Waq)A3a`wVsMaMgSPgwUsWJzH-n=J#3zt&9&tLt1xY&yxQ0aT&3MFsCuh^<9%?3g zx9|nB6fW)dD)`zV0oN=!69ptkurIS}iRXF}3`%6bM%xYyn0c7PqHOZ)we1+Z)^<0} z-8_OOSD2b%`B`qbxlc3Fu{X7RPV3Goq%6WJ;7zG$G2L*!eZY4 zQ_?@2(|__r9GF95RJ^MQr(<~^^5^$R0=erT2)Er;DufG?w$V$3jsf;^-|a{slnAuM zsQ_XCF>hld7?A(Xa_ip)oj=_z|IHX2s-D01xq^!7|AA`@Zoi%I4g};8bGW~c!9#Pf zvDkDlXz`l8y@Q1#!nA-vJAIsQ`&U@-qTqiaz7wPQf-?wQ%rO$gMc*Ki2kG1&qi@6} z5)u_`z`sJqVYbQPV-eB2$1;N4nnGO`)7mz`Z6;f(M!XYSXL2B47 z&y5Gc&Ld>IY}XMWJp7N07I3=zQPtmx|9{GBk=P%SQ!E9Nw?b;Axvir7|1cZ+o9%*# zJi{VPhbkUoXjUP?Bf0~8mwal>r;pzGY1SnZ@qUj^R1neD z^H980MZ{z%wy7xjD94cy7N-z*LFC)E-IVd3j6d~a(n`#kKSul7B!6pjZO3366mGmO(6*Ci z@nAB|@@L9P?D|bK#o$eQgxx1&a&X3gcEVG%V-Ldt$u_?~-JO%#^+xwDsq}2)W3lMU zVT5K-8uBZ%Rm=Pi&&4L#793{Tn(UHeC~qL0=+k+NHtp?wDly{-kC zwNUF3&Q1Gp#^xrXR`Mh5FV<9Kp?( zOwmA>xaKK*O~*L7*sR>xH|0&0Iw00%LR`bE2;s%-QU-b6CGtjXSFIBPGyMYhzZsOQ zW{EY6m}sYrtE-@z)|-c#K=!y@HKqW`))H+ECA1fSQ$exW{VF+`!fwB-J){H^;n|Dl zoq_ZUsE$Or8;Rm|l!YcOHrzn_q^U7Wl*hxk@CqqTK3X}Rz_6zs(j+vLJfzU%1aZ95 zoqG;-<=jY)Q}#qr&kD&0NETSKV{-P(qxv&)8R|Mb!a=wmFOBIHe;eVgR``+HR_P$| z9>LMiL?r>Pu#cPF4!LUJg2W8h~giEz3Pj@acyeoV#shXW&iaWm-iQ zAz~a7?o2w}T0x)a?l3J&YFYNt!K;jYQ=45!Ot;dQ5p!FNl&z}%s+q(^$c@^(Y&Q%_ z@)yb5nHJeJRPIBtFBocm`%TSY$My5(c@O+g?V@GnNwYaB3^bjP4ku2{B&IWk`)7IF zdCQ3!btYSfp5UCHa2=uim@*G0(QDdII2%s_`G>@khN1&UHlax0U3Gr4T%Ty5lRyH8 zjDz>!?xuXIg4SYt2CStx;)2n%w{H{o;(XDqF-L`G6l{1tQa+Uk%WwYpEiP-9-@vh} 
z+~SA{{4^7qHGO-N%SaicATJn;8Gd=h#OQZjYh5{~-UHobj>@IT=w;JZQ=emLw`$mi z+pmW{Njwj3u6@NO_FW6$*K_W>_OR?x^lFyeq* z_KaKjP)$$_SY73XIqKF2xNy~mloQ94%d@Stp~-DC9sKolm9@J~<@c31y&=EM(uwuy zq*N5PU9LGyb;x6cL-&v=L8-d}6!y%zsj2YbCp>doTtwm=tVVcK>4bcb_OOpjJo|Ks z1wC;%7?s7s8Emm|KoRS-hh1A@S7d z)4;I<@aG^FZ13^uL%8(%ZBl1jp*t@YR{P@lAi0`Qh zuk70sQ+{7M1Jg~7jJL|}zx>j_oy6ZgSc{^op~r6z0t2Ohte!1(iXU^NtgVgC%Cf)5 z6mVi7Arg5OvQYg}Wqc2{4sXU=s`BtkH+YP{`Q86(kAX}Y^4kT;gL9j(iRn~xF_&`- z*ZO4oriI5y^9vqZ7zs*bn2PO+V%k%=tu0og=_@*&!v7T1T~Id&72Ox zLR`<0neEB5V|hv7B=H99Dr0x&&@9**3$B5Vnu>3H$kkMsZ2{{;I(Jg$BS(ByHEYW*sWjY?K73-a(KyHHp1jPd=BKi+6vNl7JPk|nMh6=;SS+r@cG%+C($Cpg;@`9@ zc#$Dw!Tk!Q1_TUMW8O?8S-!0An^C=d7eyO-$fb*I6r6)f)CWZG1$21Os!)f-EcxR$ zag`ebd&z9=u`gg9>%oAweZJP7Q{2W8TwgcBpo@k2L83*Tb%CBHz8SmPB~!fu?U{lj>HXP(N0b>fa+G-R`IzTDmG$Z(?@GHwbM&kpv1^)ua?xHr{JM03{x>2CM-?=Yb#jFT#%o|z z0hVbivdKr9Q${W$dID(i+7}$ra z-+G*7%~)LcsJF!8_|BVxiJdQRBy>}-(mCGB=gV&C8Qpm2PL69{!>fQ}&$pzDO=mrb z;~NbrT_+l5uQM87OSdX65xMZT1jVjKV?k4QHY%;^*r+2|ZTB9D&>6yZUT0VWd#O%O zL&>VA4-_cJ?Q81Fiyg~};^ycbDdPx3_=@H)C@_}HC-4b%H!LNM9xM=Lq&J=u7|&0W zDp>*@se5?w1A24@ptv}m6J5$7UkF+r^Mv882 zQUy7|5b%=X*=y;H>D@3x$fZSGOaQkHjPY-`4IJ1b)UNi83HA5%Ekn<$22x87Pz&!0 zo&Lu9eSa8}uy+gdYZKJSlex9<1*1-wKHrN+$fU%Q_u2){lU1HlS!#nkf zubiMzY`iW1J_)|Y;MyHSlm}?Q`a8tV`Sl#FvTXoQt8+zykG&101*}6RuEczfsk1&# z$P}Sv#zlVEyV?bK@*Bo-b-a&xatgc5k{ol>YJW*uPTwUhVK;WB>Knm_p`qXn51rCc zEh?5Mm#7CtD`U}t#lK3>rF!og%;D+==~UyT9=oQzwXY@pnDh#oO*BS4iPYp*`lIAn zQR11N@x!wBZ(Y^ib)EZg!+6heV)r2j>-q;(yzYmhKl|y1&!rrB%2}ea@`{+YE-Wg; zfLP+F6zysE8GeJxTd$P$1_V!V3^A0RS#j?aeeM2hjJC&~51z)H?_EtEytSWqT4>YQ zKzrHZxf78d8VT2Pb=;B|5GuLda*EdRa>JV3r}Ot$8usD_YF8+R{GuEZW1uI>9AR7i zIY^p0h+M~!(E=~u3x-C+u#&lWB2faw9o1ZTDKN3jI}?k$WTmu!glKSrqMe@~Jlh#L zPW^)JmRbh{OaaG`4LlS)ae{Rf7RYZgkt|%;4x?Bh;?Aq#?XyJy4h;Nj~mBIJ@vvL^!0Nu!9mU$CGobdJB$; zN=^^xo9YS?n@jYcf3+E=qiSj-g;08@*PmQ7q7Zq)-i6{JTQm=HX@tnbg0DFxAP& z+92|!`X2fIG0uEGlT@{u2|;djB#(z78~pP_I1GLbl0P&9OJ?|md+JY9rwdb8wKH{6 z(7f{n#8ab!?(7~TV6<870kN$weYd|ZG>w3?^)>u=?%M9`X(6wS;{D*P(jq?$+vJ?y 
z)!NhSQ-{po3_j)VC;V{5ZDZjJ%ye{+HKT-67IEadWJyKnd-qx`<_GwSv63$N#1$R- zKE_vhLt2-59pZu>HOA4`TZzw<681@^g-vL)%v{pY9c4=q>Aj&wew{uQFGpG=JeZ@L z_5jjPBg8kZa#u*}Dxv!ZlsrWzRX*^QG@0|TtR(03*+%HO=6MybBNpJp3nO`TC2PH3 z#b^4A<6H`3W3(G3mUMF9dR-H+8Qe8fygcORA}64H?%9fYYSCCk_n2d)w9yGpt>}}g z*C*c{o4&`vU{#pTAmv)P)u+r?G-6r2a-BS235H{AVh*^0rPH~?fs&L_P81vdz3msi zhroybQ#7`-;eMVq6L#=rAQZ`HYI=Fqyx&$C;;_)2WB`lN$L~rXMk%(dUjZ(ZE=!nOmEJ7tzcUydl z*4+ibk>!u)r`qk@4F`@>6v>HG*(e(xr#&J!I`nz>B7vSyE|L1z^eU#!9Qz~Zx@0gNImRqIE4S+5Q@m7za7YdBZ98p)}Bt6=lLz8dj%)# zS|sk;^BBo&JXz_%?ELD7cRRyYESyxzMTj-5o*${Aal)~|3Ya-5E| zO&<}Vco&8y&R^e5h`2IYcid)3u7rB|jjVCat${AQkoV^o>xh8gOJ(Ov5lnV!fEW6rX}1xQ%mx<$s0^c zj}`=wtl!}P6Lu-veI@zG1O<#}7+XNfKkdh~Y+5{_AYZwfDP~Bk8aLinPUl8QOjh4- zHh1g+Ag#8MnZhp&I>)Qr%P+E^v8V|Ob2bP? zce&dR6ef$Pl3l33KyH*?w#!}4@NyB?DP7zxeEH_XBT93xN}NtSMeQ%-Csfh2z{9*E zu1ju@zOcU#&`IC(3CW{M*VuL|LTuEbw5uy2jsNzdMZbuC<>^`(=4UQbDn@DeuSRos z?`s#kF6eemhs*362>aC%b^<}TTq=of=CuCnVWkHgE93Ztq4z0wL zygt_$ik*0`bc(u)bP(nhtiy48diHEG0%YowOT$ekHqlZ3ooC&S>sATWG*Ba=6xO`(Xf)qrPC69{U=`S(@w>QVmqCbon$626>qU_5=l-p& zIC_=X;rn@JWZ?<{hei*hot}``9XwG_H7>QfN7;oi)aTKdJRxhkd4OUi*WKbQ3`C#- z8FK7`3i4BQ5@}_0QOwZ`6SWNt+y$GbBG_QaJMiAv#d^1dtw<)Zyn8OtF@TcZqT|<> z>%CM)G)?)8SXgw?Unn!)pSXR9Mvh3*$#S%WyxO{mf6JRbRjYWEzbe1oMg|zQ zztt&fkM2t}trYjBt%mM z&n3ZHxM9S5$?6wB(pB{Mf{Z6ooAyDet+8Cc7S9c}4(X2_<`)|8%_fQ16R#4vNC>m| ze0$nwZg4uZ9KbT7xudIeL8lWTFCre78Drsp^dc=}ANw;Sw&gbb8l9?&`fU1Vxr;tP z{0t*C@M6FJX`U*qjU(^B}TWF1!)%QO^r z+FGuXT5Nu(sQZMwf0%%t?o83GxY6`(fyQ2lDP3zYuAUR)@v>3XLv_3b^?c7%&qs5$*AmP`qbd7MF5EJq7rAbM5)@uxN)?Wd?z|N zA(O2g2~7IV;qs*#yrWS7W=t*bg$3j^{F=Z4JVT^3;WwHXn9#H3SK0ivjiPIw-X zA5W8OWpf?yUa_?a?4DQZ-i$DM$A)KE=}s6B`DAnq3{j>r-R9JM1Iu84AI4_n z(ftNtSO>fpCn<-YoQ70R801#yR2WS6lFgUyWgwKWX~#dTk2-LMHI0Ihp-!KzA`?}6 zftEuMuZeCVij@_83SFx&tE|1ImbR6X7LFxah6{!wSFUs{&3bkI6BGN~eqGj3j|bd7 zXGV@Ii-X)uq;l5h3R=C-Y~kMZaDw?YDPAv@oBzV-&@+hbwiF|e_h}z+-gjX4Btf*J zylt|oq_hEwyuK#VZDg3b>Od2kqw7{Q2K%jqc!HM~)6g>oOs{z(Nier}9ucVG88^n` zb1QwXFDC#xRm8{_9E%G%LvOa*UXD@kM$0LtU213JPoYzXrE|YkGW51-=uOw1eJ9Yx 
zXJTqzN*uT;PI>>Ehj`TiEM6@!vvZn*J5|k@5B$@SO_xlRS;VJ9`wRIoT_@)JM^G2N zu5=1i#RPw?!0VwF#{0xgd*!PN?Fa5tXcIzIZT6yHxUZS%)M=FqCW#5jo{6Y6Ig?Ck z*E6+EG7DW#P$)b8xnXKyQDVEzqq28zzG3=sk6%Dutx~eGJ8}=E*2D7{QOd&#Ag!rO zU#m|M8RQ)3I)eMj=FyZVu70h8q$CEfa zCO;@4L=N*235Qn@jmwSB{h&cHl*%})>PH2vM;R(OPG%`=@E z4|6$}!gco@+Mf*l4cZ?%=KU=x^U%J{=*YYyqrrL-F{b)1rI3`=(Et{IZ0`eEjuQkm zD)~sjgC7(oHW6*sf!+E6*KV>wBDsyg6HCz0C=zKAobd5IqW{oO1BEU9sLY%9j_>7u zATpSTZ!xhv9DQgLYU8>EKC}~YK4(_E>lp#?2y!w~GK{~9WFS&yoWCr8Dfqf*9pP;l zb5zto)ArHmYnxgo$mtrS?PUf?tAwp1k0?4e9v>x$wJqEG{K%@~utkjtt113lX$#gc zk5Qf6_1TD$`C$94jisg05!RuV@wg>rsisK&x1NSq1?(8Ym&bmk4&Dz6l(xlLLiRe4 z<+vCO(p9$KX$1j*0q`sr|N3sw;qv`xKd#A#p8Z8jecj0gU-BjOon=_)XU^r}6ER@2 zLiFy3*|y4m)N?D++qdS`?%2q;}7rxhC5LHG*HyB58k~H1g*%XE16WPZ+_xXB>YZ-`mOm&jkPg zv)Nwa=Y?Y}_ zn$X+B7jpP;JuH3m9>p=b!?TeEQXbKGN-=d|eg!Y_1xpp;FStj!>y!>7X`;-LU}}^7 zub3b^EHPL^kU57?N*t5IVBceN*H)c!(Js2tmWeXq$?fe?ewZyD?AmE05S+isr@Gs)IBa-S!xa) z&1~6I>bBTveEX;hqmNF9^IXF?!>J=Mv|`(MTqhQckp2Os7dy^(QO@=OFETwyg$qhD z-0%RuBC6Ql-iA zRz2xX72(cze6Q>0ZmsanP@gY)l2EqHgX%dawUM>O!&;qdvimYQvt&;qsJbsg9Ududr^svb2i#OZ{8*IC!O)77eo^Yp;K0DOeS~{X} z#j&C!!gESX;~PujC!j*lEBo2;?`nRED!o1btDY-~sPkJ=k!69o7~|Tr-3MgOIC|4v z=e_SbQ7ssc&WR^)DRU-PBp;Dyt~@;HO?t!7(*Gm>KFucfrC^Zh)Ge*?DEa~=#qA+VWNut97fhgMg<64*m_OZ|RoeyanRnVNTVT!;L< zdhl_C&P7wxsqZz;#5S(J=k6Q*iiB5u_qrqL3If9H)Bud2qOyk6Pyp>5nK?$%$WN?) 
zruy3Coh6eMQVTp^b4`N$(LR;0c+x(WME;OiAeGn4L${ls$S~o3Oj6KfvG%o{ps&Hw zZdXP<5O-?m+4fCvAZ zqULeHJZ+orPMJI3L#6{zux1flefRfsg4D78h-n^Wh2m~R$p{BNYXX8__E8Jd%_#B* zt^zW40fC{XRYQKRBf_{KkD+iIuSW2C{Y;(su!-jRAu-_4yX#dt$@F8U=cw^=OgxQM zGIi{Ed`8RX7%2UO%{p_s8~_OT<;U>Kxetc)Y$FXlVQ*0nt0dc^FRZ*1 zyD@Pfd!N=mDO;$i(=$+qpq%XY>D@p|qJY+-vrPR^giM@wdhk1jNQ4Si!%c>$N6*V; zU}m<ZA5g)Sv0>Z_jZYNqgDazpP;soYSbR9bvHi{kvQRHl2JDkm?5g8F zckL{j(T5uC`$KW4m!n~ee9}GLQet>$797g8jXnP*Ou4)d9`(fFsusof8hwfzTtyt; zEAX=YxN1H;Y8%@9ysdWS?~iQLacTrm&sVWa8Vbvv z&wndZgi$DBH~s;r{f(UP4^M&Hp{#Jr;@}G?IsYZ71Sh-GHhL5qjOU|uer%TCb0)TrCkRudG5~kGh^z$!7%jO3 zOPnebF4qyHNKeLt(S51_d=0n*;r9G7M|8Ku>?cR^Z+y}})_I=?iuLJuH4?WM{qsHi z{cd(Ww;OM4zr}z}YXQxv{$fl=gR@XO6x2iB7cI1Fr?_ox7VCcI%(mi>n`%#vp6W<9 zm=?R_oXHKpIp*ziyc*FPqFZV6>P34QplD~yc=E=pk9XrVD!X8r%_0ca1$RS$4c6vA zQSHy+_ae1C`vnG%VQIhKIVjGdCzSrGRR8lZz~X}#AhogMNkX#h+6&y7Ov3IZ%#S|B z&VWWe*vhjTcTz}1s@d?XC2#W(UQzH6@ELuv*?sk_LuRJ z^{uUIR`jv>JWs+OLd_T_*U zgOc@v!V6@92TyRdb8em8)Lyyy?P34en7i45-0%%wtSv)~!z=CVu&pbj#-B+9V$%+*O5^ z!V<~)m7GG07SDcd&(|~3)PhgYp6mAd+G0Fw`ypSig@YyUa>S5v^24C7$Gz&tcRd_^ zWnVI6b` zP>6DF?{XD&OFUw+AEK#_qprvp!ALxGn$hH0`&J~5A8L1d<#xr#75tX*r%@ zO%|V;V^k7P>RO_OPSahNSUv7mKYpz0fX7IDJS;-e@l?(o&WZYpYD0H@U1XA-WU|OZ z9OFa=sRVC;Y~Vnd=QJ^<zrG0A~$<%6t=VPw zzJMsjOx&f(bf{Vukc#xAO}a+t*!tX_DI-pDU2IErZNsFM5nYlh03L7;s)mI#bjDN1o!AcEUjo3@xsGw0CC}w91id#m+U}N(MwWdTkCVZNTrpt zu42;$$~w9B?V2wqC1Dg|p=qxw(e)-$FZecGrLplD;UEM_ZQg2YD&Aa3Ey!-ys(btK zbg(mII2y^_;Mz(mV%d{Rv)JtEw6W6UX8DcY${C*H26PA}uXfTK#|#Fw)NYnVlz_NH z0U#?ylddo0T=Q>(P;-*&&E-L_f;M&S5@ zQia$Yj+0gYQg!*l^nLN-C8@1BslL~j^T@mXb?Y^MSs2KCOD6ValGMQY=9j#2Y;M_| zJm#w1WCdnE=}BGu4G#2mesh&Wo3uLCGHu084&aKND%Ah{e21{ca@|`BR8xRfA*ffQcTDO>dR+Gp%^5W`)Ef ztJE$@T0>g*HZPO1zyBI#M@>~#^CH@Ea=aqOYBv26OWvzc>5NNICci))XcQCCa~!~Z z66z)^fpO>aRzs)?w4=|FaJb)zotK~M*M`LNLWSWAu=;cn(kYH~w?l(LvcuXj4XbIb z#_;@A8Uw3sJTEf*wT>2bjB+X2{5j zLb5BPtdL~y5l(w#CrPqLMrO#$-utACY%)&cJW=-Eo73<0*7)?f?$32y_xJw&@jLG0 z`1MbY;`AP`@q9kkOHzON37`;)b;AbVk+kxeY;nwgN)^EE21p+X#xUQs{1whS<<`MH 
z3~vdku5mNpWNPdCz?-hOMWZ+NfwN#rca^EJ@2XEAQe2(lgRZ&IF$Vw&nSB#BjvcoY z>?k)8P)rU@UC-F9yq?DupU;a8UPiir`Hchztg7|jKOLfJ5XhT_zIC3&TJHOk_ndnx zn4%xWxjhy#(-X4Htd>d;+i(>*A~ADwIzll)>@mqV`fP%Xj^p?%unUi!9lsT<7^F$`eCyiB>@d&%YKU4^!K7-Y?5rEu#txY| zpKBW8Zi^wM#d4RXpj8f+t9td$+Cs-(2j58cwdV%esh0O1JeEOT2edWYqk}JFH=hT+ zb+G_S9Lv3_sJu!de&$Es?z8oBggHlK{CA6S56k`ZE2bhh@-aEy3aY2HanKJ+bfuT+ek^$F^+F%d)YH8a~Cy#e481A&KtB4#hZ~k5}sqN-AzP-ZOO#pKxJqTMDuQw@RZ4iuq=;9v`>vD1kWI0Og#( zlDBX*n4k|~>+_YAl0Czcu^cxp34z)4Vz~esz(2xoF*lzYe|WC*h}&ZlP&Y!anF>4% z=SfoVT+;Hzb#nM-vKM=EU^nEyDj3jPXUbRzuwc*hMOUWg*B9KWDt27ze;3ra!-}`z zJ21W3xKbL)!oe~Rv@GN@go4U@4D08opFm0Vu$Lb4Z+ePlWvuh#xD?+a8zuT4nYUF=H$RoU?gXPhzopYSSfh>4UKGCz!PwMVu-CNoQO%JBI zsEUwUs#e#$2_uQ8sTypZw-UlZ6(VpRPMgBaXWDy7ik;X1Sx;jp0+b7dgLK~`oL$=Y zeD|nk5~2yV!9NU#7+g(DOv@m!yVI%1O+LbEF)2gdTnsy5K4mnD%q25>e^Mp zrPLJc5q;v$z|=V4V{6^s*d9D$=DnawzqM!Z_VnYddg9M}DYh!1rKY_DwvUS9A@^3m zo1TbSmZp(I#j_`*77N}+a5T$1KxlPAFlFP~H=xT;!rQR#`~d~#_6X!P-^-3_#EVO& zio@$+YzhWnP?g1v7`Sb)9+m|t?CNY>6#o?Z$dwZ!D!m1G0_d3T2>OUtvGd)E;vs+I z*DbFYb)sKwodmjWKRcSqcQ>1YQ$FM@IrBg;6&{4hLogLUL}M@_jPf5So@4TrH3bq@ zaKbR8Oip!qk8=?Dw2oo@xlxcBXGzHno$~a6(h2CR!r%BDI6HU)<1ut|AROWD2ms*z zvZws+{%yOlnYCPTS~BX~d}p2)MQnjulQD#<@>LgbvWTB>JaVg70aPY@Lh}60>F@{1 z6VU>w9jf6p_S#uUT}_)WW;;v%!?^%DI4N+wm(MWTy_gw_?h@XK8v$xR6d+h&pZoIx zvLf+-+w@P3888W)NRt3|@lQ4bZ<-})#v8|wdCvV&A7JcRe_!uUyzN9kJfkWB6sf;S zsUL!RSnMZ=>fi;a&fn0$GXUo}oO}o%#Bcy--n<|^oeKCRHB%C1dHac`K!U)>2An6b z+~dWcCKSkX!SjPa`FFZ5f!Ur>aUQz^AgFQn-BvS+zxuyiXeTqU_515aKi3dnZ^M04*dGR z(7W8c-#i2a?EOT=asVXeT_6tFz<2+G*n?4Wzqq&r2-TP{0&&BvDyGPG=0E0Do*`i> z2QuWrS0DA*emuC@Kd^cu0Biwc)W1XKAQ*ekz-c}+-NkoX!DF37g3&kkn;jUm=YONY zC&nT`WZe5TNdB<%KWpZA0Fpf5oebfl5?@QFXRQ8an|v zDUhVAya12=X&3t~F8sGo@}I8d-?-NRNc@A280P=?4+bP&vC>y$2I7Rp+J8BYGWtCe z>@Na=zv_v=|9^Zj<0qT~;4gmo{{D~A>i;{vYtY)YHIrtl4RB?K|7GF&V;=r`Z$QZc zdAz{O0+u<`G14Df0WjLwn)aTEHT+J=;7V6wbd2DUspYv|{IiwzRj-vkZ(zhO!W!A?oY0j+H$|L{zqrTOo4<6ji; z4=Whtas^MG%{Q_%w>yRXWHv^Vj^FeNgw|V7R0FHh*E?Y4I|546e<$<@{1N1mqEw#- 
zs}|EAs}^`rf7?Z&|KfnbwpjdigmFksEdI_SQBQZqA2@4sT=S{N#KxVpD%pGMf1YWS zomB&Xv!2{Np{y23ICu)j=%a<4HwgOPTUng5YPgiL7%VbykE7(xXnw1Nh2au&6kmBr zJ;l1jh;E^**0avXHLNOHyGNW_m3q?)FAdw2fEg_4G+Mb*oRT_l^&fVHg8==cq0@w5Mee%y2a=x{yr0 zF2Q_a^0Xzow(~G@LqS%n6`ZUgVn5rR;qgGr1*8mE7407>fiPa`8@|bOR<(O+8n!K0 zQ|7~xsf-E~$%K_QgG+~X(frQqhAZ77;FP*UUZjRq&D1L*T5YG}ahHgVC+%yXY1zGe z$qKNmWFPP*VC|Z;!ul|4_fAdrG^*XO=-6`iq@#c;PzWB+{(D{ z+cSNaM0wsd4Ua#04+Azojg@cTyQqy;{nA%Wum>__vURAr4GwWw-M(*bJn`f+AV9y0 zHcx^cKf^#`2NK)Us9jU=Ezzw3=7eObd!}~h{9`7Rf-Z^L`YtBHa9gNu9X8%P&y1lI za3c*A(AJk&QhlwIRtAj=FZb%TeJy2et)DzE$W8u3@i;5~rsC2AMe_aGK>pEL?PuR` z%9ct;Dj^U!6T9r2=NnsV%=#n>ySuU{?F0GwvyZt+c76y@pX=V)lvt?uIX~gh9u0Z} zdB@_{F^B0aQWRk5+7WTm%-1-loZT9ah)t%_)a2=GLN10tF9#fbvGMhiwR&V_DhnPz z%qCkpIPLy9@{wEm`j*e6dpM!*;Nwqj2(saJnV$K?HH>RVzAqKPz%QZ2g6GNY(r4hk zun)Kc#NAedVBvvag*XKR%aGi3Dmf{$T^%w?s-l20?x3X~7nxv~OEWQe(kfxNSi>mo zK0i}~#z=ebnmhQtb`&Cpodw+E3=&nqw|h9R7r-DjHcmw^3M4;bR85+Dp|P7GIyLuw z4@C_)5F9$iI`JEvl`Ct>C*vP38|-qo0@huKX?XrhBH~ovVRJ zZ~^FRyY@A5Yy{}zHt60p?1n1$`AkHLTu6C}f6OrfriJWFM`$F0FoIBE(-46AyNXZ_ zz~bd$%Nbw^D-A-+B_OJZY$r?u(p)49;~hTyi**Vp3#Phn0096?8`QW_zN!}_13uIL zVNL(jfnSFU_RTZrA*0@pfC8Y@>d0WJrhechl$ps3ru(vWy0vTzM@> zP5_dJzw!*?^zT1~;CQIyS05h!kA25~Fvk5)AJnfW^Q!>+Q7k|-{_DyA79v0lS5^RZfY&_W_sAsOy#&>#L1e*J$`HUDq4 z+R(c4+s*??bpGFIKEOzdP`r0yZ2foon{zJ331HdY8xu%w51(s1B)Gmj9D|nq6vOs_G=WG45nT#xju3RU+KwL({cU$h8 zzdV!-0G!Xhe6Ub3v%`S;>_n@{`ARylu-HLF*I{9x!(r}gBcCQAutq;l{UbJ_64a={ z5WMpnDEXuM1G8hchX(kqCil-Wdl=E1y^=N*IjAS{7>_D9uBPTOe0_XvwcH$$n=2Je zgD$Z>HXHd>_d0ZYJ1+NsjDnINz$dc*tZjc3-hVAZ&>H|+Cz$dNcRnOOf=2PXG9`AI zf6aslJQe2U28uO@8z!ChKY^JdLeCP9x&o!#ohkAVhRe>Y_oM5ptMe!O`( z@Dc?61>AwY=LEZ9_t+wR824Yt6Yd!$26h;b&mBB{Q-AI64F(uSiIBwa-wq>fFK>0FWOmNt`xOZXVb_K(NUVg(>u)*+BdS z41X6iLH`Kh0d*_TB-;8wLh>KlsdQEg2qgJoUyULI@9sbLLBUk{K|iXCpf90NU_pUN zFDGl=+zShg5a1~vs|4l@Yp*FrCH=k(MUbBqq%rQq2G38jedGZjc7!^<$7;XZIDr}( zO&y@m0*bjmqH!QJrS-e20%qfh$nrm-8sK(D_!^TkO&B%g{@B0%yBhHM)uWRU)8R0z zC`CP$NJ#m}G-d%zN5TEs6x3^C26d0}=T5)W+joD37Gbr_f!GY-dIWhtKSPj!>#-0F 
z&XKfLFvvCJ05tm}h~$r;1#m+Gq@kLC`M?#DfAcRB^drmM3;vVjg`Cbmb;$n>08FXz zGSuxsIgKqrLYWNLlDR^+)anfHrmkGuX_YBGP%mB~$L^^p55>;(Zc5aa+V zob*+{8+xIpB&}6<`3fv*3zL=3x&XuGSzeeyklzrHjm$qAcnE}chmAscZ+Wgi z4Jz;=L+bUyJ&#>JelK~=A@v60xlUmS9_?@WfA#jS0;$>qFz>0>@>z5;%o=gk!B1^8 zhuxti@yfslnW)MUpEF`$wbulvaV(V9ue}P4w!J=%0#5$r5`?xL=-o;vg#~g@yqVJdSvsDEw#;;cOT^1+M zHfEQp9eK1D8B9OC{g4GNA@$S4CofW+-;gH9EW zt)&qJBr3^rMz+iOo~2My zMtyXL>T@b80&WD=E`E8W0kFyTG?$kKvVtQ0A(0zxd;fx5AC8tmuDjBp^f&kiPaAno zZ(0gZqkuHl&2gbG&H;NaHGa0MVGi<1(#2r=!lQA&pAvgx(**7g@0_=jEcV+EoUE7SiADI&0*qccX zs*q|wOgYQf{leezS4Wy;q#WVxFVi#>P04hk+6S?92{d-$Vkey;vbd4Zg8O-*ZNp;X z&K3_aU{XrS%Bo$yPsCYTL44Iw^L0-VICQdt;E16qR3oyoPyJl3h7RbQo$7JOcgGlpLB8R#!XnHx=qrDd#>@VuSQPJUKveok)Z)sl)&$!!goF(vkT z#mLI*6`wKHxF#o_^nM-4I1hx+=hPv>4@UsPx*+)W1oZm(1@-3maHg;@gFv1D;y%!n zy>kT`qxG^ws>Cm?RzM)1vbrHYcQ%C{auvR=I`jtRG%lI^2&Qoc>uoWglC?ei6bzeFBX{I z_4hIdV1n~`Ien0yKW@`*|5kvZ;KGBSW#g~F9ASo`*7qhqF#Vs_5xPGj@>)T#5*lEc|E6z)+zU7@ zeX%x%UsLaoh6d8#>L(JIJijtY3)=GYnSm9M1qG_jtXXw$p0uX;C)s)bT=D!!J~+Wo zcZ+`I9+YZz{7TNd5(4&{li0jcn>L>MaT!-#d2;IHCBrh?56^n;?k%7v>MOSpS59f<69kyacpoKDA|yTsx7G-UKlY ze@jA%{7FbayPot(h8X~3{|&cE!I*-D*n(7q6BYW+)J}1fH%&tTNKE6g2N*rN*Pt5w zoxcaN_5P&5esv5#x&z?Z0J~i=_=euepIg0L5!fGp`3&-EUKl15X9&)8=|A8eU$Y=c z*a8x}{1f7y2}3AcXnIeR6=V6(V%&b(+4wNjisLUn<`Z)~MP zj9y2!KPb_R@6AD9*o-800ymjmHF%BJDA~C+GPO8vPmW=_S!egTR_pohiOeKMLl$TL z%V#Ng7_;(#q4eBT^i1kk&aJ^D!a0J&P`vDoc0@IdqAPl#-AZ<%%s(^1mUM?10B6W& z+-|$8t=~(Yg2nWGBB!Z*ONGD+QSkv}d^~6XAuCHy(9!7nH6^QLS{qKyhXT_kaDXxL zU9vJr@QO|5EGDokh~Bfor`#l;PUG&XlY{nO3`V;g+KscC`@!u_=slg8VbLaBGWvh?%Rhds86E|NUtTE$}?e(Z;j6%?!r0eJyArfchMT&}*BBEc>2-bjRcYXP0BQN9;$ zfCl%5y~QdyutY=3A(z~JDKN!0^MS1AZg^`&{?Wi;nG*HE;e#54!lI0TG&i5p9I-|a zsl$D}yDqz&{f7!JH)X!v2u#JBse=^6K+mXc37j1p8(pRQ5p9#IGeUy{>*N}$K>Yb? 
zga1;G)Vv;P#Z|9KO}{s)oQu2u6&ip!2211hpS?F!Nz<)^iWk)$a{~n=d#G}o2@7%a zK{Exq7`v$eiW|rl=EOxy;C=L5&Jm|~|4>h=d#5*DK?mWmmb+ZzG`7`~5r5lZlh_Ew z^JGAen-Vsp+pp@7DyD^R{p|oa9`{ApK^RFp=XsHJ@-x(AZJZY>cWmM-4HfB;;|~#B zrp&B#IK#-BIRlT{(nR}Ra_#*UbcQvTjDCnrS8!FS7o#pp+7ZaCA% zGgCVvyJZLCpVKJs0oo%XK5uMi#`O79xJbz8L`pbY#w#T;d}&PZ;i=C_pJd=$ zQ{wY~tldHTTIK)^Xz-&UpnDu41=)TO)K?C?i+=f{l2hSW$67ek?as||F}B`Shi@`12NtZ# zX%p88_?MOiQFn<%zgKc`w+4=^N%OmHO6J-jmeuO*F^-%xq~HTI7o!h`JduzKCgqE<^EW~k6tG)>ycb2Pl4~i z$__J(+BDtX%Hc=c@tRW*nmQ-E=!qH|-Iv;C@7qBtnXioV3GLoQ*wux7U`fQkf-@Wo zUZKZyuCOuoPH+^>Mo_Hyk!!^qKYEam@~Z+yMZCkSiXlQf&$zX)k?EwwD(6J5Na2(% zgR+rrjVc)r_XmQ?clzJ{^@PB{8^m{u6CPOUv*UDx3+?DTbtBGgdkL0^+^uEzZ~%8J zGd-hPHfG_@24>~t<4qtIy2$hs5dzL?P@DXqS)=v{AA>Nd^w^tE!(5krW{?HUv-`|b2o{6{18nN#s97#N%RY01d?3H+Qcy?#u zMwioy$Ex4R(oAqArc7XNZWYuhSkv)%G6inDsKT<$)ZcJpq+b_s2*yX0tH=nou;O8_u861Z@q|sYVI;b--p5tE4 z#^n$Yvjjgz^Bz4@I|Sys(u%*|GLRj8O8uF->Ilq-E0`;L&C;O5Sy8 zvesh~fm#jyywo{triy!AF=*0R@1YutrU{aRDpHh4L7URX1W0}QUl>r%@TGh7Z@Rrd0_icDk@I3mY~c00qA zMW_t9+674t;jh9w-7tTB-bcniR*@fL-%H}2*L~50l7k^NxD9inTOkB=m|t`F-@dtt zNw77Y$t@Rb5SWV(qm_T?}knr~4JN(B#2$R~`Mlua{ zvx`XrcGKkmRjAUgap_OhDdFnqV%A7 z3YO=F3BiyXr>X_sDz^TfLi({80w3v#I9iUFkhh7oD)thyty(^lR!bvI9F`;jBl$&+Nt`eOeZ!$&EdmH8 z_)9mXF{KI;S`T5V7b2n0TrUlLFyL|VytE}8rlV%(HTgiIm;>5Qel3Fe`K0Hi$)q{A zIe^3LvX9$|W!(+ao&=u!m?aY7kVs#4Y$3R5E=(PI=HAagXD%m2b5scCA`#UR%v9JyE6Bb%GzHk%740qg~6vSv0 z1@@^7-t{&JaoGBB<3Cq1(`x&U_7j6U7_Li(}5WhtO-RX~e z4d%ScWRyZTDt{6IC1f>?JNgS>ulQ$K;I`B;qV+GImX^i>&N%=Ly~geidYW_s3d~y6 zInVX<74=|;AOV!+QdQ|Yzlc2DOu(NUqHjrmzek}>e*)NwY%!40l=@_jTHq9576^|x z&3KUT(G#m>9$P8N1LsuO0z%&PftSyR=VI%hlpK4C*g~pjOJW|d|8*~35-MI)jI`j4 ztL4`WK^d8scSFBngVd~+5|-V5bF;xk8P>ytQ`L_1eXQ4R-6*A^=D`L)4sP;8Z(PDG z6<@M32K~42OXWa24WN9!JcxmT)J{Ni^Ye@UgRui1E#$}oY=3E5;!}{zSZ{nvl~`QE ziXJOU`g`~2a2VFzpof7Z7GmK^4l8-Nb_-o2Oya28gC(K_9 z95|98c}0ipG*DB&k>fgq zJ7{fM{(cj`cY#t-@Rmk&V#^+bY=VJ^o`b)=lT7gDpv9l{kH!BqBl%^%HYjxgnlwh3 zEx(up&;S3u{9io>SV*MNpeNCS2$o`@0jtN9e7^*jRGu&bQqQ$;n`6 
zNdBsH&4dD)*9IRmWS6v{c|D4K=Lq8FTJQf^E3<3y3|ovD78#(L*hntfV1(+Hy3gHx z!v|;zS4&m%v|$5R#=LBGQ^3BzrvdYrqH;tzAS$A?U0R@AU`ueq5v(*cnQEo>AQwbO zq%43VN@G1m$1sy>2KD_V=D0;EA$ulDU?y%tUTEoF+1UGoDbbGz$1JcQ3&NhM2>O8) z^1fjNF>A;p@KjQ5Ilnf6jbmp#p5lm1%__EHdE}vxumUy_~+G z4!v#;ApU@;X<}>z96gaEVYMc_`MN~B?($+fI9Qcq5wS06A#l4I1fX}&Q6%A9O3~*& zBUqOd{Ro#b!b?<4UiM86!9lqGO*tcm$H9-LSiEVd*ixg>SA?ryb{s3sidTU|X6j`q zDFL|TD>MLV8@q&fcVFe4M_&8U1k{pL0iQKanEBErLbR~gB(9aNiq(6tLsUNmp>C`X zL27@CL(4#jAjl6gEXq>)W6@ywfF&R;vi^ku1+3Tj=vP?+Mj+#A2(a2rHHjM%~k>7k0?Hi*H5DuU`?g@BY=Wa+dJA23kzg9vzjkXQ?> z>$fzzFPE0z^(8TvjVJ>$UGZ7mJ$!FxkqFxfvG$}xi3ju{hZ0$1x8NOXb=29_jsRoH zYhB-)kt5OW%;e@rM5qC*h^H*5bwC@8-`tuh(L<+Re5v>%%GN6Txkm><{$KcU!pK8k zDRKYt!n!f=C+VEmCtH>W4T5trC4aD>Md~cIiI?E}aRPR3q}J`N=@H(~hX&s&wCPTO zsDN#krffI-8&?V|M`z8vCikdXK_yQ0lhFRF>wV`->rpNU4#q%D>d0BX{*WX3>==8e zr8%)Yp~j^$*T3v&Xz+2xaM>t&W@H&9Fr+jcf4fp#=Ir|TVv&LIw^vQT>EOB77Hc_H<7CH{ef^qu!M7etyXnI=Esw^M^^Z!J%=2f(hiF%g#}i@lNl?9g%vJUX#(d-L+u)7vJ|P^q#yM zT-EK3JZy^N9O{s-ados~ctvQfbGhn;hF;7S^ah{G?E{{VshSQO43@_$@e3mEYlT@= z@SS=%N;UX)7l)v2$ibYVYvo2>EdsSQo`^c*t3WE9n-x78@h9RGf>rXfS7gO%k>JJO zD+IxLkW%FzYQ+XiEl0G`a-_M{nsu4&nV>75eanxCM#heA_`A7nXPThMMl8QQR&#&L zcf=T)nb+yoFYI>w*)m44t-N5OxsqSUvp3#zc#j4joAka%#VBVy>Wh(NB}duVSL}tD zt`2+hM27+mM;{~9^1;foW%)er4uT&&zb){L)@zclzM@5IL9k!L&Fgv{&k3GPsojW10l zoNLEORdd2?1AVaa?Fs96ac`HM>td}kZg|-iU4?0^^fcG)ueT8l(<;O*-1AR$So5t( zSp=m=m&X&U6lDl)KKpL&%nWvk#4d~;!UdM+$hDo=IQWs&Hr;%d6`^v(jkKO&!XnoW zsHBnH8XPGyWF?xKMh4p3XeqUO+XS%kRat!a|otaT6Qh6-R?u_E4R3&RneLX2r z(Em3ijQt&Gj{v6PnhRREz&j#1bs#UtYx2%#<6pX@6n46p@C~roFM~*tsZyk^Oc{s- z9ey#8|Mt7{d8`)R)2LT@7-5NSi}%aKaeE^*Yy@aC)sXHDUl|HN_zLoK!2(mZ#?zBbQ#tP!9oxE1I^*FXti^(!k zf>~jU&iVLgd$sMlQwM#oTBLK6a#uN9US+4$I88{sqL|gdTcOE#feoV_@lqi^G5dIj zfmX43>oHL^CD*mH?O3^UzNtGmGWPYhWiQn69u-^~ayy{jDc4D~sf`V(pRXyD9z`D= z?75@2%Z}}$4!6y~*e~(kUK2Yw+7bB@X}$7oeJWRArRHGy0Ei=((JSaN(J&Z%CQ{8M z)pc%4wa6&#cxvE@L4bxwU!lhts|2+}agar|+uV$zv_kirb0>Lv@{9Oiqv5m2@NK|SwJ9p=RJB+pv)w+TfU0pqF1Q$V85lX6B}*ap)^9JP 
z7AtbPj`jqOA4GK2)nlW|9hxe3cxJZ`6BC`R3`5hwacS0U*@Z7O%ql)0>@AkmKD(#kr zYTQLN>6Et(&(QWbw0ux?eqE_RrdHkaY9U7IF*3?)MA_3JNW(mbmnTH1hmgARWiTK1 z9c&-xrW@ni10&0h@EVIZpqZz3f9cSb4aait5}461b1yC4KR05va$?1yr{}Tnh(&MUp>6MqQMdDa34f)i;lpPw-##PtsRhgVM|U+!UvA^mlwv0Z z3oJ91J&Vd+-m*}$o_^%IBGA*(o^%s555gWxHXanu#5O)q%zE7Ne}EnqKd9s$vRxK) zqR(by2iO9X7dW=6{)xC$L4TA?Vz^=*G=X)eKiO%73|%D+={x75KvRc*M#c?P>|EY##+ z<{FovZNBRaiU_qXL*y`a+(^1J*WPy6aYt12n9QkBj;JaSq1EWn@;E(ugOi`n{d*+U z@R$^1;@(IV(!oh&x--sRjEqpY9aeteXE_!RUqo;AwFR+EAx5L)>*o|uHe-V=IsA?h zRZoXWJOmc6cpp?^4$XKT9Z>OeVvp<`t*JRJSOs0vastqkR(OWkAMK3klOOXSUgKmQ zBw~$6f+)3rt~93ToIiDvAcq(7mMV>V5*r)BY7m;XfwAaWd%R7YWi5v_6&m_r3jQW+ zgHhM-JUcFbPZmG zFSX-y1J+qp?f5B=7%zSL2zV$*N==^`Jt_)DVeY^%Yvtj z$=FhqFq;5g3p+yVRLjmW`dc<9c_npWAx5ER*xQYqGW2p(*ytpi&bHC_8nrz5w_4A3 zCuDn+whg?+!@PH~qOEGV&^x%;b8(Fs6_K5%l*n|+U40wPYzg9gCK+Ndhn-OFiLSHX znh#w;?AEs8c998uY1cjBO3lo&;K9{AOjN(v`WIrE%2>;aUMZ!O%C>Eq;uKW|a7dk` zsYL(#9?~a|#MaBU$G#HIBii!p#qaU7De1Tzc~gv|dO5;5gpd3QEFHY@Ho>WeD!<{Mfc9LB z$I&HKCHEnUOgeMwg7%~jt}7t zudg8%!&h9>31f!ZYn|;z4T}WF1ZQ4dV1!SV%iI?)oOpFlFcm)oE9?2SBI>+MembZVMm*ix5F*-PFO&9nwN_fG58XXCJ_l`>`xZNqZPE z?|lDrF*D3IBJ7s$_M>HXiGH}bW7)Ms{5D|?xt4;HNB|=(+ zLtI9_9fAfN2`4}$Voy53EPung%;QLX@Y9poT3R#RZ0=J+9uKN)NT_uO&1Ib2p+_c$de^xGG}@pJs+1 zF3kEo+X@fGWt6EBkw|E2K(`L_6=DB4;`X&C%*yh|WhIFp1lx^&*sa4#C zn{BSS@dpauOO6BYy)E2ZlkZ|O-A3-Pw5qr2RQGollQ$V`9=*yt!c)j(5a>K&nmd#E zd#FMqRwxJyKkOo-Mjkm%%VS>lg=LI#`KNcq3ypLuvi=;YEMVvo_i|vi_)oL*bVx_! 
z-cxO(5ACnmn3D80xvjhKDud!T7CtxUzUt194OCFy#*I>2QJJSO z>5QbndcDzoKZftFI(u}*_4(?mX=yZLR#yjT++(b3b4 z9Le=Fw?$Xhr}O&0UF5gJsk|LdoDcHs+UWL%w-%y#j3O~JI4ES|<8>WpyA=Z>QWAZIy9e7G^kRh#501N6RBsx)8<9&#j)h0*xbc13WA8y?MSCvqT#e+^YBXKW zVfrp8;^}J`TI=iMLs55NUlugm?9#FBvDhEwdTEvO;Iy=Mv8h0%y`{Nk?|R?)0s1jX z@-5Ok^Y3`GF7t11QrN-h_7L?9XKzKlw)8_-_EcZ_nzrScG*Ce`abJs+Q8bew{V1pn zU+4^-+WbXklMVx~k~1-fA+7u|nTnbfk3t~iQ>yoQI^qOF|7?0%(fvydv5mO z6q{>e=3QVcbFQHMy8A)wIJmMPa5pWdr&sm8#)>#God9euA)05^;gLyk3A<{ZPZEJS z3uW_0714{ytR)fg;BDT(3AO#wg*w*Fefv|_-nRQY=~%py?DFu=G9pM?A&`@E#VUkT ze1aEIZtE>y4=m+No)!JIWd2w=X)>R`FvMvYAkYynR3U7uRHn-p+y|>APrjx~hy?IQQ$bF z-Um}B&pcGpTg)%u3#qn3h$trtyuSX>>^8#k(MW?i!LH-{g(&et(^{j2b8ZJ(o)PSd zeSNf%>Yqz4IiLb~eUuB+3fRH&iuDX9U!Li8WaTHUWOp~WC*;qvZFpolrcRw9iJloV zB)vJuvcN&^MQd&<<0|zewn97payG0QWA5wTL?-7_s2}3d3Iw4 z6^=`{8C%uo>EOqsvA3+p^KI^ynNUwvI(+=B!cZZak`sF~w`dEUL>=c)r%(TaCUf_Un6{BA1I3#*Wm2g)|Do zr7tF#VGZB4Jy%}(yqgSow)UL?lUXE2jl*W;ChbB%!l@9d5RMwJdZmxrnl0LX7d|C& z4-`soziD`~vv1F+-s6tUkhEcxTl87D^4@Kcs968V5VM6^XC>G$?h@~6xMj9h;`#5= zG<-G`2`1GgebQu7w)0^qZs$64rcw3bT3g-*3{x9M&mGHjJgOFz5%)I<)Mn8v=4{er z&kh~CnoPe=Xbe2)^d{1IdZlCSsE^uhyX_IX4xHRy3T{GU_f_;VY$>&;yY{bL{Q0%> zWfvcYWYZ0~_^?TrX6dOp;ng`jpu`X3z;09dH4MS7eljkDqb)1MYz-;}aN0%K>0K7P z3wEemPPL@G3b?cO^&6Oslz2Ksd=8=EOfqloY5Q4+$amfMl>NjidD_W6m#s5Vm=-ha zBjRIFc(c6zYOwbe@DtP~##Dt38U#<`MBQt>T6iz_=~9oJy>xu=j1szghdUxR;)#Y! 
z8kxbGrM7A>N+*fCIn&;iAMDvsGva;q6S3dD6esFLqh~+-pks_LCzg1hjF0!FgaWzr zvY50+vZ3Og{!{k(I`u*EuGgQg%Q70g+)&mUiEY06Kz*LdNzdKLhQ&Yj+19%)+*~pa zr7vP%ak4vv9Fc}3J~i&mmN(6wjMQmlhEq8cpRe|}E|b`h#C4R>EY>~cRrOgapb@`G z)l)3QEe!F|g5Q8+(#662vPn_7S93soz(By9W&P!DoePYKqMj|~egtIg=5d?w9S0m_cZVmzm17>jQ$Bem!J^39&t>x!w=gX=CsF$^wuW+Y!Cdd8|tF zUE1SSemSXUS@oW){m!yZi1ae;0pXn{*}0y#Np9#DjeH@83rWOHKALo%+5t2-&hX#4 z+mmTvj*V=*NJFk$nC?9(SVmDM6XLwn$uaRNfL0XqJM$)w*bNV?+gnfj;Wk_JTys^@ zarC_3*JcU|kobg69pi1Q@{#y{)#2~_sE_UAs+{K54FOSA2?9U3-QS8ZuRaKP>32!v zm{tJ;pP<4ePsf(&1)0DVKle9@bUniEBrQbNW-~lr_L%R-fHQjiuS~A1J-fR0P4&(3 zA(brNvA1UeX~c~-0n-JkFsGd+ z;td!(aujmpc5!I{6|wufh;lNj*2>l**~1b7#&p&p>@hfkf_hdz(orF<;^n(|v}KbG*u`?Y|cIdC!}cNgw@& z`c~<8=7p!X^tHqb8D5DCzQ-3@!2i7gzYRE5UdLso{&Utq6o$f?r^5>%k$LIAVMuJZ{Q!awEW% zEQW>VQ>oQhWygJGzgsbpD;xHTIFB$@>fYnCNIwp-sJRIWmj3)~K^&~A6xzQ}N$U$> z`_)oQEzKw^E2~LS9K@f(BgLfbanAkdjMKt-6}fR_BMx@!HoMeq8~<}0A;Y4)#&QWh z*skZ2Y&%*{f1%V(T195s6*w;$7dvMWVsYW>52UJPJV{B;Okuaw_xlv7-pV_VxSngg zH%@IYa2iWXA@NpV8Z7osU9s4rrnB99tun{x+m2aYy-KlaBC8K_j4=^%BR6@E?(1W` zceS@MVFOdt|FJ(gSM%T+6|H1AB2Se7c8h85Qo*A7DMqW&!j3)>`EaHZKUk#uhvkBM zPs+@OPVr``@`XHG4GDcxWl8dY_!i!&$p)3t=+N!n3!g+#Il_g;EoW`!Jq}P9*tW`y z3PAu{yYz73cEZ*{^`1ZX%bp%|^GUAgAN? 
zZM>=(v{gHm7J5%6ZzP`rtFAHha0($Ru8p@b)#jb`wylV^ncHEW>cvX?v*{T5a%(g_ z?ja9pEvtJ#(UZ7Ofb;RaXIRGgCAV4q{~_!xgW_PfWno-{yE`Po-95NN2oNkmg9mpf zfx#uXdvJHR!DWEp&cNXAdMEqc`+WCQednHk{Gp1Wo>{9`ukK!h#y;0{_F;#|f3k4S z7q;Y$n!l4MCca1VKfudA5p2<;XWQ=!jUj0^?D))}72$ch2G0nmzb5>&;*Ub(&JEwv zR9hvk1rd4jwOTGB$?v*MNjLb!n$_>Bh6n|94(siKmvW68({Xkvm?FW#O%9|UsUj+b z@H|@PBcUN{3e_eH>CLX&LL=@kZxno{`$ssajQA<^i+n@3#@8eoaj#K6Q$15kJoxWi z!Dg(sAxT0oX;{E%fhz#tr;X%77%qZxv{SI(uBc?WivU{GiIKn?9d`2VQ?gCrS0vOA zgx#47^HeW_;i-Njp7!bYcx>cLAwjO*w1#V3DN9XA7uu&LoxKzz8UniH%R2z%Ve%M) z!`qE`nhyK*>HzQjNHPSh`&S|m#mQr7~jyw z8HX+OZCRZgR5F_0b+!~8(~8h@zIgJIZ&_oyj%qf%VF}~>8i&sD_2%=NrzA|^%SFN+MGWAjXiPG^@=AgTM<2vnJ3)$G1* z98-bZ8$K^?$G4^r^&r6&aWTt*rKr0&b2~2=tBBSogN89rN-Fz1PX8TJHt}Q0 zadCqHcudnYO;oJ<)7rXuA%6>jW-K{k+{=z35QECnntI;Bh{~Kf(f2otZYWj-|`~22VyL%D-?5hwHS~GH$&0xj@T|6PLk|v;NR92j)h|MT<)jF` zO^S<*HImkGTu_m)s$8BPF; zTqMNg=rO&jix~Dta+PkdrwgXB8HI(QnnNqWh#;Sdfi~NgBH9<^teSZk#rKr)VBz|D5AFX7N1!oR}Y*zOBcNB|>w3t#C zF?6$+i0IjtOTs$YOFsBsn>m!2WsNvp?HDai zm_#x5awBhG-|9m23dXf@%VcjjqZgfy#e;OXTG>5U0o;PHnWJ!Wt2HR{YaE+ezpPh@ z7M1uod|RBEBUK_2bgToHV_cuN6l}Avx!EY!19?Jf-#yiBv910|jy~=GQx_UnXX_la zFma|xO37ayj=2GdxP z?}akE=_Pr}v`E{qvU-KYK1Fi0OkS#-?c}7U@cF|BIjnpr)bu0?HiXk6vhAK?_l9O0 zcbNz?jvI#lNynlFG!ZTFmjqlQbcwx--&lpDZ^l?LS9^@JRr1SBl<%L7EeESNDdNB< z6k;LAWcFDi3~G;p`BiuH5pbNwp5-P|x)AT3&4VNo3CM0uc4M0;ofuz_rP1PF^`Ecz zN+k?5zD&MI-#rx@>n)4D(=1jrU+D_;!SWBpi`RN6l3?g$sv5mL(xcP_0!sti2fN5! 
z2$WD{=k1p)a%?bFr4la?V(R3Di&GnUoL_KRjN1aUP4Q@%#G6ot#5{Ld5PXPOW3edo zYk9Dk8eT3QbuaHX$BD9Jt&=w6nh;U<4U+H~Q>hQ4&F@8HS1u{#lbq1Ez?iQ`*snE{ z8V~f@3Uf7Y{#<#Y>#fls_&xbtP0vuo37UGjyXYJbWEFZS$UMi{c2M5fbS1mFXvbHVYt2kaF0`@#VBmT{OY$P_ zn!7rRn$LHw7UlN>>F^U5iGWXXB{*R@`3vMpa!9Y}9U3-c71<-s>L|^SenUPWyOWi3 ztTXcKgmzHQT(WqA?g4Y7uJxjEPRWyq#ugZ*!z*w~P|QJgZNQjOHoaG;65^zf7((!eA$z4(lO7ix^B@DY+!m0@^T3?{0{`U$KJAjN_SW9VkfxqZRiGrHbg zDX;XNd!81;@Hlk38vJ2%&3iU3{{N(E{ocSHu*`E5V@}1aG~CpXmgM6NccMf zh2Xg$yjJ`Lcw|C%uWa|(TZEhSSGG86sIMk{L`s1*!3mo9<-{(+ObLsgR(!T+`g~QC zi!y(_j6}ng73Wtd68fW&>w@}bvp~;L0$o?k|2+QPZ9>-I_~`Zn_YWmM#zFT1%~8vZ zjaahg2|(_&gQpt%`NBnWMm9a&uo)ffyqI!u0++<6Sc76~83V>|>ywkf+{o1==dmHw z-}Cn*k9>wRTv6(4ukt17_`~tftY(C|;^sHmx)Z2fTTfzRxF0i1^!}ik#|AuOt5dif zw*qbmQ`nv_zP_VT_|-Zx$RN4Drq7d}6;HV<)Fmv!#9a~htN?r&0)MqqBGQ(CHvYW) zf2g5D*?*`ZVVFv-rT>9a-%GlL4(>-!oZ;43>kRTqWFojfG7}p*Tmim_A_VkR56`R! znSvp>tm{RG%^@YrhnkN_JqIO*qp>2c7VvC3ujg!s*r68|rnGS5%#kgaeAPT^J8mv7 zd;oQg7BmCrUb_KLD;Fi}SJEOr&vEjBx0ELoDLL8g0h7egz)f z`}*GV3V+!EaJh`l=2i^_aGaT%-#@B&Be??m(c`sfW8ZMJ#^UO6=u2Fre@-3h9MtI45^r=luXt;>g~JAK z*vXfAN&WbhL)=fo?UY{;=uZtrdapEi;!WjQJX!;$ZEt1hzaH=HbahytIIm&usHjQp z1i5-`cx_l6=?jenfUi79NMTq87n^K(++Od=MxgA0Uij?aZ^FjQNNTVwGN@cVGYjcy z@KdJs{GIS-TM&4G1QDQ2!Q+M__F}~)=3kcySXgV`MzLUB4BB;7%GDn&t#RHUt}mq) zMB3Wd(uUQhmV#xLKUr*H)hv8dN5Fr^Ef%17w^5CN?KqP%VB&9 z&O-bW^;8SrruB20Hi|6K7bK-+`F2;>Bp)l?y7w0l)b5SA?e9lmD|jiyBgKzybw~|U zzug*;yLWNm{`j$oI%lCuJ9Mp3$e`~=?3K~Wk$(xAPNMLRjls*=sp~07y>#5V(qmLJ z z$Aw5>_q4;lv|l5YCyrd5h~-|2LI{XoqfXQFsl|oBiQ{WI`Y1jsjQFr~{E%bdC2Zum zSRd+&if(B!HC~U|n;QzDUeY;0iac=00dDtaLuu~U<5FcWwBD;0^cs&pLe9GTF)K&x zdiB^#e0CDwGFhG6UG)DV=D@e_{z1%A0B-73?&{uR;Q0|f?x)V_0)XEtS77&rEH&zS zjUKO0nz_I|O&ePoNM3_Ha7+W8@j&1Q)0mc7epl zxJ_#}R6dw8TjeT81{c3$K<~!W8X^3!8O;Y4N@ie8LqbGMT`Uk21I76N@*9f$$Szwh z>mEl0feL<8za8$5r^_)fEm1sUS7!xqYG@V<{{3Yd&Q&h`yRT_C2Y=9^r##Tb?$7aD z_7@piS7#vuIEeW4gK>diE2ub4;&LtDH1yNZ?>L9ok#4W|@>!SK%mrUq!P9nY`QCgh zu$S&ZRm3+)CsPCIY-%KaiG7ArI~C=J%$V!7$Kt?LQRW)+hud<8Cq_vpJ9ry4d*jFk 
zQ}GX*UXvi+6}U2#c!!zejFK+|La9om#pk>4)~ll(6@Q)Hi3c6-;85$W3_K>qaB74j zT8j_^&%4(j z(r9%QkY8&;s_XhT-05$2fL%p?)9T1-`Ob)JQHdA)n2Mk~*@>Jf8d5s>9ApED{_=dM z7FVLqyB(mqmxxg8UeTQ;)29&&AyUs}_!P6JBH))fu}>>crXDUDIhb-xvCsy_!navL z2l#4|HH}4;5ji>ynm9-1MH&5c__-3tu+z{9Af>FW(ik7RY<;;7@Pp~#nnwP5Z4ej=QYSw`AJ!w<6NP+|=mhub z!L4ILO{BTC(a{{p2Ps=w*C3KNoD3T`!r5&vWq6_02FvSu&w8U)VriRcaw7`TMly53 zCoROK(}4ei_$?x#Stq7=?@AO0HV|qGtPB9_p-<(?&;@~E-VpPVLiQ{w z2T@MB1|o2kvwO^vYH)}yM<-o?I6y0*~G|;U-6BGNstBATg|$ zM$;Oxf|u-lIj!nKR9ufkO#H1`g^SVSgF9fqrJwFi1k5m80hpN)o<2Y=Ddhu`P$5Rk z9%A^QSwMyp+sR(&9*w%lyZrb0ZO!>q&~2aJJpz3h<)%(BjDemB7oeZQlzD#dYJaB2 z{nUuZdN#~eAtJ>6jPu1+PTWj0x*axbxB%7tYFoqZ>SUD+sw&WkeIlNhHCWS@bTXhc z6m44g)D6Xz<*zkJqOf-7MOApaVqST<0pN5Uc9L7CQe{u#OP|DzJ7fdF1}bgur;wR8 zoiAiPPL92wiRAgBH#K}UejBOv&L`>eKC!NR$#D5YDNfc@6?sVdA`%w8Tq=r;EnC(5 z6Of(jkzgd?|2-kF_TJisx6;_ApyApp_Bq-vEf_tti8Q%xR$6syI|kpXj!p@bR<2?& zCb8^lDlQ8kyp_CYc|NQ$S+IGlo)8T#rKpaFxIb^9&YtjcNufL9Ry}p_mx}Yl=-k&y zmoeq*=IfwNE0IAi{y59k1j8)wOS3gT^UYltuw}2spk;PZE3=uc3|5j31GEaZ+@ykT z@evk{55h;4Z`+WRmah_-c}dcQtuN4vi`zhzsLU28LeopL`rl~MF&jPJ+u^P;dB0Z6+V&9QLgrdT+E4L@$94&RxGlmEYa;_tT`aKq4J z2?K{!U6hH%DIX9vtyDW>$P4i{RbyI-DE{z)J6k0-Qc~7dKv>1~G(U9|Chpj;!{j)j zi{3Y}E+gyf;-#h!-F{oTiB&1xyv6mopt+mPKi_QHC`s+?=KJ1(=HTif=UGR6o|R5L~_vPyr|Kao9Wq$fA%p}@hOE6ir)7~M{ zl%lUWV3kn7s6ozOD^wTL#q61d<>B7yBhA}nF*G0539r#PU8@qN%#@AK8RJfDxgp2e zs97EHAxvq_5>9|!2VC_>68@Myul+k!xU&{u3G4ZiFa*P^J1!&Ng)SqfOqgBL{bb~r zs-zFTw?7aW+V=w`U1r|Bm3K>sU5b3av56*vMsM8iLPE!+GUyaEGVdiO6Pp2BN5sYr z^UyDH0&3VOQ^H4D*1vBT66LkZjfqZ2r+DZUT@mIqC?-B9%3c45j9 zGGce!xiA4mmYJ>SUhB(`6ipscQ0{I} zQeM!|{Xx@VciXSTXaD7-`!*GoBN8Yr-p|CZZ)+4l9K(hDl zw^0pCX%c%d;^-%m6qbf;hPqxsucOe({NeO(>k=rNHZ7Rh zQ$6Y95~}lWS6f{18`wTN|f3i7KwvSV^G4XzFy8c*izq*t?}xLfeG-4 zSb5WQF#HoBA7aOx&s7OeP=wn3gp#gK9kJ+7dr0QAlTd5`6T4O^I};TgH{+wT0Y)lO z2-Z6LYp1ZW22groNH@nRBwql!)hnkQPT}#Bs9Jhvnp2{@fswx{v=J1P^q<;?cX7S_ zLhr$=JITEjvb`4`1Qrmg1_`ht4uh~&vh$!~GOV{}?$mnm2bh$in2y~>?fCL?4#Kv} z9Yl46EW8qp0^1cG?H$$5ldxlk?c}&+EUpn1(Pj>73(q^ZoC__t0fCJzy70Q)yCbgY 
zZ>W8V<;9ef&tZC}Z4hQ1I?w7INk*f*_4>K#*}ht_YyGT`oKyXB@J7kl*z%X#&wA+E zntYgPGUJCpJUccVdWzpm$<3{V6GzXflQNBbQd$q(w;l2w;wqO@sbq)N@3B7YTy2xg zqETh{&s7g3u^B81CU~6BAKP|;aV)NR?8YsMU;IL_A6YOpxIz0Tt6kJ=CtkeomwAZf znm(w+qzk26FnDG&I%M^u6HLN5z4G{ZG@9Btuh9Y324I6c_Jg7Mfyl%8QFN`=Ke*bh zvZm@;A?(JBb!L2|0B<3+%A?=C3rA+k#w*@u97)_+?QHpN1oULXS5!~}A?)Wr8Ygx# z_wH{{nUf6b;emW(SdTV}KM@CfXLD`b~)lSaqJ7mHTFVG!wLZKzB zVu2>R1_6mSyzCN#_eQBGe0Na;%Z(L^<4{ z>N|^6k?9`jG~a80Rn$r~LM{>Xy*=6q%49zdy8;5j5r*m3EDXWRMHpptf?=R>?d;>w z8BI&iCHL~Wr=KMb?XEX!XDU=*@|qIv&A+VoI#jl@%~Bk#FZv#;H}{~+XEwx^__Bz% zNpEyG>{E#}Z5Ew6%{cY-Ptsh6IVV|}Fy@`T>mgY#Xi8D_WPs9qtETzTeCW3F2;0~p{f^vl;krDr=)XR_rH@rGBrxpw%Z$F z=8Z;Ho%?kTf9Ga-YWQR=$+)|YdgyfR<$+3^4OZBbM>keW>nB6STc}d-oVfmyG}%H$ zN(hP+;lH452=CjoPuTe?9@E}sqyIoT)H%9d;9})N_kiDmVWl5U<3FysBJ4NCyZ`#^ z^hxtpYenCQ4V48#SN#3jE9@|Kw_)?^;~EMEw-DovYQI<7u*uFT^h+J~b&BBcBMgLN zi_OT!*4>Kt)onWGdqt8bZ}`BQ z7kYy^+U`v|xgybV4PG(mwgD1pdJoC#=t61IA)bs`?BnHrYfvrcLq zcIcYTZ#$>vnJyvHqf#B|ykFN4|Y;@&3X>6#7 zxM7xE?ctJdDTE?v@)=3LJ$)4JoHX;Ul*u_CLyS<#?iUrcJCTB4VK8d@(zznw0Zs{A zn-7v!{^hZjv3Ni0;o_a$CnJ=9i3skumNh_^Cpt5(Fd@;B_GGkq%WhQj^E%BYhi+2x zzXqy*iOT-N9R4FJi#8kWcGCV&yZS3A`%CR_cP*ir^89fH1`bC}L-*RsZ{**a_67;s z2;(?hFq$gs!mU01+OgPE89ID1Tr*Hf5+T8LM!LQ(n^%b(?szQRW&Gx|Mt^NlpES#= zaEFb3XPjxH98*Spzlo+;V1VKv!Ll8dTL+oQSKi+mN@kA07F z$A!~%z65@#G5Ha<#7i0 zH20G66O1HE!Yt!{f3Dt9oYLkVHOjLCGrsgCEk+LF4Ox|#nYbkkTU{Ugk@)<|v%5Q1 zGGkfewkrEl(G`8s4x2Pc&i`Z))xCEt?Dlp2CAj%RanMS}NIHMO=U-pOn?BY!&Afhv zY=H-%F&7OrH{d*U!q#8^gfXiv6`{cGvKEunh%$*LC$8T`PD2 zF}^BO{HNk-=RgDcAd0NGL~Hv_ER8W@C*gYUIcuc5nK0aU`1g%`)c2njFg_k#6gu*b z^3p_`m8cl(@1HiGDr(WSxRkl8kTEr~_fFgglgOVk?v?x?OcEqXt@!K0~UmAdoTnD#$rApj_Q3ByYq1rle6M{|r!xWvMEWQ#0(Jq}%28 zy23O%(L`qEJK|Mu`jW-+Tb(ur12-n4H6yx4y?vnt^QYQp;LvpXx6ftts|Hwe!Q{fG zylj`xa!a$FD7${S2o+qMCHiI7jq_fx4Q2eNgVDZ&`^C~@`U4QF1K#284Z@QJ6#d|jqVsCsy<>F#tuJ4t6`s5Lv=*0H{PVr`4wvOHm$^v%^`g>N+F3Ox8q z`&QmB<^4m-U$t>1bR{Bwcw6RpsmN2?K4IAElgjob8?{-L-Iq`2I}77UvqoD2 z?33;w$}V{5@`UQ%S=e_JLhi#1fK5Q;_2m)4L}mTL>3 
z4SDjePR%iPiS0yxQr#ze@f&kBc&_}HGvOotKZ?dos7~U4Q#8JsC>i7k^PP4>wNO!g zb*{;CZ$Fk}^OY|Jq~cl9QX9h@@K<5^&xPV{v^$Q=vZS>QNz|rcSmzOoi^P!dAwd_L zoZY%#Y9%ZK#}=I|r1#MZZ6EuMy6!gPhQ*;dkEH!g|{&4B;gsFo|^;2OmF# zD9axy@qHpUwO(b7b~_CZRB*U0aKrQ~u?u6q5;6a4@X~LcCZwjSujCFVjV){K&6`9x}!V;<~2H^?{Bj zTnD&BzQJoQOt4_arnu*b^33FH$@BL>`yXT(v7jt|p!IyIMfY9sf#)1u7FxXdFZLSX z`6dSUcehE>pxN|xRos-%LnT}gei(k|O;Scx&lxZ~4xv*bZ|r(VGr-(;GH z|AsHdF6)6+75AqXlbTiK@t900Sqh8el;9|D-`xH>-q;$4od*+om3vbZwZL6%{W<1e z1=4DS9}|xs8VUL*ASL8#M^5uD+lMlKUqVde`Ag>+XUN-I!x`05oLExs7lq}cQAeK6qSCl5(XrT zJx~myCOcawg&L{-k-iyw!QpxlF`;CfFpSyR7NhRY1iZWC-*|Kv&XpP#!Fe1CZ}wk* z^1;Wjes=* zlX>4K^4c$u<>u;6PUe~xPtX@&nB_@x-8~dY4AUZFs1p)77jiM&e@-UGvzUYqNq?4=iqZ@HamjXW8u{g zwQw`}!AbtWrz4FvYnj%t%QQZ-yowJR=#fcY6L_AOxO%K?c7Pd5&ab6_UbfNB@UqVa zAj?VP04xLK72)ImuAgWKZgS%PU48an z?ylFHB$)b{3<^=xr#DC7BQVcX&PtI@I!dfa`6~xLn+7plvhO{OC%1TjDwX)^Y&353 zUv&q)N)o>D%`+~)hjaaxnjoZph3BJSZ>%&vW-g&}SIC@v_PggUU#}V`5yWAv5wEoC zUfF1JweMblc>oqBg;3B$`}r2s`0^3i5%?^)7Gm|M7BzFEg@$c&vu|)Jk?1a?n#$Om|?jXPECg?6u4gDZ+}WTYt6@lE+On25ZEpnWD1-~kyV_1 z+6EipuuxkMF8L&yq_o~1BDLv2pFr1Ghd$;K_V~i#5U?MKO2yW1Bcv_GPOp>Su%Io< zq*fqQeNCX7$mop&Z<@a}w|(S({iK0_oW;0;$rZicAGxQR#fTpNP-OagDu<;MWeU47 z&F_f3-bpgj`Hc8_BtxR~+#0yD6^SV_p+NGLW~e(22p3y1x|87TQO>>F-mY~A>M;<> zJE?K)8Z;Dd^3Mc3-_U-!8O=DYaq(X<6@B+ONGP8)zjwWix!G~bM2x*$;i^r)iD3;p zyahCDb>lZ;khh7`$k4}J1{}g_G!LkAJN@EDx}M)8g&aB$HFy~u;_6yi?!jvtTwiy6WQK2{iXITX6g%NCP# zJ+gth?x3&^-7ZJ%NATe&8UWEV5A*bn&RUL{G>g?`}+pK00j%R z2(=vFXckDnV`7pWE_w3uG7-iVc5V@64AFKFi2EUcjdS6GM8%{BaR@S$B)FxXcU{?m zJO*AYPvuVI6i|in{gDtnb9aB9aVp$RmsP<0FN4xk@W0t3od4Y+ckbX|Xz_Hi)e=z? 
z#i^&mW++)YTQb#p=qqYYpGRI{sXpbIs3353JN&zCO-WJKmKH~JDKvFLq;ABL)~ z`)=2-JP zVcc7YZ0Ow-+-|jj)0)EfTZP?Sh{I(3P-livhC4T!8qD|3_7;^#0^;Re8-Q`!Q0kMh z|79J(G5H1cEb*kU1}GOYMo}P<{kq@m*}f{p#_4 z&JeG9?X*~uaSOT>0={uH>VVzF!NtYW6Ntn@AlX zlqW1c?gpWwO7rKfKN@P#8{er|zz%Cdbb4oEa4$L^O`P%BvHfPAlQqye?B_ni$4_#8 zdbl($+QiR2Ozv2wq1L~%`Rzy?^MIGITRvdv&?uca?Row@B^X&H)=jpYLA{Yqe%dj+ z5?~3?%It7q7Q43!hn-7zr{tv8+vp*0O|fKI>HnOpb4R($4j)$-gImVRISc2{5bwQH z^z0(){-MaFgjV7rg5vPQn`-5Ff$v}zBVQDG`}KmLz@5s>9f89ps$RKgMzW@h*vU4z z05jNKxH-d!qc;0xNuRM!%h43lU@`BxE35q_Q4dJEqeqFe-FCd&@oDraTKsBmUSsb6 zWhaIjkqDr=2dD?hhzYtVwAP)=gbQ2+L7i|(fjg!D@evE4(vLR0R|>~}r%a*mt@i4v z)E)Uj($m}f^>V_$E;I<-g|lnyp}Q51>+9~odK z`9q8pIK(iT92qb)txY6v|EGn3%jxHZAut?Go|4xCpX>W4PpH|G8r?>F;D?OlMR2}H zRHmL5tqBD}Ns6B-HVy{Es4KN;{V-$h1)zx=ZP!$eE_xaob6 z|C$6>gBZ__U^Jj_94V>U8uNNPMol}mwe_|KqDcO)2*2Fp4lbTkeG-eu-5uW{4-KN5~AmS zSCXVn6KxJCo7l+4trA@gxQ^+-khlL_hEH$5)Ve~?^J@HBf+Y}77i{Jk9dL%_pAn+% zV!&=1(gfq~NH2#pTw5h@h#)lD!b)=4={QLtK2Dlx93x$HRWjx|;&<{HpT12=!A|a9 zI{SaA?w|m3Bm3VYW7PjVGJZ#s$3#W#f!-9@&skQhp2lTR{8jDFgQkxbpHCex?T&MM zh6?dfodWVXi4eAYU+IK}44VkF{ zW{$shhqFUL`R9(@SB#h^!r!(k538fK3P}@Vd-;DH3#uLb37=$(qY*^K+i*F;2+8x!L1PphAr|kN+w1I{9N%S2(SzW}GbF z|u-<`Iv0jpSRFN5c$n>H2NgX$)o3bp#GywChDQZ9G#kZe&8z@>or41`Lm! 
zpz0B8rK=Ywo>p9hCX<4z33#R!QKSm8Uwtw`c7X%ieqM_oWV>FUKeV6Q!Z({#pH9KD zaOkelVca#7HJ*w_&wCtY9b4zH9^#r6EU(d(|nF;Pr(vjS%B;PE(%a(_00yMq&bh~5-c;jq8$W*M=RaC_+sK-8`I9|#9 zce}6szhe>*3;*9S3D8Z=;kkpILcw+KBDzX}>wW2tM!^T1u`HhD0B};KejfGq5B`eP z5OJi}z{B!ndi$;Jh`_hH_0*(|wqav>++<&l4yppc$LURUyqW2m+>63BO)WUg_O1Y6 zl4*gt!FLfM#q29+HOrSf`%3M(Q|gK^EKQ@{XdVH?q4w+vVj$)*Ls&a`4u@j7twjE4 zGCuo(Jekp`;3oUbgP}~}Vn+le{2M4)4b8`py0O*c@W!m#JQfCZ8lvMfe;)c^%q0K4 z>*sF1x7Ta&AXJ2GAr#rC%T%XZBPG9oXhU09h`Iosbq~I27ug3;0Uex34dX}P0!Zr}h+7?4Pe>zi9wz=5&hwgwnG#HF< zcd|NBZtO#ujg?}t#(m|POOjN+`a@&plT3ga&hg-;Gls}Zrvp>i=v_iKqv_=4%-zB9 z&CqzW4eao`LOn0tZ}aSRCU)tm2JH5GTe>m*n#|)0aE*`&TYh=dsbF*s!f+Rk?(4;1 zHV*b8ufX(NjTRB-K`0TY=bG13w;Qt0bQEZL5ktOmVOLsfR6siI%nZXbt9un{NFnC- z)&3=Mq4H93??^mpwNF@1e7i{}S=*er^8{gA4mURy=UDyqd2}5kJ*q(;(_~_5ZI{#i zIiGsbP|;fWzdxPVf`8<|HXr_XIq;LdKH0o&KNyq{m&L1(c)@t95RJ%9v;l=^zct&? zE(pd#^FxVW?;br0F4WAo%H18-qmI)X^9>T6fTfuudN=FUxud>bw{d9T+Zgo^zj4*05s4TgcM%yKu01X3uTujZ_q3<7A2o+ zVg$%BBPxOHy*5keZ29GL^Qa0Oj3W**a_Cgtk*PJ^`X;Z4#Sy)7QU3jYu~5#Z@lyFT zo?$vkr?;7RhjVuQK5$NfI2d~KdU-VoQ=8>V%}^kFt*DNeQT+G)jJHK^=mF*2nLhY6 zs8ohvU(P!w7G#Jnu=;j4(C5?$zp(Hoc)OOx^h8#y)v%M( zW0AC>zz?#Rt%5HYYnO!Hqizkzxwpo4k@5udX0UIiy9-gfmV$ApTci`h_pU!ahZPT< zyHg1e6p{|tp5nBYcTw`-1xog$bf+A5MKmV)Vr4f6I3P{-@08IHtHH1-okgYZK7HaM z^EQTLt7ej6Bqsups&LF2hghqc0)CBLI3|dq4lLS4kx(A z^7SBu^V2r|VB$sSO;@Syw}iSG0*)ri79mvv;ma;o!5cR1o5K(aaVPQy(pjmU&xxDK zOOwr!hSV*lx6yx73Is6%hy5pB@Dk)y_#XM72?X!73ZI%(Ip?DDsqZDN5PKEzf3x7q ziDtV-3A@?;K-*Cyd+Qw(y$*FUb2a=FcDD|OUhb*+d1eRv7&EZTx(qt}13^iC=J0(m z*aj0IJ6u{|#4@@Hd{1(-9LtEi#e`vK$+j+SDB3lU~3pL2*5BDwqxrU-e~n4^_9uiM3UH`A2Lki$7EQJ^zakkgL<1o8nH*{qT!P9b&Zn z(e(i@8A>)&Uv^HrMty`ZK4`l0T{2vG~`# z3MP=<(_%nLdCdxLlr=uTy_F-HajQ>ew=ira@_K*UdNfn>M9sKY<*h9|Gi6s^xQ`Ba`>b-R|(cr+gp6n+CzHMTc?FETgBG z+k9^J5=Ah65`urk%IpCLR<)Qpjk+Ta+sA2;{mZ8O|2?#ze@~A8erW%%Y?Z$)&i1-S zdXN^ADXzDz>6nj$!!j#l$XGzU(y-$vz+V~4pSu|GTL3oR1d2US0*f(+>4-RlEwJ{$)Mk`{Hb5 zJo(2w)#HOA%FAJ`VouMCH+Oe;8>yB;J1Dpg5v^G{0?O1WhIOqRV`w7 z5-r5Pv7n{LrYJK(SOX-G9ij0S%RPOrIrQ~dXQfr@FOd<9O 
z&5cR~AO;1`CrA6BT0au7+3L2T3jtPkJ-*C^g$&e$cg|?T+%+gPQAs-TiV`nQ5y&+F zcXm%^r}uu7?|suFLKJF=jW$cjGD2T+6_V*>kCtwuJ=T*Z<6!zKS>Cr!TWAT?a2@S* zVOF9_u3ER5(4uyV^U)h`39mfWVs>;;VzqQ8a7A0AEX^EXwb~k9_6=#El*>0XYQk7(6v1zaJ;%Pmp22x! zl_efYkfsdhS$ZJj$Y<+!N)+I(lb0XN$yHz0D}9I3=$Jpdd<%S%xgc{tfQI~?2vr7w zkrNG|es!Q)ip7`OCmt4-$XM!J#~%dmQ>YBv>UnEGTXTtC{kYdqW4*Z0tbYBskm^8!szacaV~4D2622Iyll)ye<9&Ii*Cf+zg3#QC{z2x4RaF_$W2V(i`nH1>a$f# zYomlB{*bP0EE;#sj5d$N?|iv{&9za|v5QNG2NwykIrF3)K8oQeFP~u9KT5=*4cB*2 zSEz*uKhrvnl)uC`N+3X6rPY5FAPb0E)?=huHR~`fDK_9xv)GS7QG+Wb{@-Agt77``L{v!ZVK%yg_Tx)hObaoFFAfG`d@Ov|DdpL@qe32NTIU7 z|JjDk!ai}iJKhIPWf{i+gT-p5h(L z;vyrOFqG)9xQ-Pzwm({_#T$Jw_{kzzoALm}HDw)jm3^%ITstzWH1Uf8uM`@W(g4j3 z`x`uU4#bVi>}5javYd?U9YCj(lfa~prgJrc?if7)_RF2<5Rc<0SN1ib3rYqh0#oG% zDw+$0qebU{>TBpY$Rv}s(h{4Xv4@-A>A3v~-hobhzQHiOTCidL`C$;3vm-ceb}M5t zou8k}d<0v=f95)4@Se@TjUnjq?dqS+hIrpNW=l-qcS{aD>V9+I)C9U0`?y2lNDc{@ z%f6&qsAPHYvd?coJ9mHf6@FlZusq43bUsYvVt6wmCr0N)*ONkp7`MIBovuayP_P4m;61=d%`lX z`7+6Y8@t^o@r@ zpHMtkTuTWqQ*m>+8a*i~DcuD2kn5+DLBdu)!at{QfrUk1aDlwC?v9FNJv}`cUwsp5 z5;kLLgdlVfT2QGI_g|0D5;jp{zd5jg3U02Dy+m`h3)+t%;AlU2GAC0y#@I;;I}~|B z6|JI3Kf90aSJIUldM0_H!b>PN8;G@xR|4_NI3n`j> zXX5+@rC5eOQL>dzx*kLR_0j!}A8=O(DC`iBkVtP5SoCW5+~7Hnfq9$#qmn7q_39HLzT2+}*g^u0mVm|-(#{Pj-I*%Th=FGHd9-(da873%ai zIE;tuV8QqmoiIkK;y) zNRsD{C@KG{WO-sfs4g%E1I=B-rGo%HMl{~m( zHudN1Db$&0kmC9Z;-GNgUe^<_(TszOlML5bKhg?0*bO}Y%PWqRV2oCX*}ZS+ybAJM z$Nj<7fBM`s0{j#@Sm%5KJr$g-bF(xD(5Lj6s!Vz>=+pIy$ksx^DH~(0V10g_!PXB4 z2VecyHbTFGND@APcWFMfF_8W>Z~+xvsO6+B`%NcWu>De?pgO4F!~S_TX5@n+&FqNL z&AB<^+)*Mx|AggQIztLxQT@dIyZW-=3kp-S%f4J|k^+C^Qs%G}RefkPPOc zuS?ANJu|HGza|M(9u&==DJ2RLo*r%jmiZni!?9guQw!2*g|5;6WVcoXUtnzs+xIz_ z2C(WPNN+U-OjOex_HVZ-O#e-N$p5e3>Q7VP?@8!h>ieqTO**M4 z*55CIrysFx72A1#2Ns$WV43}|(;%B-em#CZ5TQe-LH>88@&AeS|8LGz!=JFJ1rPH< zTxL`f_0SdisiM1@z+kQ$!?I$wR}k=tWwpt=cD4nb)~HG1-v4)S`A@vr z1$$)8S^wP8IEVIlGpeWcpZ@ja%6RT?4|w#;1OBUH5;NZzI6lw?v5a=a!~OrK7yWbI zPRff;RbchqR+#HDv^v6$<|W2S+2A3&s7~9A^Js;N=qq#oRiqHS_#JM_{x@fBK;h>9 
zfE!Su4Y65(st5P3>doI;-@i|j{P%nc2>GEQ2OYN12?7=RG{+q`8D*bKmH&GM^yI;2 z0_D}v^F(w+xz*J9cY8gXr*w35un+f3S5sQgI}d1e0H^D*wp5uk9mOvdlptV5|JRpi zd3mA-jzY1*jvMu3W2c45t1fTf>{mH%o-bKk#=??3iQqPTq&h@>TBXt{eX>2IQh%WwK!+R$0tzQ&S?!I!DoA#=44K#sq}iN0wxZcFY~Vs z3hU&>^+si$JRu-rguMZm0*W7OYbk{fB_9DWzN|?-&|;Qf_x-rZE{9gp zZP|&S&Q~kDJhumSma~wR!W-Hn>{|A3AMGBrQqI{ zXc1tpgLpiZ(eOZ$qR-Sam~Bls$L{hGjB-a9Jx2m%tp~Z~liyIVMONQ@c2)&Ul{G65 z*q)Sw4TrR1%WYM@@+l7>vd?BWhYrD^LPN4@T}ppQLnwbO{h1r2u&O3Z4+RraUiE+G z1~?}1q2tEIj zj{c>{`C~;B$<*BSST6{Spb~@PyRw)#!uS zb8v-O54aMHm8wPVa6sQhNwdQdHRiQnU;h9@QD4u=UN)^nW%R4v&xjr#Wf}4{= zL)5tJe|v+Dv9*m)lPcfEen8xPFHA)jQO$I;a+d$3=Bab_i4DF%XJXGBZ#VJtE1$CK z1Bo-cznaR9E2v(JPCmZ{PHd7cXLC;if%&T(Iy%D9(s#NgZ)zr*H6=k~z$i)TN~aF4 zgr5xutx_K4aH08Pq|>!g<9`Du7AY^KT8cLup44xZ#Evg-R_`iCxlqM!Wa9{i(1?2k zKa#=3#H0uC7zU@8j@ok_Rz$hRTyMI(Oc#`2)}E#Asli|YJ4O@6 z>+sM~bZ>ZS0{t~8WZoK_ZUH3=ihY#TJn>!vgVJCQTe{BUY-_2E^#UAHn!Yb9$jY|-)etTX%+0HR7uHY zs+gP&R)Oo;v|_g$?$0Ewa?8p3;}GxZL;ok3r&}lqb#d8%+--D0uJZx~9+wCFcG8Bqnk$-1z%S=maT*k6(SJL(Gx4-`U0Wv3uc^dZT?N!0R6ocOPUzZ!m+B6l!gI?S3!@1=>w`T9dVg&ei$jN}H zA7AVUjX@^(K_>Nw?krapgSJHU$-tU{te17wb8Ca^W98ONTh|8fYI#>gF46C~uwJJ; z5_h$_WTUoDmIyvl+VVOq~V1%mQ zu}(DP7(>V6bV7T<%@fb$V?@XJK*B`k=_j>L`UVcf_wTVeg3f2z-+xw~9#D0PmZpoj zy9{Yelq*ivo<2V_8?HL2do`pju#-r)p^{uV5JKaqEl>JB(j6B6?XB)jp3B_y^~`mS z8ZBL;_7oFTIu)%UWIUY~6O{v0L+f`1W-{`t`mjDMRVMb^T-*72Xn@2#jib}O%Ik}E z@WT_Ca5|(@^YdK)tb81?DUJ{Y7aeX~oVQ*q-MU!5E?@%_AIe=tY&eba4&BJ1+E2bX zt@B(x-%h?5$Q|FxIBlZB|JvZ~|5fMWMAfpR?d68+6jEgyV}I4tZt5gKb{uIO`<3Hjo948yZPWAOROMzN%)_1` z!Z4Boes|l%y_fd&E`4Lb{Gic38AAjUwG5-Y$WGc8Elhb&d{tW zID_msK#+?LYr`|(-qIz9jnSXXkCUdw>~{UfKQ?%eZ@1ELPz)zs^RLHdNt`|}Ztm9F z{&vmLq>JwG{IF(fDth~1%&O`~fU{me zzDuOMO>}!VQ-gmInVoRo{?&K3^TP^N?UA+6iwBMZ=8KyZJe^6?q2k-_olftrZLReX z2wuC2?6SHTqZ%olv^Z}T78)KpH#^(9P=6Ie~)wuXspLoTExpkRDlNs~;h`Swn z&q33i=%#bnDrl;J`(+a-_|4;1Cu^!+e|lP`s_oo-wxU|M!!pDbPDlF04v zy^~$ogZ$b%QvxQb#7ZJPg=`F_9BuQS$W~7~O*Wh5ZaV*a?@O{|riu#bm3c*Q@dMxN z&Jj|z#?4!5QqsDg>}@+-KfpT(Ta3#=o{NY(mTkYSiR|cESif`~*-f*|5cBA&UDwHsGsPs4 
z5iN%i!}E_Svpk=DA7~?2$i=LI_ue7#gITISC@s2TcjLC7t&kc0V`g#Oq#QO5?)tY^ z7TGcrBT8?B!ILenD#6!IjG^H)ZKgRy0K*fPNFBn>hcQL_k^o8Ks7+KxuLJQknlYAl>mu<$H9Rtwn7!=OuCLER`u zb7=>ZBs=<|$BKB3$wIh~DaaHNj?Y~OSY@>hN?52n=9Zlo^U_3zA3xApAPa^CV{F7U z#VzoZMqp;F2g*9v@U9Tu!MGdVF#LElvpU#xr_gX(c!wTCq@JFcDz}o;Kgqg+&YwnwQmp?J9=1^_*adcz;&Aq}jWUv8>WfV1hU< zV)O;(!NLf*y~DD4TgLJHc&W-83%`;{KISXP*sk=>iyiOrhH^DL>GyJ7fn#!kDeO_+$phJ{u5YQZRzXdv32tFlVf$!9rr>?NfmN~v zTZt&8B)12iM&03EX#4V($Sn@-HEoY$Lwq{h`w8OAkK?8VvqXAEVyx)CH~(+whkT3&gpJ-r(oo0A0(cARu>6$MucZY+S)F5SbvY@a0FDs z9u6W!(%)qa{0grj2L4(>s(LK%Ligt&Kio#ivRm_{{12&5WP{HnbC+ofx$9Hk|A@cX z!oQdnAehU6arQ&&&Ku;vrj?#q-Or>T)U7Y&r&ebEUY{5}G*-!DC$cuAr?`XW$6nYY zk?;n`roBE9L_L3q_5H;xcJg&SCm(A3q~MC-^M^Eng-b_n)CQ!~DOTYa_#S#lA7|4S zAvUDcMV_3FZ_Ef!uq@=?y>I2+0`8}6@CJ*mC8k|o>r1qE$imTNKqxP`)mUf9rS4s* zY)z%AYZtPODEeTJfrUvZ&rrgODPjcvolX9mrjf$kwxht2d*LKxw?;J%F5y_*+b-xH zB*J%SnYNqLp3RYCaKqeHQ!AWmp>T?WjJXgCOr6D=@`El~a zi13!4Vvbpqq$H+0uI!^=9$$jlHToc-LTZ&z9po&DM>RbL5!t6Zt8ZB|DCLqDd`KA` zIy&#STv42jlYT{QKyd5*_C%^|SUiOk(fF&HSN#|i0eH8$oanvzL)1x&j%I!6EE4!k zqZmShPdDqPB5#2e%pCFsw|jrCn(?F1>`D@K@9AqZc_u5-sh`=}AwF!|A*$XgV%QOc z8zA2D=&|^~XB_^Q&Qbm{>uT%ncx0Xzhn|~MH@PgbfseLLH?h53AbR?hd;>96kafP+ zb>8tKj;j%%k`~!p^%?2xTxIumE*5mo(UZ$*DJhJ+1~(9BACn{XaN*Ug zBFxS7gb30L$&JQmfBTRyXRsFvKWn)*Tk2RV%>_=&JV5Ojz&Wk&$0hN~I*{ z*1O4JF*qs3ZAfd{FD>auh)6$}E+<5wL9&!#{O-`R#bDOfy_Ma1Vif|5ahDVUC}eu8B?WwPBt03L45PmTQ)UXU z=WfE36A|5_9olu9jeCpzTy#&lZoj^4Q{<FltIr} za7SN>53-4cA8avlSPe{|*xgah+v6uG8~^ywdt4RV+WVOpS;)v~7!>_diksE%nIfY% z-mYzNCe;z;u3B(zJ{?0Lx_!N_wf+sd7$XY>-c^P^*0cf{EnWjsIOQS}SiW2H_le+lD8BUi(TNT0dkMu|yDHJ8XaSB!g2zyGZw8C-_3t z3UoeVOiduUt2VvfC1ktkR7X`?O@BPlgXbV?v%E=OYl2tjl5;T_FyEbF0qr~(=k*3)CPZN|zh%xacMu?W++&Twpe#f}h49=!^Qf`#_}`i5OSzqXjvdP!P3_e@c;P1l z^g8dwPxe=rhRPvak*%74P}T>)&-j3<&N@KidzA^>&=v>vVYfgVWGKi(8Z_R$&Y?M~ z$C{^5fLx?O&)q`Grk-*8_@iVIGcpg^KSnSKkIwa)2n@aCYwyjZiq>eLFatdnr3L+QsPM*?syr^W$ncLO(J_A^ z>BHxveISmNzKizK(q)N|yr_S!6ku?EO>Fp!lE}7?>uPR!b;$m4muKz9O(Al|0w{7y 
zRz>gtGr#<`Roz9@QYqk+uTjwya!y$|vJJ%bGK;~mj&X#H%vD4FIMCEtIDdDO9)}#R z`E5|*#dooc|6v=0fd2vH5MVp70ZQ&v7~#E{@4NePnWTDSwDdc1t6&x<0c@@1TSeP- zldC>}v|^#jJ&7loeFn1L->EO;gD-TeOUjE%yDag0-qr}2%MSNKse}hFEC#;(m1L*i z`WhhOGgI=7tNHd4?`%wd<;W5lHZpi3t+LDdXQ>^)A|MkzM@=(JLm#d^YFSe9JW(ls z*@!9d^4n#{=9{WVA+*c>ywJ9f__OV06JSPjw<_OncN%f*aZ_9g6nv7TC#P%B+z!HC zwe%wceIa;aHO|kLXl}uSjq2Ub`L4fo_+Yxw`h8!73hw{dQs=+4<&Rle%*NRbG-fO#U2t^Q|Aq5`5Lqw2JBS`}pxDO!D@^RCe$NbAQcse*o2<-P?ff5YCE zT%LL8jaku?DtkI4^U{wO*7Xlp%r~HmI391r7hiMOoLpDpk)WL`VW#W^q{`B|DkPY<(TNy@g2^a`*cTqc65TxiGi{(A(pj*yo!J^2CO%K)a1WNf!yP}2M z!7xRQ?7eFvY9TPi^Pocti>)3gdXd5tOEgSg1SH~nY5yvbk(enT2Cy`> z7jw7^gamICd7QZj7)mIo@be#j_9DLnUsx5AHNVt z0E}zgCB(mA3oG_hL)O|SV=WaTgovms`aW8kA#J0t_=1$<>yDZ^kW^;62*<;peXSwn zeC?aJ8wDQ$;gVtR6JBRHz%4sZW7V4FPmP!&gqS2r%}xfqFtEUlz~UPR8xyPGh=3Yb zM^B<_Z-v%HWuS5n72+t-R@PQ2Yn%AbpI$KLbhX0X)Efx%qf^sY=xK;H&fQ z;&DM;%0TY|k;s#2gv1-IbyG@dfXnvU^K|Mw#Hu+R7Yr4g=@j_@ivAn*(G(FO7Tg1E zUVnji$NSX17k136dy$UxBwyfL`GiUuPf_k?|BT)l=xjDv#E%0Lxb*$1<|E{hEuTQX zt|Yv6SlCLvdw;Gt`OJJ3VK|u`4UE?IclOE&G&o+L$x6Izgq!TLlBHlZc{i4jF|!M# zox=7Evf-jigI&n%w)7_}}nI zV*+}+iqJ=lbJZ1_wWrl8$lc_NYJ5*MImYkt8Lr$Q&KdlEW7)|71G@<6N{v*&}Me)T>ds?;alHwsCYLB*x1pk7uB))zN70cr}>cs{u zpLfAqU6uuyl2sdztneL4CS9jRyM*V9Du-N_mCb9Pll zwjp`&L3Sl=LwBXPS&GXn#uk&D7s)*R*@p(l?fMjw-uwU;1wYpUwb_~E?;Q#yXe-ru`fNU@+VeQ998_}xTDAEYbmhClBU%V~Cz)|#{-D|DzFAl2;?PCV3_7r4z z$?aEdZC{+M&AGyCW~p|LfX^NzkVZtVmQT853xn1m<7@k*qL@@o18t~&(P;kVjE@;1 zFeQrIYTWpeowt)9_4+aN=&{EHqCuTM;-S3^!NxWU_~V~yL%Et2z1BCcnPIOD;jn+L zKWm^tdfjzvjs)&RaBqhX2c8)9mhXFUm)NQRPETkgi;@G)>2PaAxk}xu0q})r6{rFs zCsO?)(QO9-B_!|#x?JbbHteng1_X8B?;p?PNX8sYB4|(&&emt0=2j zN$eNy@eaeLBaTZ!$T|L=zd5{D<(EmJx*qKB$BaA&7mHTsEn1Ef_6q)GMuc*0W9@z4 z9q82$P*o5FICo%-v9aMlfpxOcHe+$Kj}+ z-gO+emSr_)Pei%u8?ZjV>Ok)R{csx7#O~*zp2!SdyibJ)8 z1z+3HBYJ1hOR${z<~y0}gkXyO%BGUR+ph!FyIz(6QSK4q>6Ad(8p};Hh;F}5uJrSb zOG?7)CG$!Ob$W5z#FDTDX-qXvCvso%Y%HQoCAE$w&9WpCGE%e>g_r^@Y#HaQAQ>Uf 
z7}GJ()nFblJBgmCVT5%+gg$2gjO)^LxGwWHab4SNB39}cy?gG1E%mZv!wR=}*k)cDv<b(D=$!Pid6z;JB76JHu8q1G!t{F*FM?|Xw|Yx_s`4jL{g z0y9DW+dbk%%puTA8!(*Q8sjzV{fx8zR#f%-fr1J@9@&w|Fj~7?JVa1(aI2-xjmu0K zDeQ60Hw~YFt_L~Z6jqtbU=+mUdvkrSrY1&kafs=L*nCW`XKV?}Q3jhAdqA)o0|MbFws?R)CI6&~fBjXX(81Jbq$ z8V~xSnPw85HTQshb6vbTQ*8rXSH|IJ75|aAe5An=P4V){xyPtYlxGHaJ`;mcAEu#RAKvb(w@QXW zeIhEcD1Q!;r?d|_Q4`hNflOAwh9PSy+w5(BiO?}1+`Hz%Sj(RMR=^0SVqK3(tVkQ$ zi^VWfSx*g-*Q7&zFP-Z#sQha~toJgTy)Uw(O-R~_cv#HZJ}ML_=u@7NFFe2$=bF<7 z?08(MW(TGV%9R}M*M5w{+e2;9j+a0QaUbkS3GPbiwUt)4>2mq@P!>-Qe#Y^;Dx=J&m!5OHV#H@Ky_RXA ze`_2HKM%!OnsqhE-lke&^gBh`d|V?}c(*5cbJN7U&QD;507^ZdkV%V_AI5 z;r>&8(nuF$bZw+qw6Iwthm7JfqU{sJ!f6S=iCFIGliW8hKssVu)-@nT@%JvRU2 z0gU!S2$EyO{Gr&mOaSjQ^OO+-jp^Y#F0>W4Y9hT5&rId#tPxP>@Pn{prNE?9!8WvF;2bh6_siB-{dk-uRB<7S(MMinvMEuxT0t6|)KR%5^m*`>QSfReZK$N9A*dYr{9X+pP`8(trMOE+0OwgJOD25n1;hkhL3kZ+n4+921Zx(PX=>7tK zY8ODIwNp`~Wc;)VxuH$YW*RksJl;KXFK%nZzE0M9oki?BVuDM*#p%*@*%vjOHv8?7 z8I=2CU9JFy`0(CH@|zt9hqG!s9QAGrq4%!kIc#X@S94`r0*Q=pAYH(6S0ZM`PWzn+ zz$&gB%%o9r`8-9Tot7Y!VWsIK@F@kSl?2i!qJtP#8n9^gC?11bqLwn0k~i$Z@BD+e zPQ5B+^OS*KiKx&=ENp$dX49R+`OZK#Skw&thm5;a%LOb+PTzV+3MuldHtu_tVzUgW zwUWZe%e1=Pxmc8V+f3GMg#kEiK~;$;dp=f!_@IB=6XMnewisyc)K%Vf9%BjlRRGpM z|DS;(_TNnu2rAM>#B%6tXuEAo>u6uC2IS9iAK6&B)nO(-xZ_}BGm2^N){JtI&rZ*i zcp1i1=2vIwppw3kk7~IMms=&jD18+ze OHww-r+`UMcIn8MM0+dr1v?>fZ& z+lDYH?zAg(>4hAzKuu!#bp}&?1QWDDCMZpIf$?OPWmluuQ1HLBl)e>q5oVmEG0~<#3h<86%TefQ5-Q?Y^+3M z-?#ybS^h&yC`-|+4?Yolnf;oD!$E36v>g~QCzJ`^A>b`;mUIQ@))5|!=SimL3Qp4$ zovz-7Kn8nTD7?YcbT4-TuDQYjT6V8t9uz1}v{_XbRH98NpF~yAEN(HN#ehu$HDq+z zn{O2XI_GN>0pe(QJ1n)h?`XhKzKsPhS+KGJFg%*YC!f48FXfZbRZRIX(e6s2vVx8`z6j5<9V~P znw2FOiHRuaE0mJOEOkG)Dl*2WXR2YKu>rLO2wE>u9KkyYMoF}beVvO^;oT0Ni4cLw zIOjsZ82pDix8tWLlh{^CK8ac#%1^G7tpI)INyV2`??fdpmZ$3KE@vTFC_T*QJK|Yo z?}7%?oEk9#Tl?tB?#HshKW714rx9CAW!o0WtpuzkZH{dhC|%H%P(4wIdw`Y>hScKn zFj%DZD3ThIf7JNZF!D)Gu$pHXAwn)FN;?R2P6hh`*RV#T9%z7!){MrY3;WQ?7iOB@ z7s13TYI@WJ`loGBUfS~mcIce2oD9du~2!b(WscU?a-M$kp8NAgvnPk>bTtijvfxI^^&QR-Ug 
z{2LHVK`R?7Ft=__-(jTJ9E>8NDUcBY7ObiNNAF>PR;QQ{Wd>bGpC;5l$3=^}{L>^6=I7%I)IL;FgjHm6K4>yo z*5r|@ijO~T3BnN@Yzz^ik1OcBc0U28u_p(YvM7<%NS1~UL!CAZ=o}Q{I2)38*h-)6 z)`9hi1GMpLRgd z5=6q`AndTVMIlhQ3kD=Mn3a;urFN=l7vQiKx+${yFBBRxf2V&~8gnem$CBMWL^m}0UB++LUMs|RNoh5L+ zA^J(4I&o0=a5pQ;f#AQ4Nq_*ob2|vd=&%;PSr8WdA!I(x>8ZqsQvew4Nzb~NqFL%k ziCM9lHh~w^9MyrKYKOEluTd05LNxkzvElbybq(lPYZXgLt4ck^WS@FIwCyomHSxF& zaT$c*Ie;z{u*o+AVT&8RkeqP?x!dIk1Od*>nU+Fu+`x7OAHOb>%d=0aXV}Ixq4KvJ zfUF7-MR#9xL+ftL6hVALt$gN*o+M?U!d(aY3Li!4K!bp+e2@KX5ASvRA+04ad>y?q zsSjvui+CiUyB~hqdj;4=NpCm95MvM`qoFo7I$%IR-yGV}fOgE8Q8i{~qL+_h^~5d$ z#lRLiP>4KY^uFS5hKOL=0FBh0o`5X@TvE=DDO21Hg-AdFpcu+8KTbY1vPdd6=39bK zRso-P3nV2AJ#5WjN$>@E(%fiIh|+jLM;2|4lqlB?%R+Xi`~HCHsS1~A;7oORJ^2Fg(@+rqnW#T}fU(PBuuz?)lEZ_w zBKeV2{BV##B%aTC%-N7FU*+et!6OjVaocXTY>#3K?L%l)+Ob(F!UgEn>u3IiJ@9EX zzo;EeK$QK5+QIZf^)L64D?8%Z;#`xok>G=*9S7BoE6nmLT9O~r$?OZf$Q4uMNc7Z;bMutk?>(Q}c+kj15|cXOc@?J@rGX-*h~$^XhUimHbp~RV zhbw@t9w^%t96`_Tn7|p)S*YgQ>iCK^yBbn0RFf=(!7}U;r_B~`=sZnzc^4aUE0|ER zx!7yA=sWwaV^ow?VP8RU<^0LZ^-0fn z(+#b`->T*`<0x$qBp_OF>$5bYBh3MjPC;#IY9&w`3pv#1;J$4^?jdgoF1(2(&ef@088-WytZm*lZ(EB5^rhq z$;C95_;TZ3yJN3-p~=#`Q|Xgg)Aad=&Rat7!=(o-#+ zy}Nm590(51u(!sn>O`Hhe?ah^2y%5O>F@@qAn`hUA^7uRaJj=*J$0Ry2}MAGRcNIY zBSHq5I{~HG)pb;b37I&R&#edOLr;Xy&s+#C41O*|LU9!8k9`3J&{p=)jtnv?)o>G_z`tM)8k;=E3IJ_+R%OH#7x+N;6}V zE%#Zqpb5x8RGwILj)YHDVagJSEhT1u{gmpSzZneiWNbs}t$%Dfnh#;sfS+A-+m zu4O2G7=0~?nOrCg!V$TXE;09}0R47jdD#zBux-?5UxB!ldJ7&rQ3q0$RpWzT-7edk zKS0D==>)1I+UW_<8AOk0?`xX*uvg*2?SwD!sYmW9F!Us3=wG2BCk$##s{FD3pH$^X zQ}Js=-~xzlSt7uOk3L_Mau@~sZ%VD8r6j(UTd4cp?!nWhrZY_E#82`18oo$mdd zQ+f_G6=@2j<@9VKco+aS2x)^(*RAHDh%_q++Pb}7`RkX+u!!B``}hesmXcia?wWgN z+BD0*hajA`)65l57Ve@Ea~|g%1EuBuQD+dPE`y}~20ZL(0Gu!(R+ea*X8O_-{d8)^leO5y ze-slU?4N)p+*@yJ6Bf{Jq(Y$83;9Ik+mDtO?azH#^Y@|M7-$3=Xp*7=EKuT z?g#2Lw-c(Az|LM(q6;JC`exO{I#9cA1s0MSA_-6g{vB?3N41Zo1@rtfK6!8XO za7~9t{~kQ(JE21f3ZTrIN(fIxvRt%&)c_LU)1y}LBxJ?@K*0QS*3oE~A7z@S?GN;x zz5uaXk|vUQDOQF0#msn@1UR_2oU6|c#_|AISD)EB)GBr`wfj^(Nn!$0827SH&O(tU 
zc2qw~HVNqq()PZ1VV#YGh5Qf7ydOcM9#g#zp$uX?@tvN7Y+^aMjm>Aw3&Y$r2ugw_ zpD_Z4R^iT^QRO%^mjC%PDb$RM@?79p^|o)l2FvG&i|oU9bl;S0iV7z$23UF^GC45 zIEMzb>AXSGCG>>{+p~=A@qkHuN+KeJoyfBGhd-HuN8WL{wBkliT~4_Xzk3`bO~WGeeQ8`OL_zx?>|*zA@*yQPDQvUANYy>|#E z5CjCTWIAWz=wGfO=wlzQU7jZ;i(+VrNll)Ah-rp#g4N?&#BA_{#QgrKUq!`8p+D}4 z8g53LE_F~Ln|kBJ#Eim`S2O9#p*bL_PPiXVOfJ3`uu9681p|NFhxUUC%VVUU2p$(w z>5W|(<82Ykpe*Fxs+-Qy7yB&&&goGfYe^JL52Isg83b>)X4gXdt1U6j^Qo@21ZNI! zg20wZSKAF>r3d0(a=|p}sgxym&T`0-wGBN7!MS$EYm%CV3Q^w2oU5a8o1lT+a&3KV z(!+lAG6f^1ib1&E;LncqD26Z(dz}hq57TKc4hl~URc1qlW!6ElfzgH7I%Im-1D`1y zK)moRog2C3m$qlo0iK&k3)vPtLi&Q@Eb6E@ea}v*wKV)?t`zr&B)`y~1&`5?O@gDq z2kt)MFT|PnH2u)Hks$DA0ZJ2J@I*$jLqau`G7wjrjo1%6|E@>HTwHkVJXbZKcIH4e z@Q%M2BbljLaFjFd*sX)!hnt=ugK|&#n+?RIqw;=IIsZ+|Evyh5UCuepAg*yfNHjg| zRrHSQSwhx#aMZ+EmdgoI(AV8g;V!`BbXwU3tm{WFs1+Fm3b@fl^c{G%ut{|neoK8G zo1Y_}3-*p99+`OudeEyD%bHDgPq15^!yELjjd?2wsUyCL^m~`}uwn(Vt%b(nT>=O@ z0zTZDUwiW9MS-ptB;y!QlZ$-XMRhl$pf7>Rs=f7^3-{t~TYC`h)q9lq*m(>4j}yA& zwSeM#mPl4Bhw87WbFVXs;)nN@+_h3YO{N=8j(}wDHRz4*QtTEK)_)~C%TtGvOp6;P+bPsgZ490--F>>62I7yuQQW z%Lkkf`XX>Xp>2wRo?8%G@M7GRav3d7DNZN-tM~E91W!%Jpq~hZdolQ^n1dr{4y%H$ zTC`2c31a;8BrOsoeXXI&d?Lh#sggLx`XlZN^TGoe+zP+h58|G+W5vc{`u<{7Iq^VF z|52e{%*ZQH**DlI(Kgy1_;g}yiCM?nFC&!?wj{(}QZk(tsgp%J?L4OOo|cD+Eiydi z@yqrtI|wg5FX|r5eIkfsK~+Gj+TT66{NBoKj>2Q|2)QJleDX{_7nFmgCrZP%1bAGE z3m}aGoo4R6lG3I|i5FNR1%EBmfIbSoXGgeq39={5W4=JY?QprLF`wgVAa2TdKvNne zXg?kl`r%R-!Mm#$6H$pp5vx#QPHD;i@CGE(B^Sd7Pq9_+->AX9*mU26 z_{5|`;|b_CQHjf?R^0BdwYW|Z8bZ?HsU8U$Y4qL>hT365g`IJ8EQ{eX+6E8Y-k>%+ zd+>`B7;Xmn`3vMqa7A2*J2Y(&EOHDlWYBY~FEPu(U3_ZLWIo=@>hFiICfQQBm2EXy zJtAvmkQ5IDNK^gVS;TxRZ{3i1T<7j-3wC`kaO)M6WPX(dz z0f??cM51p2rRrA_ZKnp3yL5W$X)k#tJ{HP<2(J|k`=T%l7~o(+>$^9SI1IdX0K|H; z^NDxN8x{kLO5)uL_^Ub!@E;@h|C60>!yiuoQ60_K+!y61eeTOYd7CVYrMR02aU^do z1tNc!f$x)U02L$(^ZbA~W zZJ3rzOc(Fc5O`Yg&}bbhn#-!_y);7R4n5K&fg##}9RJ5aK{2Rd{yk72LWUOJ#!<5u zk$glKR&jwl3yn}KE_?=G2*lTT>832879hVCcIEaicV}xmmF>>hD{;QmYBOKkKwu;w zzZjAEkCC|H$WgQW3RxQWel1YrdI-FZcg`iFCtIoq+K2^AT+-=0K(K92dq4hN@Byg# 
z3))g{>56sJM?|u6z>L}7)VrPS51j>i<-4z&&B$lZgE|9`Oj;10STCXFxw|pHiat z1@J0MfI_i9b;^DT5G6yYd5j@ROo;-dEx``G1gcnO*+#T_!umY+6Kd~-ExZuLH@g>5 ztDs5`sCIf*Sg$i#BudJK>jeksXKk$vTpf=fumVv>R-I3>cJNgKplPcmFYdpdep81Z zibpiQ1ZS$28?EGIrY#U`RwXlHdA|J?4$8PpsO$xc52i5Mq_}OzY7Qq?B^f;C8EE9} z|B~U6;^uFLG^kf~_#leX<5t2Y3XI*RmfIW+noU?RIf66I{!B69>9f9K}}U| zHpE_cH6?k{Wi{6l9D7n!#Fo_C@4MObk=}s8(Yz~BNX5q)`0R~7;InG~^x4N})ZmD> z=Wk<|WU_#>s&}M9y3i@>j*0h)9)2*eAhfv;-$8vR##j;vtgpW<-U6KH^{FSqSf*7# zlj7NYJwe^cQp#m8X7O3ua;FSq@gO6vGhpU@6%wY+rEK0x{e&nfWPIjh?DqhTGYykKXMAAJq&r z839F%p>Bx*#7#7jqCKW%y=w6Kf+%e}!K(J>t{^|mhlSw=llXoFs`(S2pm}WLIrbTH#N?;1Tp$f?UAtp_H$(sB) z=7r_9j}5)(^KE({e){DvzZt%Ffd3xl_jnZplc6ueG}!6d>4PO64f@1C%P*t7eO!0WBW~w$ceE zq(P?U@|c&3@WU5`dT@#wr6`he4xHJzJr0X8i3&_}5u2X(B(GVNmf`W(*RcO(TAY!X zIzS9HGzc0f53u$ZA*0-S1<5{;_>$FQU$y8v1&}97CAMtiUpIhUPqFC}e^0EwrQ^@0|q)Anhmg1Q10mG8zdp zs;%c=V&|tc5FBAqh@WS#T&A}g^L?5Da_U{{w>Mg?l!Mg#8I2fvc)CR9h+J0zw`3(s z7s?Ah0_3Pa+{E2&UH2a%K+yJ!af2ui!Z}$*M$@A=D3NE{U-J3RbP*$OuFCd+gDD>; zh-KkHe(isdyAX8!qDL@xFkf5gX^!|uK1{$eM#&JIwe+f7`Z zY7Ps@&WrEQ-=2D$ogBM1eJda6G$lm?jp;uV1Y!x#XKJ>-;fSU2&IMUFV9$dQhRQ6J z%#*5jbYi3e>MH3wexHqEDdfsle0}2g; zdR>l@CH+m#c)?3=sBUDTQZoaL#+1^e;u#>yeZD|dyufM#$VbqCq3|FZ&{yoL-YM7s z?+yC55Sfk|{nAH(#^GZWN${s>#LyC!2>XBd`tES7`}c1W%3imfoe|mDGcvL&4I?9D z%gzpE@0G|(M5V~i-djeB>^-vg9?$uqyWjUWj_05Ia96ml&-EVX`C8}qRn%DoP*lu{ z)!mR{>JgmjOj*f!8!zrwBsmq^U=4Ju<={=d0_x&RR7N4!ibtxuw%2w~5h>}IbScTXoWa!mr-~mg$yBxJg za)Q16YoHvDoV(fs_;Pq4*XIR0?T_w)!-suD@g%_0%Mr7?5iG0->{j#!xzP571$6R^ zU#(=g8Ejxk%FeYx*AGv&|2eXJTaTL8lQz{TCC1La!;L1usJ9Pco3x88Z6Erdi*G+1 z;fN1;GR8-}-Ta!PpinlOqN?=J$ocf7p#EYRjl2j0cy}*u2+0!2p*T^b9e$nt3MxnB z9vY{mnp%Hb4kt7&U~%IS`EOb4#?t%4#+-Yz6RT)TC_ig&}>I+j&IWYGt9^D9TX?!?enBKS#WoEXoW!osc z=P?Ikz{b(BZ-!joN6+yI7y@j(&a7!vVaQAj5;JJS9smP{n1_PT%qj4ZU9D%Ex{xXf zm{o(eO5<~M?8CT9z#w%}Bs?fdx$dv6gRn*^4w#WMIJQp{r1O1!RmYA_Jt~4F_8r2R z#sN?N*VB<_h>FLrTJ`1H{Z;}(Z>z-W^?{c>No4er=ncba&7NL5T( zbPv-hCicO+a&GlWDV)yNY+98R06>3?=4i|iZ-eEwy=ZG)xtDSBXf@B xesOcP~Q 
zX6|D!XK6EWD^bNfBKCNA>%P9$3V-aO>|o|8`|v}+$EZ%oXc8SyYMJKAOueiNwgu~m zIWPI>W=R3I3u9E9cEjG6LV-k`9l*h)uB{vB&~dfJc81NoT?GW zrGb0r)|Pv^>7w?Y%t)#pzr)q^RcSW)e(YJ;YKrV4*t&n#7(7M#&y`!Bt@qsAZQtX9 z6)(3x$8Bd>LJL=bYD50M8_`{jCLT{4Nt@nuX;ZK`xoGf*5-q*wg7Xl6^~l=Dz9n3N8Rw%!!G7U+38IJ$)OJe zA?k(V^LIWkY2ZWxl};o=Y2*Otx$X5w=cR+JhQ4gdAl4AS9o-oA zNJ5Yyo|Nr!;&qOXS%sSGlA*eYXNv>brDNT|HQvNc{pz;!q}Z~b*FUtHGUOWTPC>5O zE8%8n6%WjbS`zu`%{zvjMwjYz^&n*>nfj)^wb{RlgBiTP;)ySn#;O9q*#^U_{p&16 z9xJ1@EN_C~?|kI1dMylbjwv$t2>F(wk)P=I<%?k6x_UHP*1xX6)^bhlD z!Vm@5JVn*yPSN0#jq$XP-*^UOc}v$*{b}rS%GZNV!7T8b9BxsES~9|ZlKgq*fa4fM ztPb+>AnS`#f3nH2Q$RTEr&5kY<3J&P?p9kCb7cDfX?eet$Lg5$ivQR7k6A--=^ef< z9+6`_#mPakxS)Sk^WT%;RCD&<&Bpl1-?=bFJ-*l@nTj|;oXvLMB1cjXy!CT<@VC^* zobjLP+3?hHL{)>=Z@$;L{`B3&!G@FkuPU`dK)A9JMdGvwLU<5uP~ySIrm#!e-7hF5 z5uL)#v{f5!?ra+a#2@1Cg(?5^i!`0RT}pH_!bCt76sZ@n&ZZ(vbUWl;YaK@L`nU7n zniY)hRl+wDcAs9cA5Jldq(Fs?rHiwza)rL%gX_i+JmFN>*%>Y5_R@ zD89%bdey!Y!m?ysIUl}yGss%u0*1VAZ-J~f(}?r*eutae3OPW~$}X;hDvNWx7V-Rd z$^Sh<5!*sz4eP&0b`3p`Kp%@(2$U_+r)9gHG*dAL<*}eQjlRu=O!}Wuop&qqE{wCx zIHddGgm@~{#o6hjaPu_!*!8kv@@f#|+H}0;H_}Jx5*SK-1MhXJ@t@O-iOeIia2Gf=FuYW~bkY=4#XnzL8>X{4 zWahC{*jEa&%JjqSRqIU(>N%dJ=crQTo8-Ro!oO(? 
zM9Way!IX)0qTz9M5aXT)>sVo!YDSq^-vHj7H zcX`xvpJ)2>!}h+qw8%tQa=SJ|ATUp(Zq}E` z@qLiyx?1UnqxxxQv7Zl19nr??4}0qd)R1VW5XbWHvdSCCq?Gc^h;(Xx z5eVC2%-;?)BinpPN%70_P_M8u0Q6}BMVJeRafSW}Ac`i*tI_o9us}0J?eJ@CgEV_& zyX-k3o=dS8y;8_CZ$ZKidMk5A-OM{q0F3h=1=KregxaM?dgs=n$S6)48Lhc}xhzKJ z$v9E zl9s{l;Q&JG?Ojhmp3HDFljP*m*5Px0^8A9xfHQS)sp*(^8WD##21)0 zx{SSX3xA(xIv^J_vL>!^Ie;Due&;76n@|j@H|G7xKso3bDA;Zq6?V#eJOm)y$sg~M zo!5)3~FP}0l!U7vuG^!s2zE>m5{I}ynr|4@|+$(8o5N?HWK^7 zh(G~3qM60A)xSY*IO?F@mQ(Zk#U7(^opauX|p({9eeN9^2Ub z0FH>waKp!bfi@EkbLp+`mW5eR7x_~hj zzlUwXIFtd362>N%FwSE~lG|Mc+mCX$01x()&{F+?E~^DX2ZAOw@vWFa&@i0nG2fLV=L4M_(?3P}AZe57%tU)^u4^~FAVvW4`@#9RfBqxi4e{9wnlCsxfyy!OCP z?dwbkd0u#@@&lDRDBYDeDo?=mT{usQ(dE4eYbPe97$i~S1#xD(uaISaN6Rm}_B5U; zb^(G>T6Q!K?a%_8YuPCa%Bi0h7;n2iI0&0+U73u<> zlVUEl-&>!40#?{(d^{cO0$BMxCS7l&t**2jz#FZ89$h6?;ONZafv21+Y0#Vweia>Q zpSU>Uh24u9PDX2dOz&rJU`SB++D)_cM)PKPd{feyTiDe%P`JA2FjLE)z-~UChS(+E zr0}scC2DjA<3lRoA7ce8CSt6ZOz2x3&C4)qZ?IhKL2MYOvbRoS&W5ny{;%)Akp;fH zkn-K1JwF~p1_K@~x4;pUF-Qe+ey2e!AcwB83STpH^jc$4I(1Vki>}h_hSQ(0q<#oX zGJ6cg#VN~i33{_B#7~&|RipfoSgy1ID5VgO)Ec%(qK9VEW;0h5s#g?U!x#h=f}+#9 zwO0?(A^Q2%X5GV^$pE8h8x@Zaf0Kg#_|vrpvNEu`V3|C=O7Yq2U32{v-O5_PzoAP1 z4+$ScW@tXsUSgTi0enN^-}s9JFdu??!-zrnM;-|SO%v&Z&tr-8wn$<@@u8mk2m+%U z$&fO{cM>){S26rlsHxs4bO2?+QWV+xOIstvEwPVSMgK29;~&wR%UV_@eN=9LKt>VR zIIS@|5`}xL$CUBzyhW<7rL6H_&<-^ro#(&$)$|I`?$HwbD!`OX6!C5caF)mb)h>`Y zFj0b2GV(*}YLF2MBJ6Rce8(o)y&Y)dLhzc0g_EB|B?PCDW>TUgn$MW*13atyWK-Et ziG9|s7)YV6c4??+2gQ)Bj317fy;k>DQM8xgcq<6b0b=ItAN+JgJd&$#^Dd^!&BD#WQfWQ``487F^#4Oo%3^Us6OgGKAPa7qfp4 z;dr(d`pQb|XZMLKi9P^&f|c`n;&KSR9CjE-z!OzEVy&{?e{g>qrRUuEEot}<*B$t@ z-~Tt~#$S@=3~Io022Hu}k8;Xl;|tOnaEw_G0UTyekqzQbT1@2 zLPC0ipLg6Gv8^yhjat#*PV1aTQSD68NVxw}Ns$=Fy(F@!lm5NR;XVU}3A*1^DnXlU ztAT=+%Pd)v>yhERxlNq;9KTH=3S$~Xy!J!E9YfV}(d%<)FCPP-&G1$_2+4>NE`SaW z*uuSC$O{JtiFT9 z1WAK7iWX^fGAIFQISWMlw>3vTy>Li2PzNI2_CjEtIN3uRh`Pu#TMC=P64RpHxPHze z23`ZC)W)Z&FxgMUMf(Dk?nJJ~l=#x;%S^H%bu7pF#b_H4l~YA7652`d6RQTurB;p} 
z?R@0=7t6hfN@Wij44cY~FVLAVsLzwO(h^ai?SN(2Av4h~NOCxshZ`bSB}ucMzQynq zdo64CB8SvEvqr}KP3sE`TXWzpzx)?;26;^#lV!`LZ7GYTb8myS9NparTL@xc(4u2B zv&wom?Ux=nN<8kpIDhKHG9`v~`j0lNj^?mYuyR~ab26{UDBD+=xfWNzChe3+Gqq+A zuDPfjNK&tL?$Y+G=ke@+(UPFo!>|p?sRoq0=+PoyI1=sjAb);ilp-K|A&zSU|Bl%^(Ew00; zTYL-@B-d-RLvX{9eHsmT!Ji9OBQuZi9TMKs`SRwk+BxdRaTwRZDS2yJ5m}Lu1Ydb> zN6J5;CG#c&zuGg9^YoF5;>z_(DB__mOP}`d;jxh2n{eIttDs}i#;6cV&H#bCx1tr zfg+8DBJ_Ho8IrIFntNE59u+`A`oxTuyW}tY%zA~tAn3fR&x1FpG@g6G5R;OA$u{iY zY`J21Y)$|5ent2q4_n!DzNQ>bu^gZ0cB;;2%Yi$4;Q8LJ^~tvNFcODQ?y@mdVAkUv z@64qH$I)A3MD7WzRaw(8mvaHq({>I0sPQBh^FArQ5c(N$Ta}5$QJQn|i|*0xL!v-w zBmt6@*kUD2iYb3YjY0A5VD+az(Q*|3ynEm-WQk?`Iqn)5(kPJxq^s|}F;Amsvf)>7 zG^eJlR@o`AT6~<9S7iqa-`}W4Hygr0qkXX@WuI%7i>-CfiBR{9;H;sc#4Ip4f+7tG z?_m|Y@Qx|*jWgf0jBdNl{9I+ZLqL!UJ=-aHXU5KsjpcoztJK=^XD z9ufWR8&SyLSfXfS-89c^<8ST*!es*k1C0qQbqyg0Kx+)-&|(H>FN)N|Q+FmM#0`Rp zxkH)BFl)$Z)Pp91%Y7VM!a&K5sD)$-=$z$ld0I!L;VK?}njjfJWQz7CN9rXae7W_7 zeF+w#EE*j3Xvx3sBWD2}RWxzSc+Qr6ga!=0+3u?vqZJzWAr^BXk{NpnbZk35jo4h8 z6+yoNxRs928iDeglhB;1%4WU^k)sq zGPpoU#SioLp>%*rLY;(x0=*{AkfYJ3Qz%xJ_m>Q{t;s$T z;ANbw{iLLUxaG%X0g?bY9S96fY6mWx2NBwy-lE7;RX>?}GJ)iH`BPh2GsN>Qfxb^1 zGA57jj<86qU92LXTKR?~vlvj;e}E<>?yu=LTa$s}&}L)^3^T`Ag}VLd_V|ET4c5x@0n>;?RnPZlNQl<-j|k~K*yR_HUHLbn^nXSqhX1u`S@z#CPJ@eTJ4 z4j*kIQ9<`s+&(ZxdE;GfzP|87T}{RD5ky$!)%|-=FMWq++{k4}oJm?%hS)LbrfnDi zCj1$$Qrq~L{s-}0I5f4e!hIV6?;U=7Be+~9jJT=2q$g9&iI_!o)3J-g|8%QwfpEg? 
zCn9_Lk1ltLzTn#B(^3WL5C3#{4@Y#=A#IDMV=sGIg>)k~34&A_Si zblh-ZHV|H|Mi*O2L7(*V9apOVBLt zx@m)m4{o0m3Qg?m>h&4b1-&pR+)~0VrDV~T@6w7mm?jY4JKd}h#AgkRcNoz4e{6Cb%>6A;13uzG^TY}0V+;_E?5q{RWhf!M*nUkk;-_WAryF7XQS?eWM}ei> zJ5SSzTYg&HF(*?XM>4n~8jl)JFJp0tA3hUIAw}{WxU6{y*CEK{v~oVPB7^2gU7MjY z(Rm^;j5Q_{$)8MpNYHXsuIq%kp_XofMa7Ug<%9cW4kF1Rt9T@-2ZOK`;|{aMbLYpl z^2)oP6qPWhFqf1nYzBx`&HJ)-Boax2e@%vocKFXQaBRh}sSiOK7h6#KtXV?bjgR+r z__^P8@Jvx+FiZz%&V8>^;6XC8ab$^D2H@~2?P@A~y&1}5 z#*tMR)_OG3)7&uDNTHWSJ=D|qgfoPzGB_8ENKT%ezv6fd-A65%WYVb3jBd%z-~JXU zry*LKVDpO4>D$kNZ=ddweF-X2i_1+QfY1+#p+}~6@rFXJ8 zH`VtQdVRtkt9D*NGmIAwP|?i>ix}VuKecE zd<*-l+bbvY6-`ctSxvGTP%Fja^xlHlJ9nW^R)zs|`{dhJ%82^dl{=@2`b)Et4lCvo z)nwH>&s3@}M}O1Io7HFz=L@Xc_qFOg+eAWm)JdvDHtcXReqj`VOnm+qo#qj4h~vOZ z5=KU@XGWo=0C}ffJN6^jPOz)r&o6Y(**fg6+~D8!UGg4fY%FR=bn@PMNo%BrO1HyK zV*~{_@@=sw%p&B>#NP!`7Xq_7{}A8W+58xFIH!=78?V&HL1Yzbtv|_*-wQG(xt> zFo*<}VFuu-GEA?-b?Lr}A9ZACO2NK7PIAnhG*0+_@1$ZQtLa$x}YsvjreTr ztRP0tzWMVlq-Qo+MXtWdi8h$w!4*5rg$g~)uyeTQ`Ihs>8ZjkJUg(AwlHg;-ko=3H z?i{x<%N0CM=Ek_;b6UJ#J?FsYy+!}o_WE_Y{D{Z?n@BN~qgydgSqE}n2mC7bS4E-y zE`jQ(GUi)LhPR8Os_C7N+;-8241^};{o((o=k`CFZ1_E5y1i&X6Ky@J38j5~n34efBoh~lU@?{%x5>BTUkz-T0#2uU=;T)^Fz zKRG_Cqvf0Wb&=NoV}Kp^w&9{7I2`&7Ruoj#@fJG}yAycA`>oHPlgv&onU2*iA5I3c z$N24O98m+0ZzarqvQEp%+LM@;!mpQIxbmuIU>#F(*P9C@MToi8`ld<7Vc#XL0_*-H|c1$dAC21OM<6Pjix1t#_)lK-x!vi>vY*|pY^&(dSPK-+G2 z$>8t3r=vZZlOIMWdw8p-=>27xZLfVq-vh|U8HA#bhbTEw`<*!u$+l*(HI$^EwF+DP zpF>Kr=#jP@6U&8{y6lSjmY&(`{gBWu+5fF&s*SU(9hX|@*0V2*AMVKYYk$b!MP$l0 z<8^;^q12Nk2_0Y_04Hqo=*so9ckO1Z-#)Q`yeQ^!S-NmwA|?^Ja(>K(T zxC`}}6H?E&Tqw*&lz|ffZk=l5pfm*(wdu8jYGu;OjZPL8QOHe>V5_*#=)I@i=4iPL zvvTY(@u*i9(yX6tB&@%9toG9=rXuMI=oWksbsFY^qbOMFA9;kTKwo$fvQ%Y8PYd6F zV=K*fBSirQ(leN(ll;{j(SkUnhRB}z#F+~It1JvF>1DxjdiGlQ>frvk6CWk-0K{ZJ zYSDilQG4s1s*o&_LEr?E6iiephP&05Up<@<+PlpTwN+&IEoUU)z&Ra4YkO~vQ1ZUM z9Ccfn)fa^Fdv4(f{#PaKLm;&U2T1bp_J;}*8f?~?tcGB+gR*Z>5JF^v<_3A>Lku}3 zh`;}9%Y`;yrRr$O`T+Ni;E;_>ohko#G;gN#m_XFpOoDYOlFg)dVmoM6d=Ix|K*5j0 
zaf<~iF~kawEG`7``$C5aD^-oD%^F_~nj)50fpm>O6=hKk8BNk;byl@WfDU|Ip;=|! zpVv@<%o!=;ed+43<{h1Uq7RcwSZWAFu6SPGc|-As*Nxaje3W(182y8mSx>_lx9p91)Q+|ZcT?4wMk&3|-DGa9TnlcMZ3P-R2Y+O}3zD8tXSIdLMn~8TT5c1{gG9)iO&Xf>x6{0xK z(mQG$eJKp;M>W@9fm3(%{&)TR6^zk`W<^Zp*90ErE54{!VSFJfEgwobZLVVy1!rn0 ze2|0yVVw%!&u|4XqLu=}r+gb?^og0ON%JjJ%OLO`zB>Sd#{TDph@|)z(&|??`Eju_ zUy1pjFu+-!1|&yphXX?K;=BNMUp3N(TSp{c7aJFoDRBw17SI z+L!9u7Jb=UnGCGxR5Hl$FVFBjP;(uM$6fMOFn&nr8g|Kupk=K2P*&U~0Fm@x*RC4; z3Gc-aj-S74Pf1K@I@oG=n#;4l^MmX=GXXP#uV;|B#~CF42Y7E|j7%I#JN&N1wa0!- zg9T2MCM3G1YI|i=@BYLvEJ!6OFIvJ>4L5?4t23Kel}u%f!VZG4SO1T`^u{`)#qLj7 zaz|+ZL%i|UDRbR2ryV9*UVwuADOmkurLL_3ZgQS|+f}BFQFDSVISeDRok1er(juvg zm8%}$qG9_^@mW>jX%az)xm%trG;0$H6=V=SZ+91du4xRZ%;C4|+OHynL915ZoS~B7 ze!)V4F#x5uJCz9%Umha#%D%h&9cOZ}h>3`1pQ$vit;PPHZ@^{}x3IFqiw_Zlwh9SM z`Ln|iBJst7%Kc9o<4-F0X4*yV|8Og?G5)Aq?xfcccEs&o<8Dg~%WZRJBfepO;nc``)t@N15xa&up?iL*c98-GuAIb_?%OHoFw zFACm(EHnLvAw+Idplsg0gL68VBtgN?(gPmGVWL;sBm}Cij*DCvtVuFeoWFn-UP5|@ zr0E+X8ocles)qEl&r3ENzVS`|^R`RNTK3~A65M6cMG2BF>1 zB!0^M41TrJ1#=|Oz5gi?6xJ}HBtI_g(JcbyAb$R`ASWo{F1@XHpY2E+d{Xwgqu9Ur z^b`XJqlwvdGym^xi6Cr3Y)+?Bf7+h876ckqS^--6Otrd2dd@_Z=|M(0K_)Z;)H{$Y zJ`~+7WPYJd90YyH=sH?O`VY}SUq1+&a-*y;Z$>Oz)8aDj_YI9^jRMois@ZrI z=6sXRr*;1L)-pJztmTHcUujw0KHdT0JqL)W2_X@0+7vl8LHA39JA3g(m^%Z-u={M1 zQz?wx5PfT7gMK}>$KWE|ivRnevC8r;bX$8K3_eqxKK}a2f_7#m3Mz{ZGq~f2HY$!L7tMfi%)ae)~ zJ=cNIU>C@?fz(cfHG~AK?b~ZHF#IFmIMi(})t@?e3xPEpe}8WvZgQ}BhRl$W3#0yB zSCN5x@AIFaKnHx!iA0^;Z_75EuihfiISGK&cM|{jniC)r0G~o9%;mNFeKeFzv`L!h zx`ANRzM%7d_M@$rw{-DHK(fIl9_5r9!rr^`NT?9iQkc~(>1{Om80)@TgE@@Y)CBM9g6Ygr@NUaMF$mQ)4bJx4zn8@g`lxvIDv+GJltr-zaaQgc zAdDQXbV!!5#;bHi;EK((skf}Fj*7Iao>i1V6Z2h)?ilud0-R>@s)g*niVX8?!)a=!g(ox@;hL;}1z-JJmA z%0sR&r2P706@_KC)H?GMnS>(zQjCeLqY&S7x9-F29)y}{))}%?Kvv=T!2hszBHJ$R zw0!%kQbAq=Xp-`2vajUemP&&(3x2O#0$EBklBJ`TR;PVh#s1q5aIQ>0b|HOCAR{#Q z^;bf~51&zs7@?QNki;WsP=Dd%gyRl9RZp@+HnZLzTF5f#G@qSFv>#9;)^Z;OU~h^g zBioPazGW2Lj6N|;rt=X`P`HrVsKMwn0PRuN2URA)S`C@;=^HLhS9HI?y4WWFdz2%9 zBP&Z3eG4+KF&haF-B&&~S;{0Q5{jf`f8iEI>$b=$+n(d 
zc7GNl1Rb(C*4$QODnLRs5jb?t^dVz}fFr~kz9o0VK2a?U1*d3RGSWw&(R%q7;l+KC zgZ#<=G0WqvAy6f!^V`fkCh|7dFAl#^VcNh&#y%RYgU#X)l&otGz!@2nB$w|-!b}ci zW&q;g?CsR#{u2DkyUpjT=gG4j5Z3G+gq7IOf&PZ|V8ubx=sHsqfdeHk_pv5-Qw$oK z*9~SCy}XQ8yDv6|-P%^D{&g=(Sz6ilI&M<5j3&6;z*t9Sod|63tW}Y>>9_j4ymnjg z4g!rqCi)Lk!HAPg%Bt5Nu$SaLr~KWcZaHVc1gXpyTwwWwdZugYYL3P&qs?MhC*C_@ zO#B`T{%yP=W&N3#OORic$ubIS3Di1S>1?V#M{z>Xzs^lwje*8)i+keVnqEd7L@LD z7Mr1QCERTC6}d?8N#%~H=H~kXfG1hUKerxZ2d_8LWOn6yXD$B$Tun&E#EE7@%V-dQF1dAAM!Y0 ze#<()ktKu*j2gN$Qkd*Uw`eY6!cJU(&kpD`7}S@YdOCjbkVgjZ;Ii?OjBuP}UdT>5 zqN`9~Vu>@zpM0pqz;yZRPq%5lPZs;zOWG@=^Ocwxxh&q{zBu!(|0q24bwGmQcx>lC zWCoRvzo6rXc|jEo@X6>`1RU9}x3CBz9}S3hb%MluvmS8Wt~90rRAeA!3qMjbk9iUN zd1$Vi_A`!)C7%L$n(&WfkU)L76|RE3>A#!XU#5PJcFLWMG8NlBG66vK$mHjt;vDOt zl%|J;y9q+LCT@BK^%+{Ba7WtLzIdRJN~Y)DP36atF~`Ai-*y0M^Y?GlLuoZ}*7nF} zA46U=XUyfsZarNx3?XdxFQ1K@^MC0RHJG(Jv3z(ahXmYQGrGk*AYjb%KBdywMsjSGQ{(0!ssBH_ zF_zxMS~IxU326JCcX}31G(KEU-YFQ1UcORyy{1{?95#{RZ`tOB*j(ce0_W!X;W95n zLX(?L5GN=IR$dP5Wx%!Af@HZd+9~v^T$5uuIay3XPrTPK*hE{J@Gf?Fd6_fv2?2qu z<=tB+IYC(Oy%ey%>pXW5RB&=~;v7Dbs4}T&8{X)PXloSsChUS8S4|gNUpZY7{e*UkOB{Ra!+O zLCSO#S~Uqx&?QcL;106ieSb@u?!!B-+FyFo>kBM0((4viR~4o0OvSlPQsKujV@2s% zc&nM(^EmlQG!1(RTv+Kb4k0SPjECNmYT^BSs>_vC0q?WFctlA47$Nzjuro$OZFSJ% z`~a!8OZ3KkZ^@dhNclOY4Lafa82tEJyHn%IuH))U{hT^uS= zA6iJ2;vPro`_V~Ga6TR9jYjTpv4&%8XY=s^yX%c`TJelGS<((W9Xdf&T%73X7v40f z(exCy^_~BISuxp%h-JFsT=W+dOj<9<0DCo*CaPlm)vlvZ7_|5vjPR5Q93ke>M%ZrjhmaBi4rHED==Qu zH>`%!tB68Lto{*d5`CckI%=0QsB_@7qo-KaoAMR7O>TGj=2WHNIhNwhffT(8S2pb6 z?Dg8W8?&8?@fowcRdx5%V7RqmR`p$y=3Xig$r7T>6jEGl@)@4Iq2ZjNnj)ge<#Okm zcG03k!dA_BD=8@yNis7w7b&}-me>!GkuehPyIxOnu@9pcOHjDF<9SVHiY8IH^vhW9 z>zHZK{z%g(Z*J(tYPI1I$qCqJvmh$JQO#Cre}MPScp+u6-~N`RE>3!l-?_t(;c}N3 z*&7mdK=%{8>O!FhANAAzI~NQ+lw%V36ku|7&!9?AQ8#9uOk5oFKgV?myVR-!w=V?9|fhv?o?s#M^s$ zr1T4DjX5|7F$b>mbt=g=c^^EgOapb&-H&(K`qOB*DF`S%a@nU^Q-W(Xgqz%o zJz{mWdNiqk?vnAORLQwS1NxFOR7Zl+*Db-t_1?3%M~M`-=022<^M<+61en8o`Kr{=FGX=@?^Ms1F_!@$qPD``@20IceAhGP@i7J 
zchxl#m&2jDi9OSq7WDFCR}I>MvvjZh{#af32jeA?4BePRX<8hwrZ5V{o=nx$Lv*a~ z-HiTw=*HKjR%6SZ87`sF z+61rM6#w#|N-Pr3IuOlOpP|oE{2HGbN>XZ5LKxc0Y*!u5t`xPYWcyTu_C)nAc{Go4 zwSVzcgMtWme34-j9*mQG&>XA#t2RqEkcgKJedXpd2IKE9PWWmK-s_I<^iWV<(5&9{ zZ_(CZe(+iAUb3e0lQeNhK{~GNjJgqVICu*lIm`2-Jukn^m8^i-_}RB>gjyOX9_G>i*2D4`-i94I)4DQoa*DyjDjvs{jh_nX4|+A_VOr`MJ#~YroJ8 zw_AJmzn*>n)f+kw#GfmXZ!|kw^Ppi>yt5RK#0wGc!h0S6ps5U-|Jufos}ePNf z{|!^)%`Wk%Jf@piGFqv+?9y4&7loqAp0_mqC`q&&-1 zn&rHG=(G0W`WDt^i(tyF8O!*(wGPTcE;f{ft(ppq9zDZM^|VHn4>|`s^+BD(VjOx! zJ{b6mYAtszr*f5gNZ(NU=Nk3T$QSaa3M=#l=4)5xw><>s!57>7@wT#RY8~Ikbzf=b z3GV!fZc}OyP(=A&`(l2xIeJ6bemd2lK`E~oMdz;ldcEy_NmF>ia!R2=YS+=M?%hHy zTy^3fe(XtPdR`a5t{kthYn8aiEAyWGm?ZnClbQWNI)%)n&*uB;^G(BIYqO5U_{{*l zxKf8Bzt_d)-=bqucc%HMb7^(B?RHc6&Fi^cPhNz-H`|EKV4gcvAT84r?(DsrBvRRy zT^$JE8vv*Z~?ijz8Kpu!03q(uNDi9z?WX zJ0IEsT7|j6m8y3Fv<0a-Wu8`9BXhYncgMQxCUX7s?H??!ZQK_N#0Rn^w5h(j&>5VwDrs(KmoUKj-vkzV57 zZkqZ5EUVdG%W736R+QD_yHKOuBVqFIl|hQN|IZigUPd9)+~HoHi~~~a4xG=cT+Yd{ zD-^xspp~JY>}+pkW3){aw)cZhM2V`{Dm}-|U1d7!N_Bxc|G2C3`umn26&6nBKRyy3 zb?spG^9et+ejj-db1t8`^R09h-rN1H*In`{?#8X3lP^!U#d2myqwU}REO%|LE}M2e zIWhODWXU1dw#D(*$+LHtsGi;e9{jyxG_&33oaacQO3)3v$&9P-#me%Z@_Vo3DLiwS zSXsRJblqp|$8?r;y}|ePpc|4f0+gNo0Y~Pj=faK0Z7(zNE{WzkD!q;v-9!u2uHG5( zkWZf**?7%&CXuhH4&5XP7y`L}BY$)9<#=n@eYfQu2#onZcYmp5Vn*qAj}QgBAB;^u-8B=MmYSl^+NM%fm3O~dqVDfzg|j`H@bQ~$2;S<{J5dfc=8O`*GFWKAV!L}q z;drfy)@*gG=Eqio^@n>40)AVoJJ|b2BmUCsLfx5W`7?`nOc_-s{kM+q=V(cLxSia5 z`SI?%rz@56YU{aN-mh(f1^3M?=Z1z`&ww4tI*Ub zRgOcaT;X8`#MvgA_kQ{vg&DZ&Q_aaFJVoO6yV|D~{A~40=1`1(uG=Cj{aeic?G~#ki=Z@*b!Q1Bwrh; zT1;@}q*PhhcDrdx;Es;+%xf3jTTLkX-+I-p1JJZ5$4j{LS-xa$3(ED!txW|WrTP;7 zKC6aD36)r6SU|FT(W?iW*t>jX^1S>!0D@Y&zVR2V_b&AoiRCx@{zcJYPc&Pe%0Ldk z-do`3h;!%5dbpQ3(r=V7DMr=5)E*FPBL2NvUs(%nj{YUXRN26g&IYY|q#km{-i#Gm z9r4}~(#GEA*mKe50Qd?A5+@j~w(Cegm;eH>5p78cdSHxTgH9YVo(;eJzGdC_B3zrYxB9s?nC z-`>~0MV3w zwIgko)N$!`qD5uItP1!OQaXb0$p+P>YuXMO+ zcI0lxxtsv+^$jwI$wtTAj;r@SkvlFA#ZwAC;h?Ck|0?<1$OvymJ?sAGw}6570AkUf 
z$4Su;NYylpwf?hBKj$3CFJrc^J{fY338`=udv+L*z44sF>!PxLQvgNJtd^*%Nbz_l zx&}SImooxJym1nr6A_=?yU~)+(qLl!woBMzGpgq8V*|;rLs7QL$r}v*33nvXDQ!mH z+KzZ!EgvgFn|`s6sSy!#pyN|!xrMu+GIdp}U=HWT@j`s#;6~8X?sMFzhr}D#sriE0 z%Ehb3UrH3z6uU^{qlhO@&UG6?6|7B80w9+~knL+A%>!e9<_qAreGJRTjTypH)6tzAjnjJCEV`WlY z!{DfJiTwa9QV!OofXJJs4!Xnlhmr$ir1P#m-Xo{E{($~V5-UzaDqVka5TV;cX}kLQ zB?~8HEd>7m*Mc=21*yIl5zuoPVtebidyKEX*H|MW5mbG+V7Xv&ui6l+++idG==DitDMo+@?jN4SNS^NZZb+)`Bxq|5}bcV8X_%uVADpZ19?NymQ zk#+v>;TVS_k11Dn^Faj8LhUt+-Y)Q#ut$I1!zOu4bXn>4K*4YKY!R`^mLK_+@%d&w z7$ObHk|>2A``#(oT#DLi)nsTCOhTV`UH_1uVM;4+RL0< zftp7?B1znIyS^GgJiPQwpW}j8~01hudNY%ah=9;UEjR; zxFhi%N26gw*piZFbio8g*|Q_cSA*!9{2K4H9B;gx#R{ZerB+GT?2(m&R)e z%ud-&_J;vAY41XBc8u7V|M=9jc7qD@CPwRGF#uKaQxZ1Q5#zch4LPslTw z?B($6RqmiWy`TC`cb1dHbcHx{#lD}%z>>mr*m{R@0q;#v<>fg!g6a171gteuS{7&Q z3>{IYTjjfJY*(#4yq~u9&0khGSQD}fJ$MB}kZzT3{&snID>!~YU8~3;mxEfl4%-}J zT3WE@+O0e`nOmtX34@~`!BVEr*}bdM!wrT;=CCrBl?f1qbYu+%4H6?q43y#O5tj1g zl9He3`a7|I8Eum=m9f0(?zjm@OA_1(XJ-s@*kD#&QFxjixkYh|5d|7+rK-bq?0>H6 zM!BB!c79m6WpQ;g|BAkx_X8aI%K3HSZ^>K58hMXuU3cHx#z=ai=2G(z7wcwW?M9!A z!@=y@Kg-bX$a@Vhi9CH< zywEFX={u!jfMTGW$Q?V=`TuBp>#(Zcc3pJREz;c$lMtj^lx{($bc#qL-65dTEg~&~ zNOyG+6@7v$H*1oQD_St8hwf~!X=^S&6@xD*o_x;>&Dg!Mq?F-DhcH_%b zfxSG79RUd(mM6jz_5)CILDz~Js{T~bD5%}r$r`exx$6cR7e#w^Oc|>H>v3)Dj!>N2 zpiC3z?W?nUjo{A-I30Me)k1Psd)9j2F8Vjw6jCRNG+aoG<((zM8m{u0EeF2+_!-AM zeB<#x?``SwvWFEs3q`BkmFnGv9op;m&0ii!-uf4Vy5zz)YIQCr%qGwX0n$H}w_d3L zkjjz}{n0sz_Q$<153j=Vg6r9JSS{7=?Of@I+>Nr%@s_7YU>5>GXGA7b7>P#VwEYlg zmF#?zE!NntiD=!%kq}<IUSjmD&bauiDga<_gsSu%w4P!pljSXE|nrTttGg z>FhTV+ocw&s?*>U3Zp=C*rLrsQ)*9Av6MwmLU!p2%k?g!`=l4tsRO5&1gJp9o0&&EWW>6lJvW5DU(+9bW}*Pba5NS0C(G>cl&HM?xBx7QR7 z&;SoT-bOBQaF$}01~$_~p`H?r&u}uiz`JD^D`ysI7$L9eq`F~)kj!kAPQGRWz+Pmh zIoy^{s;m>a4W1!g-O9>EQvPsS;}|9}Xfpkx-#!z&^~w_xhjxCTv@>zrwvw)YD!hS{ z7JfqqqzI9yH+K{!ss|lmNuwo>1ly-eUC3ePe&{gh%?3>nt6o*XDHvLYH^Gb6dp8^E z3f;x=K*ONdbNmDa?)u^AqzyCnw!vTJjhb|s7b5xpvL;ZYCF5u;H;2=E@u`00l{x=e zIq=K(6$<^c0Mi%8FCQNe)tOx&Oo=|F-~V}`j^eEmz>&oLa<2Ma$qPA3k+%KV+o4Jq 
zZ&w}RccZNH6t%dw2laNNd5$lQpzJADonM^cS;41s_dY7xlPbr5ma=WqHp0-Jr2icR z;O(8)UVFshs+B)~)0*N~%s$2Vy^O71H6cfHt4!#1KcnO^_Qa)8xmz?c2Ep*e{dFX} zcG0ptX8P40V|`p`rT3qRozZMdZh+aAtL5CQFuIB(t!DFZMqYhT=tsJh#po@&W) z@HL;MdU1OFNnpp@(^k3kl8*iBCC3y&d=}zh>|3)8n7=6kM5?SAxuN*{z7coB{&C~Q z9onhY2KQtf-T~V#iwQQjCi%cTK}lnW;DYag&Pp1XQ_#4eX(G@$29C;Tm9{hEYnpr_ z#daK~YV9BsCzg3DOP5iZ+}xA|noDvU$4!BJoaQep%7Frpba@rM3!?!a&NLoA+~m_v z)yj@r{w0tPsg$jLb|LS z6>rS|0BmdiT36X5I~d$9rWHm#Xq_t56V$D7z{4REh{Ysk-;J}d;28Syex|}v+-K;L zz%$9f)r@yG{deTvOpUL=o5!ycs@X_))Ls)zPB+3p(?ypJtie zKVhaf?DeX?xGt=Q)DKGB$L7B(K4Ol=v%9!?-%U%-?@ox zWc-Bh?I;8bvWtwI*KesL2`>`vwB94c>Q%peru}Oy`UT`JUJ$nD`gY>$XCiKCf?VZ` z+=8Kf0|g?>>Xhw=(bW?$+ycVw9~+kn3E5%P2F;D8azp9QC@Ujmmvb#Qx== zk!NTZ=REA&q*9I1!hBh<;s$q}dWYhjnGdc8LTpzgHB&|>Mjn^ zcicubAI zA6Bv5d68zcjau^J%71i+f3>B{-bWM+g7%~^1cZ)0jtttVh7fX@Rdw)8?UTv_%@;T& zQQ%FlT`^-Ll2QWASH61m5@E(R<}ilmfJ)Z3@t=C5gTOKG#Y0*91rObZ1ykms`p7kH zr`Cz4EcLveo&$A;(ENn_ z9d?Wi0=TXL=wyAWZwMK$-%PQ|3mDpgpJBbuslzINo*2uM2#DgUGv0^oORUzuL`0j6 z7EMPC?;RKQwS34cezGV=d|)!m$6=}XX$-x;OgPf zn33t3YjnfmLGUOLJiME!DwOx#7@+!0%t_u6MYFI+?|yaqbJt_Fs25{)@+~De5&8!} z7l+lv{c>Iv&eWv3`ZcOcm#xHV|2sOg7GuVyTATZXpfKKFZm8$KF8`slVjq6?L^X<> zKi@?wM1mo5vcXL=o>c?ez|w~|EmUY%pcfx)p2lgt@23}|UfB0^27Pe*cwDvff8>Zt z{8w`U8x14?7?Jyb1luDhCiW;j=p#W!8wO6X-~Ai~>Apn$HmAB$*o^@bT)Zx}dqrRD zH$_}*Z$mx^dg!^>+G7~MvMhYMP({cSLfPjo}%^}i7GKiL3jE{ zU^W65i%c%9nsR<#3lu--^0MZCmA>#~*4I5o-FHXuZ`lv|7`XyWLr3e~?CX55>jKT) zu=@f zO}A;OCrYFqy_>q0_48N)r9k>Y z-(>y>-6tqnL7pZXXN|yjY^wIb#4c$z-v@1de=w$kI83H3&(d^`++}J zzBrM^|3P~h(Y+s&!Y+UQF&vLt!5jg8dv%r#hMsB)xI2Ttl=JWj9()o4f(I5a3Nz{E z`c@ke=P2`6h6Z}|dbeU;se$H$+gtXf9-{JwPv7I6ZH#Iy#+SbwY;KoaeKw4CgThZ!ZrW0KCPCr~M0 z#bta;xrE6oAL#%;3z5k8M`4uoy7~Yts2EUDVDc%n(umdXks-S{jxBp9z!wSSv)}#g zMTPqjJ__6}X*;D>UHUj>Xl~blh%rU`Pjit@^ltsyPueNLUam6#RXUjvbs+!O=VHu0 z*{-4o=%nG&^j9695E!FbAOX^azduCJ!*vVx{J`FZTSIpE|8qn8e-p?-5Ln={>`Stp zEs3o29ZD8)y^0p``CDpl$*Q~!kXpQUxhC*ZVgf&Bn6t}A(q^_T34nDYJ&OKO6GWw$ zhf4S}h)VzJg7^>v_MV%tf@pEB6+Q{VAHyqqqk0g{E)@Izmw?#=RJsW|>IOv2*naE0 
zC(j?DfKcQr%pre1cMNhflFNTe8gt>3^0k5H6j{%g!5a3dJ|y~A)T%PDlie7z?Eut2 zA>)i@ViZ{U4B_cCpqkh~vElW?jGZI+^I!iNAhf4gxZU50Us-PxewDO;%9QzyAFj3(1n)sa$tD?&A|M+vrrJZuG1&N|4xmq7w zQBUzVOawgi@qbEfnAm@{)c${5p&&)+`GS{SzDvkR!424yPX3cb4njsRB?VF?P}k3R zk^uyeDD2+|;1AlVNFHBbm~>e>NeO(HP2N>aDmyg+WI_>+f2YKU+=%N7vWY&!4j+Jf ze`Tg?Bb5yY64HSoPPZ|0qX1I_K(FwC~Vi_znL z*sZ2V&nZESW$Wz%PXVuU5Erc66GVYzaTNGI#@mW=uqV& zKy%UGe=ki?;J$_}%;n$>i6ca~z5+xepq{1F(#3LWj+P&ej4;68fyZ^7{yif2jL(GG z5kV1}B^R}HDgwAw;O3HHsg|yZJ+WO8g{Kg?$uRs ziKE2Y&!T5hNK41_YO^cGvzC_6e7*CzD57#cZpQ$v6V_T^LdxF^^}!4>fB= zE(+b7OFKXO``8fw-~N}hm%w|;dFeL_rZR#2C;ZT639pOYA7}TIxkoGZX&bc@>sMHf6<}S=p_b~RKpFn zXzah`?n8+0hRXO8*k6us`Shoixc0xBF1e#&*RF#mK((jYEXs5mydRDP7DuZn^Ksdah9bo)V&QJk6 zq<#%NPQhMP>9QjyuzB?P>}#J%msQ7qD0K!wwJ>SO!$q$Ya6uqQ$FG@T#_pl7Y=naS z=7EsXmj<#G^3&*D-{s8+@5hZ05w3`n9Ov26f$*;Xxa`nWp z$y}#pS60OE#(fuml3#Fiz{w9_YjvHk{!EpvW&Fj!e>}@87e<^qauO>k$#5rCYTA`m zm;8xvI@1L;rb$Ch=FYj~JXj(ey;@lfiBU-Mql@5!ff!(S}{67Cx&`pE? zkdsPlf`~Y2P+=^=ZyBwBWfUFmCDR4uIqFYc8B%Gby`#3?uXdYM`8TNodft-fn{-)F zlK^}DnQ!NKC2Jh46h>Iv!&-kfB*kh{|&E%4ZWGG<Fi}Dwq_!JvZeCZ$-M{&se76x+w zy0|_oHZ2>orU(=;6P3e!4AuVx&71$GplKp~j5IHLKD?$x7#F}j%VXwefb47xiF4?s z{ZLHMg`_c{xbA(h-YG{W83P2t#x+f%H6(cH z8M(t#9$Veg$R_tCG25w%Pr&1LN~;&NpU?2|xMB^J=ZCj&1 zM|hl77v-s^8@*UkMc_q+d~Y%-Yn@6f{|fwWYJqeE>ME93K8Bu&h*S4lkwG)NUD`x} zu7a@BI=&C46IthHG9{=(vBPvJH_xolMz4)Z?P*#y$)$wJ4_<`-Jtkr8zBcV5 z;>-}=eygm?g1Fo;;PO;>f~}^&75STnfP-x+Bb&Y+NrHFQKGNnwoktvKD7v`*5NIC7 z=4?JnA{v&k9$$zLoBPlfOSS(y?zl1(muhumjsBqeM!yGRt2dr3j1d3u4nAaF`Xbb| zFemc<@TD5UX)Ns4=p$HvGT$Axf&$Hx%!lK&5Rx96qcSZ|Q^p6_#9jyT-&4Y9!=k-+ zCpEsdC;lv1we9@F_dXGd++FXm>`ey60aELiC+q5^rc8N;w;_nP15XIko;rRzMRT~v&e&pY zQ8{}QlC6?bNhl;@yKRn|{sg7zwEl!ZMBbi_6?CqSSHEB;5zL4MEwl->%#tS1DCigc zW^ZvZTv8E?!Z+(jBjWDAbgiM-1`%Yo#`=OO^_h2|t%d4vhFCt0#}|8Vpk%4 zDzZOa8wB)eOpq$xiA>>gOWuY#&hZw=Z%5NEy3tSfJ`)i=iNBP z;1{*Qw{9-qo8lQ|%i6w%LX6dOA4kXAb^g{um-I97wJQ>`Fr z3;ca>V%SE1stg(L8}nZaH`s`|=(Ii;jTdY(A>kFaJ8!-YIPl0t>T5MiGbqOJ=Ez_a 
zNb}BGs{x*?&}ikQ0qWSt18`O_ziO@^MtrWg5c+*j$m9|tRT>8lr>%$fEsI|Zc7ruKLk&Gdejz$OM~Vhruk zE;Px6ot1ZswyTZs7YF~NM zfa*!Ea^5qAb0BDwJLD**H$G1M^To;CNZ937EVgoK`qLBbX*iGjc^?_#Z;Gcj%5aZj zV?CgLbMG`7`M%$voy(>7xGNjwWW6V?zjC`l#m#=3&Ou}1~=)nua3@Nd;h7}+Uq=Tw` z+gf7Ytd`s_G(of=&O|$v?jy*&T@-fj=X58_AvciMY=W8f=5L)4J{-575q9 zhZoRxu6DdHHs``v5pqKkTQhF37I@@N|LlI*uEm>>dFPLs>&|!tI2B7)CNwGw5e2K3 zB4@$?RG&Ma+t-jm+xeQkSVRNVp%2QG|6GSYCyo{a{;1oNrP;m9#`>Qwefp5ayZVHtY(%n3EVTZC1%o=i)FW^|c1LhneK!1h7LTe6)L1{k;qA`1=Pux$Fi9Z4MS8|Ei}66wDr?+TL3C2<+NJ>47{1%heBSjYSR z6ak(OsR|yGW4y-4$$YseyubH&Gp0)yV#&nmg5!F zLTB9LCwhIFY_40_98xi?N{iN`a+|+nkKdT`T&vUL{YIZD)>x`mts*~r8xO(ZkOn$d zEXx!5ff&B(mc4lFOx)u@08NHGVeM~ms7`=Ia_QB5V1Jblz4+2MK3EbM63nBo$gpds zGa?iE$h0ITOxl>jF~tUpk;b?ib7wIeyMsc&PSMQ+ul0kY8EL_f3})avvUA|5k5g#;8uOxv}U*x{rH3DXn1l9gZccF+V zmEXNcKrRf9#l~XHPVY1gaNgqkgW@VF{!N-)-@V#yB(lh;o_UdQI=hbq3i&%Py*I!2 znD>)w;tm4TZON%#d#B2@7W}-n9F~l%5q^nkbN0g8f2eR4R ziNP|XY$q}x_?q^rzdcN|Dx&S|*C<_piy*5>o#R4v*n-uydMMync>FX)$ zrHeExMQ(>zSV|rCPnTG}RTtn3`WiL74e;06y~lEh6n94WP*`Vi`G_>a7mus6e63sB zA)&$i=vhFYgJ4<^=ihHA;G6A!nml}n1vAU`zIzj9vP**# zjV(BiUZPC%#0EXd=V=wD9m|?I7UPP3{NTQUD9Wyx=6q#&;`Mdz6PTJjL+HK2kg)A1QeC}M1oSq;_C3)i?m@B)0kM3kI2@Zy z?29#*`6BLXQ&EbywURhpyfKR}hW!ZF3(DBUKzOif`% zyK5qDas2%50^jB8(9DoE+CaLDb?dL+1F%nUuJdn5U-Z{b4K+9Iq%LGIx}HvKS!yOa zf@Bo1Y4tf-6CTe1ijbzRvGNfc6YPufTs8NYEwU%fE&BMKN&BeEWgaq=q;6(VY}Pwq zTR5`Zp#(Zmy*xnu{38mDtDfsWu$T0`=VV$BZo5UpIpR2Y3Q!6ifH3@Lpi&O-8sNBy zOx0A8T4>}03W4WE|ESo(wvaY?a&I#Q0S-esJZzQmyxA;m_}(5tl2q6_>xL2e`D$~C zVgA<%HBbkJ;xjtmGY!1T!96g$!;IDFyf~OkQkZeapw!cTwvyC<#!BpxVQ3sH2oQ-M zI~_sSxafW#vHY4yAGpDHfExtw+YNPdw*Rn3eTg2MFiM@UD5ZoNa2Qy(lY4prUQ|=O zg!{knnAIo;X+2V4Bc|NOyDVY1jwXB7rCRRoMLQ5WlTX(lF0`p7?Vx%NT|X+ehDVC1 zX4VY(r|Jj~|GX|ig(qIO-G1W--W)}-2q-ChZ)cnR>{+9+btaRz^ci=2A`$wo&$pN| z&gagl$n3tCV#GR1?glh`<1DZ)#oArj9VUN_2NAL?njO9EI;bu5YZ))3>GACo@}u6V zjh7Yx%3`v9Q}vW)De0l^G6%>WWYH>eS9r6dmtPZmsV@XZ5odmtx%fRaJeMZSq~L{IqC8gkr*8^-??;Ku2*f< zOFWfq17bq$Oto!H2g-}7h`-sPxSOk=2oU#Tk~A1XFe>bOX@!lNZcI}@-e!5-{w9VS 
z-K**Qtw{XIl^-&tBTkyMCa2>zLh`>Msc(~LS<{Uo!7b1&)hFg`(G}w1Dg3sv=?8*( zw~62Idsq9rM|zfeL91aT1255Fj3>`?D_`*Tm<F*b%^a{qH;V>sn>o7?!K_R z-h_QlUpD7#I<*@S3p`@!E#w$j1d@Cb24S@u9W}drb(7 zMn2u0DOKBken)?qn#$FTf^6q7r5A~Z9VgKq)Bx5oxbZqxcMX@by|<1K##JhBR%`({{8AOjK0~vSEVQGeLu1qic34h%Q(Z#zvGVA`G;(gX|=7r zM-9oJ6cn<^*TOcJb27wr)cwjs$=&zo2IBx%JY~H2J`|F-bVpfKI>Ps%%g2~5ys>a; zn!>7!y0i518&PjkPgDO9nI>Y^??V1_bJq@}Za=AG|An=t{;$=hf@J}8AXpALtoQ`? zj*v691y2Jp+vM+RY9j8}u_Q?C88T=We6wk+Ug(1FhPBSEt1M%}|1PeRh2?M^QBRlb zccx93JWdy0;bsf#RQdXH>X@iK3uSz|ZR*+8HBFt;-MxABjN@ca^^^KXYUy+g=_i4` z*1xnOe!uvPX*1uskXxR^2kkDEsI9^A{-yy|LA(DHYB78H8vPFh5W~VjJEcul3 zRHMSXqG&9`9BYQTRQYQn+iuUvss;j>Bqdga(N`PT<%!v-?f?fP<8gPA4aO?pb|B++0#fdAX5_d-__e#}IbesfS9dQSDv*c1rnn-h}0E?Zv@h1jyB zjz1zWyxl9S99`?;LUe0m;2w7@(k=UZZZjh@wotTB$e3+tb7#rde`{T|3)lK#O2)2N zTOeh>BNXAACTS*7s8<`gEm+KbRO@tGD%170We|x~E{*c^(k;E-NueeU6mQ}lfX_LX z;yvF^eQG6^D(s@T$T~~awoGEu+E8oih@JE&d20K3JAuPF{vt2<%!aQDT_uj0v;XFr zd&-T37!Hp^L(da)-pqheP2{}}TXNzpHBOUsdz89->#uYq(Q-zHSZ}KG`RNBl_%1nE zH-Mms5&sr<;)(+_3>imea-UFKvYGqM?G3Ca#sLrVy|&yEBfdX@1NTK3b;P66YD~yn z{+PAA2!i)sz$&l_`csh1*FN|3IHEBrh|XSY$_yBZBkTBq@m(S*kBAFeRBzOZegGZK zUXQ}#j)nPy%y;P;fmXB1wEy?3A{+F4h=`6rUI$FU~RU-J$zJprDo3v!ca=8pY-Iv z*l{zD34H1J7unw&Cc9$jGL=u7PQz1R-<2V0_CpyTMd}#H1KJw}^$#DJ;=F%P6?E+q zuD3XF3ywuCOA>;92jzem3EU*&UOkCE z*jS6}A_Dc{FWj$Ie=1k37}b0c_d6GhAQ$=^i^@n8jr|3K!(vwkF_V**)fO+6=ZnqJZ8(?S5)V(#-p@`eW>Or`CRM@?PR?#T0Rw9*uwMq4^1S4b#U_ z4W{;+NO&0m^7^SY-riJcsG>IOi`tX5AB^x=^wm%=8yN5LR#~aY@W+Zed=r(KNH>mbR0p>nGZu)rvBSzSfj&jkh63f`tp2s3WZSyZ8P9j$w*reM#z0yQj*UTmO6!KtycfYW z1c?}_3gO4^M!s1S`{^1(V;Zi{J!h;)^=~>4{J*37kCoH~zp%HO_o@}DmyYah8BRDH zoE~%D-P8ccN9^cOWrL;dwDT772WC?#kHkV5Fkn7lm+?{~nRjh-6;4oL)+NBMo)KLW zhx%o<-cnGezY)9TK#B5bQbf?twDME})!jD0;ole8zvzEUW@4 z9DoU*fe7nGhhn_(#LW?kjoI=t;83&F!bzW?gp+VGeoK)i_IfA=Pb7%I{(xI%Kc zp(zL-I8FMu6Dpz&WEM<(hEacHXLW?}eQ%NnrxGSK1}`yta^&8xep;Colt8jh6hXCi zgth9ON{d}z>=C$)u5oBR4zueOr@RfjEs1f3(zCwGV&pUldn+b7W<9>{nWo6q5w;8b zt~{c)b%zDHmdm8bG9R%he*025fZLN5kdZ-awcv6v2*yXpjD3L`dJno|^|bq& 
zJladi*i8iOmUut}sK9ec@u&?5`u(Yrf@_@dNvbrGocmpl)cVQ{ElihH z5i^@_{;06Os`~=*lA{LOeb< z>2NEu3X6<_+A>V3kQpZ_!w>BW_)1q!>-}V9QY{+fk2gZbD9i&8iHW61YVbQvkw$T# zgpk7!J!dFX%0MZa6`@eSLbIY}8MVeF)DFEy!az2}nMm>F=TSDwxoNZJ2YrYl(1$R9 zp&-ng9dCZGiLKF|X}dHq*!>(56%ED@k5!jsG1 z;KR^qubj*#PHb11dFrxFOy=~P-lKH^i&bvkIP49jsMi&=K1)TR zf5DdP=y$5Xa(^02Jxr{v&UK;4$MgwSGz{Ti{Y7lzjs_|!=EHi`vjV$MlH&gIM6@M9~X?x$K01;OHqYkgZjMl z7Lu-{1648dzDMc*Ciaq6tbj8H)!z{XZnHF46yFV9ric4xicCkH${T z`Kj#6p+K~Eic5pj876m;2V>pz70oO|u4-IJ;TwZ;!z!ZCUtJA2j4Q#X!yQUCTURZ1 z#HNn*v5|8+U6vEmOH>VbjBOS|1$lcYr+csatoOpeXo7M6(e#s)rgKWiVTq8S2j>x+wPwYfW>v8pWOY4_3 zXTDZ>tW?q3NfJQ|zZ8=`@yf+-CM+x%s`_k&E5Ysfo?wpwPcpxM`?>q_yo(8UX5fajMCb`7`g}0C6m@KW95YPq!$ zZKUHpNk`=wP{>#fj0z)jtWV=uV|53=I6p?|Md3wC%)*AK>ABgc{6u=$YuHwD#RaR# zi2uY_yu=lhl+D5svX5nb_&Ux~1CGhmadsLzrSx`V901C1EP7%Y!u)UfSO(Sm zulO;%tlDk`hS8!8f66lDYd0tQ+sj!XMK?I_YqD#X5^5J1#$3ooDD)CTvl~UvHRpb~ zbg=vE(wMQmuj(7U8##}s*T#fz_@^P6(;u%y`w556y%#$R5)51g#x?}!Emb6@&!eRG7IxbAJKK!Fh{L(!(ivDS#gz_g zae?MKTEp+P5GEd&+8!h96&}&cx1mW zc~TM&C6`%b4F$mTwoLx;4KbQJQg=g43Fi4y%NPVjcKqcg!1I67Y2%%L3&FD zI-B5=7o;w^ihJx%Myt1r^FgD{a(<4PoV3ZDzb2e0BXkAvN%uB=myRApt6KciuRSQ_ zAJzG$?w5`gmOpOs;+6Z*QCJ;duDmSnZoEJ~m8JKYkd2|;bcRCuc#K15b16HNn9b%c z%qM!K-Lc8%Lw^_!{KDYzD@P^ zpU_c9qv{i$=`V`FO*gD%(^nPi2se2kTE~l+P2rpwKz%qq-_t2Zt#S^E0u3@ zzKeytuIY%RW~gfqXv{=7;xmU!xsZ>=)EMXBa4zb44pay))03A_Cw2@xgGGzkcZ&7s zu+og#zcMzr@v7CmRTn=CV7jsC{^G_Ht$*mSh(9nrnp|CSUDYoG{`H zVQf3D^*y=L%74a&Fru{1m5K99;Bmn~sSzJ^!ekD6E=x9y-OI`>0AaBbh0z}L+59$R z>iW8(z16vZ8{19G0>~%WrTc3lH$}$+(UiWzF5GYIk-=XudV_I|UH$*%d_++KU)2YI z?xgj#wSOXztD1D$Qv|%gERs$Ok^v$~RXG?`s%x5e{Q-2L9C#DMpjqcnEf@r2AcZ0J zK0tu1w`QRLj03{(Gw=CDim9)*7lBI~n8g62958cKc!~U7{ZOndEG~u?KMqX49dPX6u zH*Sj;?Y(!*??_;f>4{RRD9ac{Be{MHqv=(GeXX$CTnY+Bb{lGx= zdTp=zuS%b9&=|{K2+{flt>d&a@kAU9A8U<(vCJ2HF(#M7XMygV4C$EfxuRk=tJ19pmjV{!|@7!C~# zOR02k!oLs-8j78-;>lI@ww9W8MQS>u!d{oXxwc2977FyzU)k8-9^;6olaU27(wH`w z+ZbF7D~wH$d|_p?E*vSYH8DlnC8S ziiugk=%UUwGoXAi_3g|!XBm;Y>wM1-fYz%PpCilrp6{|5m<&p))!3pdrwGh8TxZ}s 
z02oXPU*i`o;0gYFF}nYnxXPewldW@7$b0{wHWdsE{7$6+WDX+&ETbK$O=7@{kn5Y| zU|RNaSV)onHb!)~!0mEJSUl8*Lis`Tc~%AkQ?%Sccnj;`M!`hzn2x`Bs_YO|#jUkW$mJZi>v+`lKLgmW15QuPE|h z#SZ!ETON4X7_sb0bbedHTSPcoFH9L8XIzu$OyB{dbYRRGa+TGqb9$5upf$MPK&$&X zLUT~28W#89=V~FN31L6?!63A*w_Kx2YW}PBZu^Hm#Tv6UN8kNg6Z4nwxZd9%f9yhI z`Ep^Z9((|00*Bsqtdla5y?GXEKX6Ik}(ik$vC<>8C5uYixN%nXnPm)0c z2P^B%fa^^Ky9N>CAcvSPEQ+BVcc;PKGv~{Fa9ij2 z&UbFlqKr_?xr}P`XU)PaoaJ|yu3I^y`@DoVdQy~*@wt7)rmL;fVcX2g@#^|+Z4nm+ zt_<^~@8x>p`3X`b9KXswqGZdc?Zq`fN0Ne(wiP~wu!|VBY{qq|_E5uXkJnFE%@aIv)sWS~Q!m#@ikp+U* z5n3_=Nf6bRKiw_H>*eAFt<^M0dX46_T6H0FJq;W~Kn?0nVlpm6nigMg-r!CnIKPeP z;^r-d8C9HoQTFNCqx#LU*>*2Y0FC}cB^udvdh}FYmf|*=-*dp@5rw)Wk8wpqo(VW? z33tC7EtG_J%IuaN#YbH-18*CD_5z^W`u7=wRMfi?D?515s}?>PY~{qRh@B@s>b_Ld zm5EP*kZ=%vs#>JLZvGDNm!;E>ljQCpl6L3mPrY0d3=XIfc zT(UP`9;TWm#6@6uXi6yh%H}5nfMc^Mh2J|Fd{Vr>-d5U*tG}P!aY3fO`QRCjvDOT% zEsF0CMA=_`YjhJvC*|0iElnLS*5il^>esEnzPq2l9<~<>8z-RkxNmBFYV}(P49`CXc9ezX zdUa=_QfKYu>)cvQUokvcHT?D<<83&!C31<>5e_s)LP5u~+ zPcv-tWb?Z0{74ogv3r%d`)g+x)xjF)F~%Q^$qNn{bH#g8@b zWQC>eVd*~xzoa$o+){A@bn6uqM&|`Vl9khPKx35QlllNUSb_|wzJyVua{~KBKJ=QT zShqp(sVaGNyk?F9(|AXTEUN!y#tzytIeE$D?9cb{;%csjnTgRw`gP9)ix#4HIL7jx zV-TzB&DfD=gZFrO~!Qp8bqMNyJqS;Khh5i1;Tr`N1e7_)s}Lt{1W zA+e<&Az`Gdl;YBtSR1*|74V1Ju5mjM$*le^_TnPfv=IpaYG)yG6KK%HI~JY-XMAgx zr59)L6&6A!b~VNOnH9*P>9C0PxQ!V6R#|G}&ga^l)*;Z?L;JgYX*2?w)Xvt++|9!! 
znX)n*DMAjqCm;Jeg9yY1Nk6W){Merl0F8qa5G_c~(NPR^>ZAy@i%sSV9) zB9h8E9U%|abMs40YdqgLlRjQl&2VcGK|adi4k%UV2Yr*v*4R|ON-2;-wgK(4x!8(i z`U$d#{U;J*9vHWFzuP_AxrMul%#4L>|!`~xoZ{eeYA`+ zUFJbB*Wk$D>{P>^xlg$6iTH~#+#vFVaU?V78YNaIUq3-JN0oCT!`DE_FThW<hW&_N!(=9X$v)EFfZOEU3myM!%ae;Lf`!OB0Y{3|XN7-(*m&n_?UBPn+P;2ykI zVm{NCN`b*-h@IJKA;mOJ3x-#I(>BaTlT(;NuO?%8?VGd0$EImmC6|RDkGM-gdfi=P@G#7qm{V$@xiyNevz1 z!swp3w$YDHmX?+XWX`(bsSyu;6*ho=hr~MK%+ok<&x#gw5Ul%yfS0SBCe`)#b~UWH zDMkQ6gjOwWS0<8L{F%@74LiXw1nzDa<9icUT3hHQrUqywD75D4w48e_6~UwJHd<1r zhtrvD`tA)75=nHYH#T@6G6IkMIN2vHMJKL42&Z}6q%=}WYHNF^TQ3ovV@`10GZE(b zp&o5d^e#^rx4v;+2%F3yC6 z7a}9~@(TeDGG@UI6bqQXzf;+$ctu1SdX*G!8e6!*V~6yT!2~dy; zzcNYYlS!V+X8_STjyZDWvazkV`hXoLXJbq zWJ(ew)dE(Vf2hWI10vENzXSno+Bh6bEJ5(QUtivm^Ln4v&m;(Z{A~EuyhB8wISM=v zO#lhi@b7z+x6K&Q*lutKD{PJl{58do!H2+QwL^r|AHO+O(hE4-!R0G=CWzJF@nxGl z+MlaUayaZ&gx z_v?3rS3UmSlQe=^fhN%nc6_aKybxZ0=(%WQ2lva8^A~qZ!&l3-VHGC*s6!`HBbKIT zVegH{-oh7><1EeZ7f&n)9)<-`m>Cv;C2Xuuc8p&;&dywhwY1iEEj)Jptdia`$T;=1 zpZu(Jv-h3mFu@lIbz)qICK9IEy{Xi4R?cV43RyA}wROOT{F$3^+iFQe`cT&Bar?vh zOE*Oi{-LP!GrRayG1!Tt|IOc;9M@vUZXs9A+LPeeA#bKs+wFSuEd{58JU(<*TP|A2GlGoW#oX%FLnmjcd!6Rdj>>0W~?lDXI9H!o=zgqH*$K zRI68&&QIM`wC#?L!s&Bcz0H<@+lTopcEd78e%o<+WJa^Yr+)L4$?XC4nL;)u-e03~ z76-wIMm*7e32}=Qeq2L|mTxPPIV`O;P*`pLMyuFYrsoS=oR!Z*LG2`A(`{VmvXeN9 z*4siLlEA5xrkKRVLEJu5WoOZwZR-r?$9lf8@u`uzWnh!>DHi{ zK7@Ib%DEE9mmXxg<(>d=ei9hwg`0aZ!A02M>in1IEzkZhny$jF$?xrNq|(jkl#m`Zkdy(V8$r6Iq&plUJzxx^ zqy!9FQaVIRk&;%ryFvPWzQ5~w|AO;eXJ_X=_viiur+LV08&;VZ5PUI;A?)Gbg03q9 zz<#!r^j1P<%K9C3<3RMa)+%*S_~F4f#vQTl9>|py|E!`FWh!@wXM2yI#9zOT&?EiX zKYwyq>g1~fm!1XtKSRlMYypmk9u()0aW+B=6YA> z^K!cJ8Y6~kN6G~IN>#s|$s!-~-oxXFo=`wsmVWik5-+h31u%&(y>ivdB1n$&1+O z0_@lc5Gv*ngKxjY`71<>zsu^%k4|-q3-?mE8dntp?z~>PTKws`2#)P1$4N*h+*{k@ z)d8h_S~__iM7YKFXN~{H+CXM?!dkd5b6=6RO&9xok97*lzvX#iwp&|>aULmZruv1L zxKQYuKf3($8u@IIGK8NyBxk610F_M7oXGZHd6^{q0+o=50ZAsDu>mpOYd=q_r;jTE zUJU?(&>)E$Bjor1l|Whot;%!!!M9CVm=u~*y-m{5%A#ED&S7Di=Zmto+pBTVyI#0_ zr}%Ng!k1~N( 
z5OBu7uAEPkf34i=#Ljw8E10tZjZ@wVGJDAHuY4!$6e&SERUq(mX30W;AEDB;7+UwF z|5bYVy)i17kNK?a0fn$8r!((jxVn`9$2>5vDz%S-V!Hu5QY)2uIQt z$hZivr@ z^y9sZUilu!m%s#S5hnFaOztug-Pg47CqPpwMi-!aR zn7Ra4kC!dvowh-Rgh%Z_^vQ6kIHGs*VdLGmjz0&}r8|u=e5`bP zO!L6e|5h7wjxGJ(qLYzizW$I|zam&`i^EecT!|s7VRt?8-QQ>IEZ~|m@P0xK!Eyum z<@M2i^bt2o^YCx?8uW@*<^)n?D??Aixx$3!qdX!lyMTWXZW9T7{25Q|!o^59Yk^$h zo@x_-Wh06K;q&qq{)*+Qtid|JKI!|P0AP9F#-(sYJg&&7k;7SiYVbmbW$8BMBsw^; zT5{VGI*E7o@}!p-V3N~(%~_Km2rjxSUXp;7YrJ@-2&_J?Hm(J^S`6wur|cT;ewi)j zUg}0_@I{wRa3uuOd-}IyF^n`}Q@A~-e+65EDH8DWwK+|P8mR|4^_Va*V~uWf62q=G z7bRdH(%y?BqL0Xa(4TxC4eVmxmLH>(;6WIp85^|UgsNgk%6s4rusxh0Wg}?B#yDx^ zJwh^GpWCt%5cBXi9#!Gp>jP7(khM549T(Y1bqj^e-fZyM59+$KiYMxC1v!kFXe56j z2=UZk6xo1rFCxFh%MO+udv6SJyz~(kRyApNd)V^MRKWPhBlWkKuJ3>_aEmO`sFf^j zbpZ2lrB4_wrzP&%raTkKPQD)}0_#FDM=~}(iKwTxeEkbYC(^!=ib?(@%49OF%7oYO zs<5^PEcX0jSpMd#ilwbEI9uFI!o?OBBPm>--5+TG-G8MN5KO;b(%~TrjbHeH-9)_u>h9$;ImRI(xDVE@+#%Wc8?q2`F+Lj}6d-+_Q+qBKQq^|nS|O{)u1NbiBEkzW{weg{*!x-{ zk|&Fqinnp7g>0)?gzfnW_<9YS$FmXkJ?hm5;h#Wz@!3TN9z-b7&@m%yN=Hmlg7w-J zdyZj8P2Uc+7QDL|)s)`p)y@r&_2D>btCR`*EF-Fv;i|C}Po}MWT%q7JD)* zKMpyBpC73E4sV4!u}Z1FK9yvjjd;Rz5VFXkjevN2zUd?{c^aiYrb;@j<}QE-EhL#7 z%jjWMdDXJHtf5RQWriGlhrtSX)6!zcJ<*3WkWqzUHMG^Ysht0|nixVSDC zdaWU823=+3E}(y)fYWy}EOVP75(p?X+-dRSiKVS9y-ibJ(Ji)~91y%B=G>@*-V#_` zojtQ6|5C5tNi|~gR(ed>X&hGbQHi#@zoi4xVoG@Xc!ha{03dxNf1aAM3?G>D3Q+Y! z;eQaWT>0&mpi?*7UTO(>?Xj29)->EX{8BLL(`X=m? 
zoX7u}N7<#^Qka=S(WoNaYrb$;SoRQk4rP&CvxMuD#hG{aL0UCBzr0}q{NyW7F4P>0 zpMT0-r~et05vTWc!7jNum|ZQf%kTfa01{4Cg9<~xdcF^7_3*gVa<}-@%5%|#@C_`i znJ!|yjNOQS(xz>}q$GpZbkVj~mmX20GIbcX9|Z0Jc2;=b*0 zh)}oxIm;hiY`Q0OUqK&ES;mNJonTv|_Z)Plk6FClv@|=5(iL%UW~#5tfVhX{hBbZ_ zC68`Al%2;OI1SU!6IB84y8MLs7n*DM8b3Haun*xB)wWw;rO)g*b?;@JWS5Z-y}#lU z+L6Y^e-PzU_=@~?xV2Lj+JP>-$BHA8Rhc*FjoY)&cDs$Jl{se_tMkZ68k!zD`3511 zl~wI^TiuLU{kb;tS+^QvIgFyqf5FnH^37?qhZKR=AWACIYsdW_^ztF3(`L(0?S60r zX~&4kv_f+nzTJ&x(F|E={C=XzAiwqbY*7)QG#gD&=SyFBq->4Ld4I{`iyTY5wEIJ~fIybR*!vVcoHpyB-6~(pDz8<&F_MjvO7)LSIa$?zY^J*7Ku%bvCa2z4kEEqt z8-r_@9da)h?$_a*q5*=`LBClZ@UJiqYAEdW0|#;CZ)eb`2vu`J`>?1{^|HL={jTNy!Qd~wDXNcOaP4sVH@HmrFwR+~i3C&*nq|b!>$Ff3J#Ao{ z#oEBN{{b1C#hUwpJ&>VxI2ds(muUt@DLtc+UUbPi zcisR68JB2lA)7JF48PYFMjo~K;keHebIa!T->5PI+% zU`{bBF@Bl3WM^bMH9&6LjdPTHCig#mB zoOm;sH42Rj&38tlj(@TjtNbc5no%BoglWvYQGO4X3j~$v^A=z`(62Q);~*T_`j?Y> zun9w2*JoUbimuRDfke1b^CD>qGyn6;rR~+y0It0$*WFz%=^8u z6iM&f=Jo>mpO$}1GV280A1827S^6DZj&#tdL+*s_^Fq}HM9}s6Jz;%aO{d@3F_8}^ zu?KWyMy+VRpM}YGcdGZ&_RbkZbcc;!^2x7sDr4U(FqD)5YSNmW zQQAGgtb8$2PBJKvA<^^{)s?^)Vz$)in-=(-Hp45b=agU|XY{<`e3{Isdv5YvWkz1Y zX;?{UO|Za?#dP;;hLaac<`pDY*qXVF@R9<`(*9{wTC$7*-t<&kFYz2;4EqfO-gj!pNFJ06~URB?lJPf`gRsh({jqp`>98gV|?U zkS}un?k`=+dU1)iVbzAz!mr666b*zJe2Vw7iwkMIup{9Az4b0`b!p=X_JEZBH(1cx z8Z96Hw&#ZIL*+gXAdUH@3U|@{O1XsL*hky%P1p0<%^gQ4!*YAl0bA|ZoJFFiJJ+B0 zgEVm-4db8iAHLbN&Ed-|`}yqi;yy-hJ7Tr6u!zDYIpX1i2%FReI8ATrW~}}C8CQk{ z=Z3s?rw^t0=8-u$HP))NN4Jn%K5z4Pjr}|P<`O|9x#!Jvb$0!VXVCeVx0(=n?Zn8v zSwGLdgb2T90?deccD(ORJ{zTXQK2L6^#wLPXLRvz@nz|BM?PrKN&>cAlAedcO!!3y z#4877P!?f)b8g|I@2T4QE5e{RcgcBbaOgQNhBCB!jyVHlg4A$OIejebHc^tj1(> z{$~#6VWtfMfs{4aMz#Ol)y@n>uB3$xR^?X}a%IdD4Y{P*QY0P_I1)##A+(OGk~d4Adg+WTc<-BW+lvn(I?LF0e5jWPK{g6*(-$ zVe|&x;;`szB?feSIk@~}RHyW(w35UnMQ(V!O08v(pTXWroop5!yCT#d*O%Pa^OX3! 
zpf3V1Ec3{&1{P3h_0@8Z1ZGkA`x%ugD)oEiNea@ckk55OTC4Y$f%vib)yrF8n_YXY z^2~OX-}a=7c+5*~4+h;l+@C`R;W{Nnq!p~WU_Gxvb_syMmkiY6raZoKoybqZj*12qf=#W1wpR9?+0uy1s z9?WPlpV|Q8#1hXCx>l?q3NKF5fz4w1g5RRL+%%%?DE$?N!X_rBUi8W-K&RFsRvGaN zy!JK01Jn*g!RG7Qg@!XubVxO-1D>W=j4k~VU2?xs1IUfB4;R!AWGy*r>57nV%d?7r zpY3Mp)Y(=wSfe&|1b$&gIlnW<|#b|83%u)Js!^8 z2fG)lBDP?p(oNF+Sj@frV|J;QjA#`3o4TOETK-VEYqw~6 zVO__gzZzfd={a9Xxu%D#Z`rWtM^d+=pC%*r;@^_|_KhgE`xdJh&p@@oW;+;%H!v^z z9#yF6eNe2gT3y9Q{)bCfrMJ+3?>O|&unh?4>2pe1sQpSZ?D~=xd!UPRH)1u0CA>e^Z_>0~0oqG? z>bsut-#bDvms?}|u4awoR{(nO>8k=*wXj5ut`kvszz=&SN*4`#FgagUtJqUwLpbx{ zEsp7pJnt6}S`$84;r{2v8v!KnplwbrAB@=Z=rG{mwF=xfwb-=c$(w*3TN4Eb|04@X zg^P$t%mw?n<_aZKd7J`1dpF|w7Ui9j$uB)suX7OVNx94;RM~Y>o}}Cua-p`;=*oFg z+0S}~eyiCBO=Xev)WUyLjZeFnEyS}4(6J)vhSL`dQ+mW+kQ6n(KAHQ|!}t41E&Jq3 zPjM@JPSz8l@{(hSkCzKWe#z?h_6HI<1^@Pnl25(6^MfqeWG@tH5?!mB#$^9_dL$+x zoS3p)^3Rh*(K#FsPp35mWuC2foS5%OxXT`WzxnC0^{0h)W^OL%qRnOZd7H}zhC{eK zE;Q_#SQlG%$TNjC%Z)FLIjDawXt-_L^E3JBG2`j}AD!XI)F+_m%gkY69{EQ}58NF* zj+5oD6xLVSUEC5ho%bwQuRQqCp%c^@+akpt8{#l;WBa8&*aPpB@3WKgX@7rlji==9<%ATx0XVpK(`B#GyjBN%(gxtwW;&I#B=bBkjd3^ra8Fj&4ib4X0* zjMSquy*KAUBdPvW$Fra{e~e^Ug({wQ(KUO(K)4VIrPIDdOev*o&PGHSv)No=Di)2e zHtSqpq3c3;-yB75YfDJuGo644phJa2Q=U2HhBbs}=V-Q5bDRhqyQPxkh|StlE?>4* z+%j!@;vn!wz6`OIZvXWe4&aUj%jK;+g`zcQNA#c@HOwz&u2iBPlv8`!TCgu-DI`Zy zDlj;y*U)<%v~sf(ffP$uyJ|>7{6tPeSm}z+MRkY`oEpvGEQ`!nKmyWT9=~2rhE!nd zxYD0x2oVCWl%gVvjhRRH(|(r75~ceVKBwI{%jqiU*|Y5?^G;#s=ULq)`Ep%>#EA9m z5xdNUk_#>Hh97(w3|+k8__0iaZZ${VX$U3ha^sy}b3NTvquYoR?mi@iU|ClSU>&p5 zQ$T*2guhm($`X~I@G}k*Ii{4QD+BPBX`=oHxQiR32#CM)>7vEuF?F4e%=Mt>+}BK5 ziv+dcKc-fQZJUTFYCdC3&|c^OT13By5{MsPk(#=F3;+C8-w9Q*?_4d!=r=C1kLR^s z@I`PnoP@SdX8+Q`%LLys-|}naQ%B#z6QRaHRpau_Yf&_#K+5U?P_#Q)tf?NL1QRMA zFvx>yyw^B5{|wnqRC(oa{{;WRDrkYKETUegN|80bJ|!mQq$G}yB+D1?5X1*Knv%~8 zC#oi;kzVEu!;*A*{m|AA&y4rCx`=s`Xz*FTR=V;ehBLFmFB;T|{C3f0^SnVFLGPF=6 z#Sd0wAUf<{cCVji!!vwj?G0Ajy~ymePZgj(LMJj{R-9OX4N~0ryg(!g7uK0_i|>Th z@U6R=5IxvNEog`++VU!vGIe_mC+K?&U>s%~i=TZDyBsW1%e=7w|BEZ_+*(iebvn+S 
z3AUJ6pXk3VCyYK6J_ILRJBnAV&JFL(XMFJy;cAS#o>yHnsh4m3u#*Tn*CT(VrEIu= zM0E`}AKQLiy;DgJ{wcD7VTyBOE++P1HEnBLFd@sjK2v};Thu!KvW;V1ciQ||UUXaD zVQ0+B;#ipE%O*UPzVAmx<`9ODSXV5p#aPKryngzl!^AQod}nY++Hp8<4;YWH8<#&% zsb`(3+WmIZ0?cg(FI=XW&9P=kFnz6Zy|Ool-IYNn>%l(!g(7bX$3Hwb8Pr#mSsT=8 zQ{3tIb#ZOol%p(A>p~}q4j!|lZ!hQ6nZnJhhY4{HvxqoTUtbs;NvPO-|LlD}LoD`| zsswuEjmKl#$?nHpf>6`(&jAV3bn~2VY2KLI0gMC!ns`0}Ff*EmdqK#Q-Et*LoQW|$ zj~4gmQ*k)Y8BdkI~y$3JY*ABYlT z;;4d8eoYx2Ig(0MGOY|5kBSaSx3nW`YbOw@f6y2Ka>Tl%3@ITkKl)D8v`Ary3~m12 zR{7|Tv5YnAHf{XGN{oZbLxF&~Fi@Xj^j(ry+DVbqnjajxL2Mv`W-PClLCe4kofstC zK1WB}i+`~O0wDsQuh_gC2N!hxzAJz_oG@|P$$TTz7NcGfPS882v(M>8CzEIO>}$Du z80omR_dV(GI8gmdY>R$5{*N)>B(Y*#m+T?tUrRm+WGV9{QegZ0O>!n4D|tgGxG4xls|{orVgm zj2y_}y#C7Hi7E54Br7F+QtyLT|KjN5TJN6k9TE_*#qFvEl%H*Du2lig@{(%ijt1V| zDg&{1%60rAY~^NOwSg?RM|qoqNc#f1l-AIV2>Y=a;y~}%(=*z$tazLOxO7b$mi7bC zhnD^R2)B#51!W!lP|zyYTjm3K-NXjSuf~r(2{ffe_@k)m27xXF1tF(o+c9+NIr|7b zLI0v8K*%KN{AQ%}E>7?;TP4Upm7fVxB|RS&BX?mg8D$3|PJRICHeGJs@faBn=(!JK zy(EoWHs`m)KBSf%|7~WFqSi33c+Y(tSN&gL%*)+rFE2%CSkC5i)VyTu3Yp4S|LdIC zS$o+412eS`I~N zW7RkH-=9@VtPyJe*SdX%+{R0!FH_=0vl=fkM})_*VZ9lv+>h+Z;_t%WpPH|+rh}8%x;G(kz zWr_tIWlLC1*JaqNv&CDqBiOW^;48tbSwR%Brs|7aDa%MM_OX&kO+>lFfFSfa2<+$C zfwW)utKqWd=MK*JTp9?V#JwrLBb|4T1N7gi(kgTcXC(+VZnBHC<`*IJg6}zm&&Ca0 zT~9A-$If?4wv;(3!&JBg^sfu=NA6$>=DtS`+1@!mo*muamo`p+?=(^>e+9SJWIVPG z4={~cMt0N7Ssnph@4sbHFLj2Go*zp#sYh|G74zAlB%jy{mDK{&39MHMWRzmCHv?I- z{8rz2-1LR<>f*^JRp)Jxf=Esa2D3!X6dw$OZ`vue7U036??C@G9kJIhTu_+9~bt3y+4SbLiVTBmjO6UC+9_%^fGQH*lzG<&yaAI|BD6o{c9( z|H^paef2oRGF5T;M_FY`zcYBic50=bHU$+)+Mt=IGzn7hsnJ><|3#0Tn=3#(@DKU7 zH|erUI+5zJiS9-z;MTVTL1_tD9sxE#U~F?B{$=HM*D4?Ac<;-wE?Fu4MlPzidgHm0 z{f12PZBkdW7OfulDAXW+DR@7&;6CYtAck5s13uCdbAFo4R^%?zjI^im-sLs2EKED3 zcZ|4+gW&2~s^zLRv$byX#wkD(M8`O>XyOV5tSryl$k4I+T42svRVZ^vU>_}7p#?}_ z?U(yeD<1YUtI!brSRmMgUKZAQ4<*JlInk^Ip4h-TW0A*iD56B7d*W%;&<{AVjZH@i z#A+H<@OC@&+e3qJ&zKP^a-h-IxCEu_qOa~`=2}ouAj^0B$G>@jww2$uomR+}s;>q= zgYpX>t0|F~P*R{GBV~N)=*Oo$j77%4d{J?;%(V9JhV5c!W=tsJ{tbIHg?id`bzg1) 
zIfDPiG%dy_C0dm_Fz$z~SGu+cI>$U150mKhrVD}T2*GtsXQwa2;dXkv)*JF4X53R9c^7}Q1&%Q6 ztm=Pik1M`ePa9l`QuJ&SL;`Oy4w3mKoQ$$aOdvv8V@5o^w#aA(|V1lTq;_-l zM3WYyDP662oJwz`$tk2_3OxiQOdUf~n+ZsAMz@Gzdn+uJ<=sHD77}THEKCe>O6;mt zxZj1}el|Ad(Q8yc*XdtsdBLgA%2{d=B=h;IrMwz3W`Io$U%tB-1ScX!!B*-xGd{w<>8C z*H88hv8w^#(?}CX0#p-{aqZ{+T_O~u*(p{aJlW<|1yq(DtaF8)rfo|vOPk7#%U(LH z!+!d!Io?!UVa-WRj=|r|i16!QlHy~@n?AE^n|9!V3d8SkJdW(@-lW@nc zL4ha`gc?iXGv8kCeQK1mCg=lGWJ>-J^pF$pnD;frbwKE{hNNQphYqya8hPPp#^D4Zj#jz#dQ@L_c zfSOiLZ%5ubL>SZdn}54D=~3|JFX6l!Bt>v(fOsREU9SXiV}efo8AAi3y&gC}X1&a` zO9mFXzvxKD?Ilk!*NmX1JCQ-(Ok`L!KeJ@!b$@OuiVwnkYP+a>st6SY%H6KT?$nP- zW~9CTC02cZl4`Jp8^5_yIe>)!W-(1Ju(EhmkP~K2_im)ZG?_o=zT#spmnZVY7GR0I zfwbRhadW^!+!+=y(_xMN8KCH#9==`d^zDjLIHbV3wKNW)njsuntL{1^s;(#1`vnZ> zTq(X}Y&--CQKx-f!lznp?1AZhl?R#*f8Dv78HcIus7@9DFC1Tf&IE-BEZkG6zAa#C z4#TRpWIaB&01vO{m?M9$GpAS@dAwXyq*vLD8rx>GZwA z%+E|sWRKVx)OlnkoZT3xrr_q{3sE0ZPr1!H8aFoO6Za8;=en zG)e$)7wr|%SNx4Hce`q5`l_AZ$H|?kC8E?VY6Py}J<)OQz1}+A0{MzKAVCQ7K;aVb zA?yziYi!C<|Mh_D&&J-@KEhs+zm4$6HrI%BHk1M@8BhpDd7`9dV5AqQ>A+q*CpVUOe1WV+jiYj_UhX3XFgj6lrI zH%VpP?pdM{5vP-NvV8&q5Mb{t{YV}r;@pTC@AlEg9w@0d3Xim>A^-`ebj>sC)Z(UL zveB}xRcT`U%sh$s7`?}cBd7%t#e$G!p-Lp>qg!6k>`CFV0(9Pm$BZA%%hRT#maThk z`&}cIXU9D2qV=~F+6=LYjqi*R^SWWZLw(n4zrzB{oubRy=}BT z(1$$`h*Ogs_X)O)E9KkIde-PRd;0t=UPo09s6dZd^cqYT1EblA40Q-&P1-}VRKm$b zG}Ul0rf!2GtXhkj1Wuv0OkX}ABheTi;;Xs02@y%l<58iC zL(p_9TY(28)#^CNjom$5J|qN>Vc8Xb+w@(K4T$#&dWGvmiQBm>h9lGSC6WyfEmLrB zx35wXdE1F_4+G6S;;h;IkizkI9^)5@D|v0+W5%kgzIg8MZUMd|CF^HKQxWMy6u30Q zb%+a&*fE6tjNfxq;m@b`WyNu`@iD`&9e(j`;XV)sUtUV#AW~VT6DKf9j~Snr==Wbr z!jMAcJZclD=6bH=-4kPjTpL!~o64{s53|o9q`~qRG(q`Uq3!^4DjRZSNiU_(UT68+ zH_mTZMx)61g%D94L}yC_G86axWUL`zYx0DJH7k}>NZ`yzhcx3B{iGx>nvDXRO{I_# z&42Bm5UIo~@|t2o^GE?2{nax0R1b{;m#l>4PAtN$O7%ZvtNc_}O|s5J0oSz|!RIAO z0!)bi@9{JUnnMw)vUZ1ILu!g3?JE=Ah~QpWItce2a5iJf-t&7&bk6VjQ!msZ#Mo?M z+?cowrn^|sm<%VY{Uq`w#f$h$tO(?vOz0--Op#B$W5Hq&mfUe@4tub)vq6xYXLFMo zBOl0dyel*0mbhXZLW01Xes4A}&}Wktc9J+HPAwbEpCVH(gm8+#qj%p0p4;G85ErnB 
zj3gwVZbMDuey57Tzqri2(Yl>}YqWxz^Jmk{6S_~l`aJSC)}%qkfE%do;!xT^>iulU zKmM7Oh7_!XN6WC;yYmhXj7a*5jOHZCB)nO*X0W1j`|GM)+)%Hs<$Iu&nqv@1CE~1A zSvL*0SY%J)w3mL$m6Q6Q-rKJuB7gfwdOiKX4v1dnd&v zsY%Za!Ql`33G9JgyC92%^lcezM5RdyrgA~}%I?NVhMnT4*-z!9|L7~c&89+2Zxjf0 zPpR0uOc_KQs&%-@2SAW@C*V=TeY*82pY7U2-DSrq4-+!`(faKJ38I_M4+{?N;XY>I z0cgN)53uttuhvpH9gCN)OYL&uISKR16x9*sjy%M&(8@#?404#%)OokR0_FC5>$ua$pv6 zOQru+_uVn25PexO?MaH1^LY_PK!2uoF5^oCGD+mHlkW>xIJS+b0J)#C6&}$2q>Z;g zNkwkN`1Zh!?PwEFh;Y4}{mZ-OYPZ^ANBGdm6lTZ<3Dm~30`;d@mn)loV;Wo8$*CPg z064fefC44lxk@S=WSo-iCpkbm^BiX9A7vLivBk0_o5fQj?AiHduxYv3dwH;!O`N_zyEn$u){AGp@TP-(O?K(i}u-DEgaDXw%(f>3XoDRa=^ru0%)F2xB5|rcQCC{7P)tqCDN0Od?c1E z+W8;7@i^@|fX-2(Lqo{5a0ttO@Uvyn$CtmKwFW-S4GH4fj$0uiY2N%^uI7B*A8Y?Q z2wCk$+@15y{3(^tg{k>u@kZXO_mz+AehC5RY;<*+RxMfU&yqr4j;QM-t@OD_IFz}l zNL7VXUTD#b?T{KX`{O~7PqU+(ZNX**SEZ=7gVoM6h;Bp3 z)jP@&*V_F@W(UbWioMX~%tr}=Z!-ocKuAlge2=-4}-(Nn{^Bm1rOH82i{S-4^nY z0!iY!I#R7ex-Su*qP=iF(5_u@iag(EB9}_|b}{~?s|I0GT=HD^J1j|Bk4yJUsbR%q zf&p=YDuYz}A-(@vADEi_8z;Uz-(pr}FDG!49@W7h)V6rhQa3mz?tMg*Ogu+PVq-w- zx41*Rky>lH6;$o`^wu1A2iO=rORqHG#ht@?JYykS0Y2TmH*6kn&xXeK1V}#uwD1lo zg6!xKOETFRyYw*{pynha0wWN0$clWF~#RYp&m+Vzf5zm5PzVN8gw%%x9Cd>}NSO z4_yGTe_0bG-ZWWw9vd$?Q76YDznb|-bnC)4_ zkYc(*dQGw_Q zHu0sww#PeQIP7BQrU}0K6n^Xs)y5*7$;Ltx4TA8nbfSsqKQH9^dJBk1+*j)PK^(U8 zygsTN3R`K>c8}269+o_-QXC4?-~^32w>_difQx{>KmW(wJgoUY!f^KB2S{yRSa73C zE|_D4JToRP6Fc3Fa$1v$nGE-VX0Z&fEr)@b-RF?c0ag=zp3h->r_$TjU7-jUBUHT% zw*AAVjmkfM2^2zCQ`iGHdq<1i1emOihzWbDuIaN$29FnbApIL4?K*;fcrr-P}ZArV_-fh6d=HR3V@&N!S=Adw? 
z9~+O2PZsy&JlcU3X44sf1~&2nBBVpxPlf{L%Kina86+q&e)_P9A>upMp@)|bT4>aV9?~TfgmG+2>pF6a71!c#K6l!LLalT^@&2R+8Z7Kt_Bu>50QDPRCI`S1GO>g?AH{EH8I8__!I;MbodGEE) z!z@CVge1j#TU6IkLIa*$O)_+tXKPXxeKRwdwf*-g)d3%c#oedQs^~<9vOFa?^a?CN z?~A8!JkFac7CinZA$@!jDSF=6qlXO($=u|E|HZEP6#Ft^4zCZ__Z2ixnFiRo3c}uj zz@PN5&s2!j2z99Vtp9m##SBVPuFX&9r5#1qgBw0%;9%9Bzwmb_tR@fRb$+9oVDMcH zqr4SJchUZ=U{k6bU7ftC@1_AO!}?II3O-ZcHGP_^K<9%joZkPJcSb}a30wTa75jEV zkPe|j)4#}MN-9DFdiF4BWYfASol;xIlue{Lo3?s4!hV*?;tzft7K0OZVf+=IvA}BJHDn&)tPb@>y8V_+sR_b9_c(2SQlP2}IBsCtx7zfDrzqucZ& ze0=6!saXbmP_G$*yS&rTGpwSjk#-Wqx@GKio|Ij5QphO-`+C-=ksS_z>eK4@i~iSo zwZeMLa|*xV%!Hwn_ow7J9-GO#2EUkRVXkR{ZB;6qsiP52;7`%c2OZoU_X<$!@yYW( z#n{UXzZaBo%XfA^S)}ntB^#gRNCk0?(msyQ1G&AruDFLOK%xAPQ&x+M`nn!|K^f%H z_EQA`d~l$Ip2xgul|bK+dc9drj9AqbGibLcj#wRs=P$6*MPTyEe9YM{T zedFLk;=04~3L*&>g?S%e32V`=iGM#~UZdrh-zemN|K`2#myzz;IX-?DzJb?d%Cu1# zr|xhM8?%l1j>{`oBk!HOp!B@te&l~m-?53_wqCmFc#SZKlkBWlUZ1V{P=RfIp?5(- zU3%h(AQRl%lRz7w8XvRs8h4>oQnp4RV%4V#v;scg?T(^_@#^V89w}%Q1?f(knCFCg z3y2S)qs4MD0FRdTv5JL$CXftqWZUB>M0fRY=pj!(ycJtow^B`#5ao2wF`FGla*O=T z+vBQn*HPW~F4$cX}n0~SDe9TE82g;7>?PF29rv}ZzsVs%wHV1@`bO1=C#-M_? 
zH=Xj`HwEuvG3a&ni^?rqnA9p-si(2^omuPV6FTf4{kG>&i+-q(%{0sug$;QM9vD_X z)fPPhGI}W2-bCOi=~OTG{GAnd6~~liqDJA)Hn1+p+;X(BO9227^rsBFwn8(yD;bKOux5qy%tSw6B>Ful>(KM$_A7Bm zsTZ*lcHU|r+T$v$ws`toxZEGLf*ob&`DR6Em_GVdo+u->h+#_lsYBd&*FP0$y~5kp z*V;(zIWsKfkt#v*NXlH|%tRZsY*t(*$kIju_XRjUu=3M@PeKI6?IQjt_B^fK)F-S~ zun=Rd1#0~41Brg!{*NAo{SZX&9dKTbRcWL@7C77`X4(NoLiW^% ziBWThJ9tb9boYH4_jsskt4GZ;D{HMm3q0dWlDM65SCylus0}#Y253%xyx^(?QX!&A zY4IaF;rvqf)B!1a%V6hbztr^fUaL+O+OhE*P(FXU5>5$r`^2FX`LaW|KG-jD)~r3W z3I0<&BWOh!cK~wUaqC}+6?;nx)BXJjJ4&r1UX<_R=d^+`8%U7%qt|?KQv6%cfUBV+ ze5LvoyAQE*Yl;(glYHlsk`Av}*_ch^puGP`f|_+St^^>?5*@!V&J<4&LnUF1P=k>z z9~UX-$sgx~HtsKnPb~tpvcv|o(qm!ga7Prs$!Bzu9e@ITP2zw%%X0Vn%mNY)1BdYYf=Au;=T=`oFr{TzLdxq> zZ7?ITSc9U=M?Yx>t5&7Tb<@Yzb?v4O-V8OzeTIRKscknuE!5&|e6@r_oqvL-`x8*s zpiguDeC0qIU5rS+tR`KC6bXLhf6@n^3Ll(KP3op9jO$V}qC_3O)&#K5T+P+u9&hv{ zfe<*HE15)|N>h!y$9&?=*;4Ux$nct&@va-ZP5io|sWvuOrLjL)B0O;W4uNaCXKCRi z5PA;jzuV$v;`KXhF=@{>cve;6PGMkKnXf62K(AMkx zDGEd#If9Yvm+UUoAw)w?r=zm*{dYxBZPA5{RB=#P?1~hAk|OL2895}019 zViNwWR|2eb!s_%c>*S295CceR79}zpKth#rtzbgh#WKG)!uYkOSmx;f(zjar@si;;xvtF@ps-1|&G)dabiQZWT^xU!t5qfTBEw?bu=|f( z2Kq0$Nj1bnD5M33#r1|@ByKnEk%K}MdV+RFz5qlz^E#rMxrMo=H!H;*m1*UY8=Sqr zH<+VS>g*Q0UHmBQ!5^Lzm_PUAB}|-*Zs>#%?Vu62k|b;CXWHPH$>*-T?dow5G@BPb zgSdL!ncJ>?3wtdBFnUppUj85#W~Q2zk1!NTMF~XYEOO`r5X|TYjs36G<8tag85*0{k27RFW~s6z^HpX_k-_ z8J5$qp(qlSSGIC5`JrruGR=_xfYCW_|JF~~$D|+FO{-&9?J0h|I{@^1i8K`u0UgyB47!JbzW1Ud_ zVEVya5;wgQE$bl+B5yOYY~`9MBM>qi)2ta$l4yy;5D9SikjJxL&yP&vuGaQHl@06( zXTL64jA&FQdXR=y(|>H=VE4X>b3SM^teySx0Z3ZnqLCFb}s+5 zg@8T7pvTj%aTlA857%t(#Y3s<0gv zi;#!<$CWXANrx6;edJ_zw|+-O8c}7v$$ZJkv^u4Eo5;B*N0<{uZ(v;SI#J4)yE_%p&+(I46^Ex90&&(-e3w~v+C)=*9 zd9@nY$j~{aP!c>lA&YaS@HA9R=T%Uxd;gBbnN?cz5&XIQiYv&uL16plOWZJ~!~4k9 zy~VWMM{AaLK;c1yr@f-+#PRrkTS4pKY<=Iejf?|K_`At*-TO`VrMHxmnDq*3W8}*E zS7_b69cps{EX$OoldxYON1So7)b9UUCTVEazVo=T99QyHu&i3Y^7D-hx@!vkp^wjC z({w?`8;!5S?cZr~;>9#0a4tTXAn&0`?gkt5dbJBXNXD^JX1sZD@8IU#7xvH3cKhP{ z+>m~he!orkw_;r!V%HXa9?xk0XzyDPugev`#YGb}@3)#%NqIM5FQD(cvD=KSeV6B; 
zd5nIDW&4*0GDJJ&Yooh3F#}sbRU|)dAhKg=Nb;iw_nFcj;KjcDU*e2FCzmBx$x8(Y*Lm*10t$mdNe zzcZa)PT^%M@zYh5fG%Na>RshNFTAZJHV@ohV)DalA_B9%MXmvbA&gi1{Z zNv|f^Jz$=^`%J0p#QmT_IkCLAb>ilDspJadq< zse`>HllJmnw#rS4N*3OOt$D9hvfmLDu07Ol6Pb+nIm$9=%ktR~iAY@I=2xd?gf~ey zq7FTK34-6rYd=j_?s)drVB^bE9|^AZ-P?mGa$79;b(Oo@@>k0gxmU5_;g%>-6gUw0zeZ$)Js7X)NY_HDCAg8yc#C{!plG?viLYXT7^QFJTLxr zaJWHN{T?;95($T%Z2Ol_nG7BW-TCTlN0StMCx6oS=~ka;BKv7jT^^z4W-Hi)zvo^u zfSli+KKll(r;8%Sf;Al1IKu)vk_3oDyhHdio+)vO+S&Hd9X>Qs@MSi?p3D7-(XX=t z*SX)Ng7^%EN^|4@Qg{uY<6e=g3NRMVwEn1cMlh@T-FIM>A=EIc=Xpx<5dv?Wh@a*i zXLHkMAEhg^-nqJ-qYnSRYCL3QmG+}4(lUcKNN0IqGl6wGcd&KjYJ{M<*`WiySRZ#E zne#KUgS6EoGYW+NFPAjqo$?OP#Y%q@5JK{P-|k6Een}xa^Dw5+7Zk~CCO=$>i zSdfu;|8R%w?gvUn$C1auj|Cnlk<;C|8-PV@@R;g>@D26#G1sdLIs%)zVXcc+)?t6_ z&zZpz3rZj~!KO^AyrwPuqvSM^;Qa_4S{s}NN99p0t^N7xVwM%y1lkYQR2FDL9`#_S zJ#a6*B~8854R;H~J!4g_IVCKUkS2Fg9(KG$ORxE6izDCs@Ilo54+hkJ`xVqNY+&g^rxj~Gs%Wg)NP*KpfK~8@Fz8k zG|7ENu~SJgX9N5esMQEFvn-z1@z#tKLwV<|-@NP-*x()MKKX$^PXGJI2Lv(AZnhHQ z4&*z1;${9z)#y=i2fYE4ZID<}UiGnT;rDHjWpjH1D@L_;e@mE^Pu#&}=A(sX92LDl zC0l22)@?=>TTsr>Y5L%l0H-s0aNOA-_A5%$zO3n#$)G9mtG8jS6nndnC{)x}=Qx!m z-+i|vw;VZHZHChc8`;;`>1Do#(>ZUW3|_3oz>8Ha#w_$3MUDr~vPZE$xijVsGnQf$ zV^H)-%GA$nNe)4-SwAO!@{wAJb_eABnZ`)}Yj@;vMOHfv^6sG_O0#qm>-qN|v zNF^IFWqg~lokU^O2WXdNGBXrh)W=SuAGELDndlqFyFJLpv6-BN2Li;?>f2q(qsj&JPz*#&2fUsC1-Fb}DxyKlL z0P|mJ*$1gZTB2UyZ((7NF#An~@_3RlssY@4F6XNPqp>JI|0O$Ux&P8`RXX^pO5ewC z$xd!#rvPvv(5JtTyHs?8i&`BZV=B7sHqaj~M_QF8nRkv&S*FN6rE_&kUN4xZ<(Io( zo8Yt$W)q4l%@Cm7%8RGR;+qlJuZfr_H>u(yx9nQEa zC|w9|lFQz46#~8PZObH7g$;1wXR!y@j2{{zw2qesBo(kvl80 zKNX{>3LpL_6g8-x_vTMdu#eWe`z^+K7=bXSw}DpFZG)B{Dc0b{trm<;1pU#R;TQX% zx=@><>%Xk+!Bj{hAC;XO*E(lN`u+7%f7LduBLIszMT*+V_Dlp^F&cM@E$)5g2_~NyVG;O6zjegl>wap_?)wK2ckr(J?5?uVJ zjt!WpYf(>cEk#)4VBNW3Q_ug#ve@Vgm1F$^#>>MVNqxO8Ix0jDiu)>^BWXpZl`MT6 zQaxQ=)GalRI9e}PGQi!fW|X{g%eTVs%rd0f28or+kosetION&UrhdlDK)o;bm{j7A zLdFZ^hnbBH-Lzi08pr2uzYk zuq}{Lg7Ok4$Y5HE#nBJIBYc6@SwfW>WNl{U#zNRfUK%ow=rV7LUlr?A<_rCj*B~Qdj9A#*{<-43eO!)&NTO@U180l|vRo0bbNC~z! 
zB?z{pV|&1Ls~Z)3%<6O-tn%2HHk0$;i-Amp61K!P9elTo@{({^nW#47)o3TRX0{HY zt4n}1g?1LO6f<;X%QtX3CkF2|6*uE_qR);Ti*zdY^(8RY;cFg_^jodA9JTJrO3RUm z*a%1}sJIv}F>1aOb-UunZxJSxZ+@Ue3NF^~*OeyucQ*En=g!VHi%GDG^q_;{4*B8n z{)<5Q#6yJaNsO>MK?{9X(Dijh<3vGa{n_PDBGo8-?2CIc$!00yv$oClL+3B0%!JA`?1)gc7cU`r+dTayC7Ld9v(?%{jN@i6z@H zh(4m_%MMtc?1%e%-zn047Lp9y6AN5jaQNQ!_j}dF^rTOUJFJ){k?{awbC?3q{cV)a z_r%}RwTm&Z!z?t*Qsgwj<8hl8~c+f!qe<^v1qhRklk`l)KD^{v>nFM^-v1wXY6-kecD3EsM41*)f? z+hL0xPjMdIz!Ii9O}InTw<#DRiv)uObxSS433r`H(Z&}TXebjk@(j}s!`(Z~^JG3) zAM8u@ez#jjzh*6q>s3g-`YVP@6(3YY=-R+q32BALAH^8oR22n_BhH<#&`^BTlEU z)hUAApiV2`W(iaF{cnEuY*ZVKw3QCaa^aNg!+Eh(LM|!(OHoIvIGqnIOW+)+gT4>^ zx6$X{p01xAQ$fzsw9=@{W}`Q;pU9`7_^; z@vDFjqqdNXq#EDt;5ouQ-=bJ1tET2qeitg}Rk|#TUP0ON-6CT^KM9CBB zKuDRkcs%KuUZDs+nSF10vk8b?{o-Tlw73J_v20Zcu=5{90x5*Y!S{i%`x-zUDy4z# zN}67v0bi$?IEaF?;eW7X7?{{>kbHeG0l=I?Q*)jz%6lNfk8}Z?)yy$CiRZ0^LNTxb z*p`$D!x}Iz6sE8_W+}JH48w}yyD3Y{dH2-XzjGv?ZheE#YD+GjmF6Po5r7xe55uDB zUqn%t;e}S~;dFwm?3!jVL%i3K*2JAx-}HUzK|X^Z)vJz9E6{&Yy&kp z(3mk20*gMZ^Ys4}fC{i>bgOxijrtbjLeP`eRF=?Ly2uj_wwM0iHJJU(U z|7QwlaS~R5q6S7~fm*#k{r@L2|KAV(7x?_|761QS)nzpNme_TxF4hrqD}Rz{_%zB) zUhiZ9FTi?grOF1e0YFKL&cyFu|@> z(uh7?O^-OuHXq1*^5&7M^95)xA}EvX@o>C+v``y#k6!IfMV##2pKfewo>rEji?xW; zb_Iy4(DL3anF;*_^jFzNFfgZlz@h(p90K6|=FRom2z^{e)aF=UrPahB)b7*dh_QOg z7eO#PGuI`>$;ldmqtOIclk*4VUt3t6YC(hbt7)D&Dhad!`*82`<8PcHa=!w68za@; zL}6GvyBzKh)T(CTED#;mQ8*( z^!ymVDC<@59h;Ngd5C4C0bY!;uo8H=!L-1D(@Fe0WDocWw=*QYP3Q&W=wHux(8CPj z?fSqE@%A_Iv(6{L-*NcwfqmMKZQC>q-H{eLtw^qNQ}UC^){}43->FoHm1@SC_V{rR zpq670n}rrvAmzckr*1uAN@Dg3#dSR+Lbd)>&&rW^spTkc8jNkuB zWdC6F-~T1D&+|11m#IGqe^^F&cKsDSyc92OFuEh zxA+perh$pQdPDzGW=PB~@@RWoQE>6P>rg5mOayfcDOv75^kHlfNo%t(bszDH zNTh<{k^A;!$^YuZqa)C=81O0!A2x>0jrbN$&8BHwU7Zs2eN1%{x$mVv zNtp#=2I-lwO`Qs-b2>Y5ep|)_h)33%)|jDdI2)rj9_`v7LS6JCL{CuFtpFD^&U!X0 z!na_mS>F~tUmq=8vu|%KQ_F#xT@AWTu>}V3ELke=Z5G2Vs=}P8c^!eD(8&7NPjDZ0 z-Fi`qVG^qJwp3%qE3N}Cx;=gt0g7Hmo&~Z$N&Rs>Oc!((tNID%K3hoVDy9m;z&JMG 
z{~@uoFF~$uJm1@t?(>Us3Z%axs>Yk~&4*bZ9GCp97XYoj@c0yo_FpQVJ}J$b&NN5#&<`5EEn~P(#t{j z{0;<3VV13KSVaRop(&k~Sx`kr@80Nl4Fa0BUw(cBzh$8rscz4_(eZV76fY(flcV8z zvDc3HWuK*2dp-$s@m(&atzCVuJxLD;S&{Sk{1 z1rFkSVC3@F!T}nFuGd6#F2~)%a<@#ytqN*I+~jA*gXLez{DOlxDu6nEi9)hVW=wB3}&V;=kE{>`#~R}VTibuQ#<`_3`rQz#X#s@&Oy$EEC}U(kU6j!pBvugWs|E zXKZPHKJ}OnB-5)s-kx?(NCk8KToTx{1&_Ql7pLRBQ^c{wicUHSJ)ELEJsO9QAsaI0 zPHXN7X{)z(;?HJEZYr7?2EN6f{d3hTGP$gb8bYz*E>yaLF2t#%OM2{IKfv41Wb@1H zA&kpS`gmylK>VclG z(+wX@e%*N5|5VIxo32K*zq9@{eM}#XN2ougI|XZIrAu4ZOob-!nZNTzZ4D`XWOPWd z5kZu;)1*glU32#&gSl3@=E}Q>cskh^Xmp)$zl#z&s+f%TW_URrO2+BHS7kNd70yg6n zCQ|b{BSLY1xj%uw+#fJ0{8PQ4^&>PHv>t0N+d&M!-l;x4ScTnpoki$Bq0TU&oHA*v z>4L@ZtP_5bPU8el`laMp$)39npMa(ysLK!>?4C6S9(M7?kbn>crvde8U3th7;)Rq8 zYgxtcsPF6T03uO4*I94qx%*KcGHG;~xiPuAfv3s5x z)=Z;OBUd$ihF+?s;ScxvZw1$kLZL1^5s>-gCzVP%jn5{W$5Hc9Sm;3S7<^%_##Fb9 zClKue(j?OO!;e0wyEzd>x>t5=@x^wF?UBUvCrJek!^FsyS{oO23$;D=knO3eHzxCx ze3orb`3h&BL7nDR5N7E70?4GYjF!YcV4EIp!OoPmhp;-O1J7$Isgt(uzPYKW8GR2Bo2UvA<#2hv@~$ysBo`;eBhp z(|rwwWi3Y=6C>Vxr)D;lVRV{2^=&+Gq=%;ByN@;JBgoqhy0CpSK9hcNzKv}@{oq=` z8pmFd*+-Z6Up%cD%uV-t8Z8N}k&9y$cofrp+JhBEb(g;>yA92Hh9Uh);=H4SG0t@G z?<&U+84-hk(<@BWiSMr;OC&o}@iFUsR%QrszN2W*)XHq{5&a1H;uK$A zaqJshox?qwAE|hTcI8vgj1G-|{4iq9c{;Ch^Ey277Xu&{ZZb*&(lq#2X(q>rc$uS> zWshk0#hYw$*~>Q;lFsre{Lk^3i_a!fC)VUO3vAdm3*A;wnzGp$E~Ca^9NkWiQX-N$ zJoRVl$v45=>TtNV6Buf!Q6lr%ZJ9OBRIKx*P{EC@25ecZ+yJP7Pq+6myzj+esq&&D zkF!nFC!h27^qQSf89GH*RTd=&Dn9fI$48GZ1&%Jyr0h27bP;^zaC_rR?l$W)=U8M> zf2I_HW{!L!jq81ls2;+oNiZ9B26EVN4kO^GX`yj9QBYt)8ukjc+&v@n6t5?qmLB>ihx|>bi)kN#zd1 z*zPus^YbM3N2j*hpSR=umr5$NdXF$PGoXUFDt(}K{>hAqc)nUy!*Qg3UUiZLbOwNA zR->=4JYv2!WGE-zB!AQIOm(9`rwFw1Mkd<8(L^_VxRqsqAAL_YWCpyIvaytYJ|;e) ztiP}WecX0I5JP+Jy2q9D;$zI_EF7*|VIGZ4m2WAOQbDt8X7+MDAyRopN^JXF_+jQI zE^>&OoK(P;H(tJ*fokWuZt=|t+k|>Km368^p)*~l^aIagNuv0t+A3L%9Cf=dT~C)2 zELlf87MsCnWpO{f>?sFyQ(U7oJmoPH&Nyl<6*ybS!>@K8|;iP&+z_gc`b*9vO0tpmNfJg?mV zQJKxL;`@nzRBsp>n5b&2_>plMOxwl@jqLcGn%19hbaOJ@a*tzU&8vzKySxrnXoGOV 
zPk!$qBo_Qsq~`VwJdttx-yeMNx1Ox){F8`|+X}6F*?thKl(&40a5apUYI-)S0nr!g zWq*_4wL|)~X{n@{RUqYi60h~fF@S!vt38A_w$7J`#ljkZ9Vh->nsABFl8YLi;dT~2T=C&L{8bURb09o;yL2Ah@o+UM`s3D9g_ReY5X3BV5ZIfm4eDo zhZN1s&0SHVeZKq7CNF6l9pnVj47wY;DI6*ak$%*lrU~FuMu(Fzi%$#;zaW(aW(Bv+ z0{0XjkW&*es`AaY76h360G1C4Hsl!G01@pA7X7vJHA&_-INLSagD1PfIoR7GiTxU& zJzaz1l!{E9%Rz74nk}odB#P{ZAnMxV?Bag9lg*=g$yHRtE84?iH}9Tv{Y-e?^cyGd z#9sl&bFfHP&Xi_PjjX<%dNX0EoV|@*pM*7AW+W5e#$3bNtGzS!J`z+P?8(fgB$oms zenWSQcYp53NVl=nEy|M_C7P4~&u~}TwlpV`d6ZxbA~M~xGgL4NUy%Zz+Rm>QS+hC) znkDstmQpxgbtgn<6a`bpU%cKch#&W4^Nfg^@eVGS2Uh9*;a-P2=7?6`(^(dZ{(D3} zAK+uhoX;S5#gAbkcU?eELmq`&-CAv6%tPxS-SQ8WCN&p-qEA^3l&9ib*hn6eQ}G=< z@qf_Dl2!w)6s?laJQo{w6a*`+krgaLP(LIGIv0 zrar+6?D(!ma2IdUcpy_swdkdUBIx!va5piYRh`vvrxi!XN0q;=W?X)FFI19ckV7&j zM~BOv1_;qd@wH42Xm+A{*UwlyKX4O^`EGWaMQ6O=?^`j~-g1BFvCEPUSFw|?;8D&w zM7jNFSJE92^OIPY%pR-~7ua}R`BMddkJ*k**gs-3^N(xFhjpsodkpVw-9`OoCtBj4vv)@?>Oa}K22({yh9@M#NlTYYGKCVIYgAL=WW z&OPe6^k()en0VAK=^LLXxJIzfFF&^hs>D=ub&M_eTn_6YgkxUS9~iGCI6nh zu$6kVj#{enitX+aaeT=u*&BCnL}A^$sXGqEl91I35PwD_#Y_C)n#6;++Q)dZN znFq3rPmKuP&@+B!5`T8{&AtA2YF?rVt5-15tAnCa+dqjtJ*h}RKB$f3*}>}^Ph?MG zzM;Bis^Dz?hQU=KHPW#C>fHCL552DB>#$h_H8|Elj=|B}wh_Jn33s9qTLgLIC+SzG zbf{IatBE480kj7l;Fm|mzRp)C?PAlc1%)x^wG(72rF-pSmjemgla|G)OzfxvX@krC z1UP^GHXJ<#U%o_d5m548?gopUl<8X*RdSWX40#4Ki~>A*iL=&sd{9DT1)7{=P}|vQ zK@&rInsF`n<+yUF13c&bBK+zcj`+$3SsT?d4xe`KXxPR4WTp*Ybk~2YXu8?moa}j>I3HO3QswOR92T|S+Z4JtHMj9-}e-um7 zA<>yfXg>I9_zszdzVpcB(y|~64O>w~%vHi>C=Z=tE1n0W;cP?Mo9U9kS_TYEv~>8s z^8%L&el}8kAp&i}qk3Qy%d_uKB$<9Ui-mXB*Ieu?E&O7c6J1X_#A*JMfrJWv(#lzv z-dk^Y*qENRt!)%Y@-@j$-)M2i_o|jk*sWv}I)LSReQiKO48#!qG>Z$ZhDX%N5G^YSsAliKA4cn$_J% zkDYqd#rWz%D5ncbK4}S3x1qylF>E7M9zTr==@vd(T10+2=(VRqk3T_%af>DDQ5xSn z{C!Z)c64A4y;{RT9L=5@gyk(YoNtBYB3`rjjBi&QZ~(W}_nY3?`+J9jpyd{)!J+n; z>z~SLlG*A>>yoCR=PNO1NIJH?X1W>s0?)b2^J%kQc>i|8s=@1Q&ej%rm}}X2B&WgB z9nqz?sO4fG1@$q3!#1@A-ptXR$i8hAYB(45q3mkZr9>X>USEMQMLDPHxAE)A?tU zPy)bf36Q%ZLC|}f@r?-P@?e!s6Y;D{<~EC<;xY*d?(myc6BG`2G9M4wcFTK$7qg6h 
zPcN_rd~EZkM1PY!nM5$c%qWz-%5Jl;JgxS!b~Z9;oK?M?P4$O2uJm?A{kgg+cDItD z;RBxMHE6>Mr6g<-RewWu?7encZ%NZnsH-?qMx0jLhSLwO17r%Ya({HWqy9|dHM0Im2;-Z zuhO>GUCjHB_ay;Ytu>TNqoenH{Y|9ES;_fE0_SloGgjyG`=Q*3ON~THZp#&JWT_0T z1NzK*>3GJrf{M#&YOAk%`>5$WxB3nIc%8jkHgs-kdT5UBKTa-t;dD}@1u1sk}lSIg}i1wG)b{jT$=)s zb40XBI`;^2S2^Q6PFq7c3guT47a*_|O4o%iELGNT7$ORdKyWDbQ8`Jc3VFDC9VPE= z#%_n_QBRMafol!ZfG*P`PA)h%`VJu#UW_UGKtK5X{$X%^;>PJziR20}ai}?6Nv?%o z9N8ivp2(s?`5D;{GZ1-y4h_G#{tly82w<&dk^aH71=mTTPZZ_~LbJIKcDQk?HRGS2 z!l+P9I5f3!>>#z{8(R$DwB0MQCYMN&CDNs8y< z5w1-^QE;{G>@E&;KpO4oHPeAr&)JcbS7uV&#=#Th5z2u49Z`xJho9Zbh zrxH4Trnz3|;WcqB-Ouv++4-UN2#xRWmRr9BxGgj*)FkKAlJ|O?P)Ei33wVT+)80px z2~yj7333fJf;7dNlew0~XVS}Qmz6@$vag=)G_GGSJDJ3;5}RVlHj!zyzFWsrz7^+76RR~w%}~x)rKfv+e30tG z!S-qWz4gm^MZwdGF1RYIwfdgEjOx9hR5^~TGv68#&9&tQV0CM}k0)}smp-;c^H`C~ z+NhY|#biIDysOn5|s@~erq7|4hMdm5jepBpH`ViV1mMvqp2z?xw#c;Nd@ z*Of%*KGtO}_M_Fl^wn{$_z|DxAVSob47I=vjpxO6XeV>Bw8&Xz?S9)%*}YRZYb22e zsEbB_Wu@&tjD%k`6Vg4;EVJ0Mg0{}FXVKmqPIb@P`o46ybU(6|3ho==dmSh8zMd`m z2w^7X8a?IF@s`N2mOEU^H@Oa+>Q{?5_GewX{~$+heO}mOu4j#WcMY;#J=7~gSwigV z=H284Ph~CW6m#kNh-6?%)l-9@Mo^b$3GnL9Sfu@!o^y6Fey%z*sa4g)kKb&uPeK#9 z(;!xs=by!-UiWuQK;kJl zbsdW;*|_Wa@~(%w?Dcg-xTlq?^+V*ZV=iJcdxKJCpnP9NM8Y(!gz=r^q)pKX?lgX!cF|eM9 zPL}j71K_U`ZM9K*h3LYucReQeP;v${MTLyMfYA~e$L^qZ+_Fd3SQfh=(|f~3=Nh}Z z>Yj^|&vYLZl0dEsP3Aa#FGG`sP$Z}>4rbfqn78|UFAY)UP_o)=5_&C9 ztA^Hh(2*!>L^|u`J58FA^OGH$%epaEdWWyY2lyHqFN>97kvnGtM2H|X7%Wc9HEJ#P z-lTPP3(>6et|rR2X7&n+vYD)q@zS4ln`4eMOR=3JRb%E**R#fFY|p)fcJcO$l=_|? 
zTwPB5`6d4ns|r%Ga4W(kq54ms?{UPG*(@4y8FyK3Yr1UHw`DkA_edMo!<@}8NslOIQ=AzEdcU zGs3gbLCO1KVWVhaVmstLJuFZT*fRcq+p~ZF`!BnerQA*+emT+Svyws9chd6F1|&kn zZ-9J34{+(&eW2IgP9?MuXI|6Ux4WB@|0UNh$9c-)g8Ke7Bd z>6_X45Nn#sR+UGi5JOWTUGe?eorcW_(G{($us?BPC1r!0ps~Y`Rn}c796jjvMY0E*$D;?hNoSc)5*3C z>)t*yn_uuHY?#eV?;FBCyd>1!!TxjF{&}31>~`^z|JkdW*SeEX7@WpGqW z=f|I^by1xA?eM3z+Aveosi6(icxd6Y+j2)&xMswbx9`BK%kHjUytX3SGlNl-H1GFQ z*U25}xWAaJ^=HJ9{7L5N8O)Rl^%Cl{bs8BcgdF&c;>C`oX^oZF4K#e%eqd^l&UZ-C^r8AK&-+{m0i(D={zLH``VlOh51`k%L^LB$K2F- zsz+IY($w0`LDG8l%EHOQRi5BH%J}AQ@+1O4L6EQZ_mWY<>g1QS8k43s2iP-?;ulOV zyLm`9I4E42hI2bgzAk;^S;@;s1k-ypO3N_q!}VbiyzT4SThDN3 zc5c}f7gmny_-vJQbJSM%c_STd-Yq9K6% zBmOK^o++a_PJg#

PDNbWo15SBVxWB$R>C!v=&Mm*<1TDV48eN$xOMAZ8j1jf{XFGLZfr4lI19Ch!j z)VPJ)_JHepJcI-FpeAHFO8$$bA=uQPuf})G4Nz}1{BgwMFEUsh|L3xS$1-z=jo6($ zuv&#xwdsB}KGYP&tw@>8F6CPyPnh-V{Hy?Qb`@B%-8Z*U0lC^=J^cAZBWAdm2WfM@ z2o2kSBR&nrcazRZKislGhn(L8IO`8@@uw={3hX`RDDBN=V~H`$ZSSlx9fYf6^L_?J!Cpq#3>B<7vV3$bpem%YrhO{(AD+FF~_ zSb4YyfS)u-({aSYr3^Wipcoh5JUs4As5#40Q4A0tJ;huMXtE*GQczSZv0%SRQo#Ch z+|C%&zU5920`jhBFvQ=&5guoja43G)BV-Hx{+jXU~I2H#WUasnSVlu%I(_ zlW+wFPNoaIjy3btZ;z=4`r|`&GAwNdv%`6xECfAhz)z?i19G&Rw+9fDRpXPz8>l=HUSmoZ13F>>4-{4kX!A4};1K8jkOC zu0JC=S^DB)>1SZ59CF#A=?|- zIIn*{p>V@9@S#aWFf#d}u#LV>-$^CUVRs)F-F&5G<_@Fczt7`DIMfJ3cnJS$)2D6* zmDx-yt-plE`b)gzus;pYCx0eM39}S90TFm|brso@W@`S;HC5)>C8^VEJ3GZ)^gReJEP`*&fj*6CKT$xHSeMC8 zMi0Oy4XIVH@3@(vAjlC;Z%h8mE89R?S$8%7G5_6)(0Y}~^);@;X15ZX&Gj5chlUB0 zlD|M|rZw(oH4ADM$B&7HRz~~A#Yacs$r5w%tBjZdDs`&yX~8;&eE5Z@8F2gNPqfAZ zh|#&U{_t2jMjv!y989M?ODgYDqm$m_(e&#j4w!lHbo4N>>H8%VDz~}Be8H9BFP}>F{{VveqjyA;d zM&Fx|LCvT&Zq7dD6HtghDQoTg0Lh+6LKY~@H_NWm@Y8+F1NNqo32S+eA|xJ8)#BA2 zjpC=Rt41@}uKkY%=5Tyb{drr=;a8n}DR_-rPJyfIT!p$=t4bXmo)7}6Bii(XSNiAI zbIPX>0v3usaC_;HI16n9!c$>5aLqZY;UDsWe)UFO4Gr89y;Y6h<}Axg<^n!9671=> z_%}f$|F`k9{yDB0TxQaK2H{_eLDE+8MG$-_zPMu#Y8|@!Z>^ihYk*PT61*>-1I{a7 zcKKK#7UieQKkz;<{09`}YenIoW<2x2VeS1bxG3&(#*tqn;zTg9j(m<^r*@u2PSegI4YUI6A z>}WW5f!I=c?=(=qZ9U%j=Dp`|oH*v)Lu(fC?5y9Y+z%fP7DI} z`|#IPyKB%e7$2d-y}vX5sasY1Blg_=bz3)6X0^A8ujPE1X(h)y6fUi1Fi5UMwZ6|P z>c4JO)&Q(9GpXybu?~Tr`z&QqD_}-Affy1{LCxVkejl{lc5T<2`C2!e!nZBIYCg6d zu&DSrVDt(*9x-zOP(r8w>8y&B=I4TcG}YfcWT)8CQPOJn+!iFF{{YHPH2n^#;KAJ)cC71 zK?B0OI028eW4E8FHO*;q`qB3ov~YLs>-JGZq9%)}SD>B1Z*Y$DISf-w;42}EI5AKc z4uVwvhw;N~AYKF90HiH5LAL*Ouu}g%U*`!Lc&Q*-G?tmJO}Y9xoJ~Y8&&abqTbXQq z3{N0-n0h<2pvBA_gYa<>Zv(WSloisG55Z&OM>Wu6-v#fZs+QrH>Ap2z>t%dd&~?Xt zGo>`+E_1FdBy%)<5`gXXpK7TFyhCd4E|8;2$QQ-`+GVZ-868LW_!Owa7(ICsj>?63 z$gh#sH`~RjmIub(rzd@DA7{{x;+i_AS9G8GQXVW4H2E=8U4`mIxeE@umhwl>q`HDG zvxvvjuA}N0>WNZb)rf~Dl?d28mp{3*`esC0pB^dx?#n^lA0XOE;cgAaPthU)G$Suc z!Xo-J$$(F;fL%L?{>YE5Ys5>q__9?O*CU!mib`%Vyz!_nN3NzyBvgU)@P}ZZ0cFnl 
z)yyf6m0Lqo59x)yfjH7UrqrN-5w#@Gt19x5*=61l>GzGau0^kNu4W;KBbD?91J7V++^v-L zYnA#47t*B<0Au|@oWXyY7vLx%+nKwM+oXw0SOPESck-GWu(W=xe@`sV%N;)Xp)lV6 z262%dHt1yk_+^Q^SVm!EQaPvVyXPEnM%x-Kyks5@P9UocUocTAOM0t~yy}SSVeI+< z4MmYaUW5&|k@pf-Ij@Q)Ht)f=*gUgBoJuyq^%#kn@rxC8GP72J@2P-smnUd0%nMsz za2%mbY*07Px~DN-(xUA1$}HKlR9eHtYSoMi1bCfnos)HbfyzD~!+}D1eV+0KZBh=Sz`>@k@kC3$}wWElS@K9!FQvLrF^CW+|% zM&D~kdLMt39CEThFH+E(=lm{~l;(=^jIE+utS{NJheh;BzhKuH!aWR5HN5&F?*S=I zC(?AE)7N9-(cug$10g7$Sl)|BBKGu}TK*1@)AnV61a}7hC0*`q}*0--&o4%9zT4&7EMK+CbHoER^_gL*+}H*VjbRj zGJ1FBMnNN~6r{QAdK5r&!-I)%Mh|^78Ab0fs*Yt)s6|)NfF7Cc$L)$qfB+E?qH5=u z+^NwjU97}_c>SHT3_R{PtI<`WjRHnze31ruq$4N&;@>QQ_LdWd!&@c@-+Z<*!16>hgzJ5M#bVMl5bu^Py-CkTu{YJm)%=J6iwNsGZ%`1j{A@l zAZiu`X8HDJ**RnN*f9+yNi_MX-WmPay&EhHgO6l(#yG7I8FjAvn*Nx-P)jq=afm*6 z`JcfvvdU7Ycjw&kQPw|$Ts&-!;`zrujekO>$EaBL`NNa=cb(Lh@v95N!}V5rbrLmn zG#_k=$#zh6;3Zyd@)cV*zu@pXn^Mw8jZ~`4qlnvxr~=s$(hMm#vbC$?(?Tp;mo6FI z`3D#P;x<;cfXv{N^BObqeyMaz8e8o1C6PYYk%LzjIi>PU`jwntNBW4se>ofQ0BFuR z=Da778xmHxzQ^*EpPiOJ&6W+%21FG*5FAaiPzT2;@#ImjI*{7ngkjMDDP(|f=W30+ z0lFXLiHohH{SYlQc=5xmlJNbUb~V%9wy8ya2sB_R8cPYMB5H762d(WIzUis_`w2rg zwJc|8kFNA#mKCuY?Ez#RLo@c0PYh!8(Bi_iq>Fsa#-N-%fwZRC1Pz4INXs~1wsMnc zN@S&(Ph-nz<0+iUeI^I=@0BRaqRZ0DkVUzmYNG}7a??g<)(FQ3zf<-j9=xMu+3I~J zi%W*z5-&lXwE^&A??(J2AdRu@X{4MMG}CqCK?n(d#emHlucxS2?*%#2rJaAAjMFPs z@j!HR+X&ah@*uKXe%$|!vpNDmN9GPaFizl$YpZ@guYAsfkr_}-m?&R@jt)L+iFMBq zvxov&Yrg(jtLE7o{|lE}#uA1c?l}7HlN!Or<-u}wofpd2f&x86t1hUFNIM2r1~#Sr|_?C(F2zzmwkI zvQet5{0>O;P(}V*XE0D|vD+D#5Qbc%ukde~l8v4zHcN^5d@^TejIYl^SuTGeSdgt0 zkF}k^->mS*hvZ`)55G$~KXKEghHW@$@3>71Ilu8F)vtd%XCI$8{??jZ&^qY=?MC;^ zks!qcoQDoN?pK#8kTk1>C(}12Zcsde~hI^T0TgRaJcE!P&IA8`D16D2%z!>rySV0u2y8o!1!b4v<#X~L?C3^Xn_mG~Lk&LxUH%%zM zj+wN|TCttBGB9tWDIpO6;^t=%H~bQEmr+SXfVYQ8`wfuf+@aeeI*?;DHeY*E`&Wji zRj)@TVAdG}Fqd9F6a#&u({cm581qGGZueP`x6gR?7p{8iEQ_xXjuqc#;xqCS1B{Jm z@ypCfj~=*I@dadf5}SMX(!%qnG;jQC zEZZ&j$Bi#N*iEyZskmz|%M4UY4+QTr447PRBh89;SyQVR!=DfX(TH5>dC87X;eMg0 zs@nv2osBwg2-(UpIwXVg`O6?Ozpl8WoxM<88(g!w6QWg0uESFK5ou$RWa96nc_+3( 
z;$`pM%z9+zJJc$2MG6RDDNSb(`~IH+S7)c1 zhR?`~5PK9$tN#lcXN0pk&JTYAH{W>(2Lq|arSETFgv`Bhpq~F$@ZFc3f}Q6_A9Xsw zm($E9FrpL{Wvd+2t=g>kW?uvU05)KB8MH1q_(04Dc9l7b%a92f>FB5qIY4(R!fYbg z$q;nJorsx$$f}OfX_lvA`PUVutsw2)#_)GAa;!rklfpAPtcM?64=#_T!3!Pxi`5j{ z75MQURhM7owLdboob~v8xq#mN!}>QTF%?b2gkvAdDLHb6LwiY=bo@t_5l60ZFM%xnr_fr%2+bb? z-(W&4i(BQ6!sFcf2Ff@I?X)<@x5MAsNN#TiGIQldTtneKSeE{VreK_HdbLnM7bzGu zjAC9<#Gt=a8t(;YWu8IL_uSmWk96K_(q3~QnlBLL)i|uEg}7a8D*yp^{&olxG92hU z&ceX#u=Kr_@%~G6QI|0hndt3rp#thkX`fN8_(OaKAmDh90=@4Vdb5X8#sZR)6) zMA~UTT6bKt-_IPC&>T_z>#X)dpz!xY+PBQuV-0MJ_aMyh?$ray5@(a4!xIBnk%4bp z(dOC5uqxomusdMD0y1-ojsI#!c$h^B2!(YM&y;gfZY~xB^?|^N5|{N$&SbnTxNMF% zr)*>M^8n-wX>ar7DFua_nWL>WUvy1Pu&L+9SCeLX*Ys7|b*>3|bI)z)xp<$0Hn;ID z9GQ*XATY{5`@DL5EAl)%H}kj3 z3TROnWTVXhgkgXS-FpL08qP@bT2kRvu2s2TQTepDU9s;(EWfw6Ua?@*5;Gkj@`Hxr zIlgx$DN#Sc93Hm7=*#j*U|15N?BN7$*UbsnwFMvcxT#SVloEk@rE(09iX`fq+Oyt6 ztSf2+-!rcgVNk4Y032^)w|mR!dJj1hJQy3nH^_mjdL&nZnCI#x*7D$9nWH=3*2^bV z75Q?}HYhfaAE6}Ex|JkTgu(@mo@L@oHI$5rAvaa73cdRBfFz_ZHKGi47yp%7LSz-W zLn||1#&eIvMTcAE^5}fe|dW~hoGU_jtZ|F&S;4e~e z%koJ{z9cy(^v*a%U~5sLN@?aL+8NH_0VQmZ->TUB`|7U^aQ#iO_#*trCYgWNndi9C zfJXg4I@VO?e_b#LDniGshH^Y`#xQLF+1~e58&OK*G-5Omp-evUkOMhKfx=qySwsor zxfPBX?i;<_Hz6^N(64kn5J}6ldn1Y_oMi7Hwc>zX{|n$T|K%wdgZxm>SE8#@71hb6 zqI`kXy9Cx)z}+px<> zQ9}fE#X)OcsC+!-QkOhb^eo+c+Qph#2aB{v*T43iQSsC~G|K!p%(X)>IY?JG#=NV-eTb9ycGYx*YWy5A_D91?O_t;FE zjP>i=QD;eGuuUEi>OMD7(gta{__n^^Dsy#WXoCUda3N8qjzJ&RoT)GDYpu_J;FT0P& zLwN~g|H2;w(%QIe}P6NIR} zCZVZmAXv+POLg?~buASif^B<=iUGyFm3z}pJpv6>|UR{B|#6K{u7-tV%9W8yu!>4qL6}a!3CZ} z4N|);Vk_`HA>44nEp3 z=YO53`3fgIR^XHgYGz4_eBV+S(;}@Q|0=qcvw!pEuJPh}Ue)^&_;i-adD0rb zf&C_Tu6)=s8GmLcP7T`N10)^XXSFbABO@U+`_|_=4O?JYU9%*&YMD`$Zn#xg-Z5n_ zAbJ?b5KS!#o5dIRm9j?cnnV9jMx3A-)Oa#}zp>sZ$@8;zJ>Hx6#9LM zQQZUY9J1j4VG4QlCm_MQdrf2#a;RB5h4QoP(cbL0;$~lSa6E2c8c;{&9j0E^Dv}YZ zj2)I5C}DK?5l-5;?Hne5WYGkbY2IIJvzYG>=Ku zfX)K5UdRP6U&*jZum6+^E9$TaF8U}$?PKMN3w|-?5~<(Wk?JEV%v|o&8-!QUjC^Vy zb1C>)5cC(}6;gRaN7v={8e(B2j>zOhkAFWzz@awR{PuoF%~~(UP_Z&jhbD+_;thJ_7YPK)BY*-lJ` 
zFf)>4AS8G^f8XlUN2}?Sh%Kr~#A)?4ftvMezlCkUo(Xa0ByCsa!%Du4sPp>}d$ht} zn?@!7qxigJobnBZ*Vr@rQ^d(eujXcH8^%3{SwpIS#P9U~jfp3VCKMhV>-7U8HKG6AmU2jDICLHO5K9Lsq zc71d_O&w8LZ*futHD6MSh;k;qT_@MP#m}}dm)`^q9K*afQ_`BPI|-sISnUp3Ah)b} zWC3tGaBBPQ`8ZvCb(L$s|65qylVo-JqBDJ)yss&I)f5c?L(<-Q-h1y0l)+R)9(3h5 zz0BEk^+Ee+`_ZE(xE#CeRK$l60nHQSAU!biwy3aD{=!dOD(fnD*>;py*~f(6txN~t z^sfe`dJ(c4aVzAoH1py8L6ddOGFDvgpxrsSTHidXNcUHpsF46_%0)e?P zNr!r=*3JzcvHBxwPuL4Jp5u%4-DHsed_hgedTxnne~HOe1`T4H5?Xn5Zhkk927SSi zs0#f19(VCnrB*;maCzeg+><_Ve`T}3>c||LL&E;YB};1VXIMO;+*eX`C~+0#KnAfM zx9s*t(+*fX4ivosc2brEnU$t->(1ZHl?-+SN(2qn%wyC4s=%)X9YQ4tF6Z2@2CegUl&ObAx2$O zQCZPYbWWHztprV4{`iY01~aEAIU?orYpFHtg|#J*{j;ZZfa^H_Gf(t{7*tF!!KZAA zQ?b4zVLU0tbSqwNxbiCFPHiIbF5RkUSOAX~x-E6r+anI}VT6t=1Y25{zJtg3tZI7T zh9`tZth=&^eIAK0&t&V#^>F)qKiiJiBz(;d7YhqrG$+7NG};HUI4lUaL~+2?UoRZu zQ=O*w7OLSq1aQF5$UhT)xX*ha%f}a6PJ6!Xr+SH)1x(#-{adbA%Yv0q^m2fDWv7E_ zzZ1=ElFT%(;l@aWoN?zIqhHt<=xKuY@FLN8ep$(gBJ{$(OTt!+JR!L} zmAHT9#jA&<4OX{;Q#prs<{2Ijj|gh}L;IVtUivw~Uxe)GyNAj&TCXooVc^U+;UZokwT0!Gn zUsnM4Bk*ee@FNd=UI|*blqwOpHSz5=SVS`-NWjLXnJ`|ghe4ciV^3b?`6pY2I_c3z zCQnmj;NdF(Lk$b=zxpsWcil>r|INsRF zD9%FBo%p>M<;_@Kosg_Gd`y0R1Xr#(h17WsfdVWKhmFTm+6P}Gi0{ULty9~jx#y=; zHEiR8sCdVj^A-K!?+8r$8tG`7FPOwNkqjDX7&BYXUDt6%RT2pkC!qM%Y;TnJ)S8D&leDy|ZH`k(VV=X$z5jSL4xIbdpuG zQbHYsXDyCy*z9yQrQY{c7FB zb2(ey=qyiL%z~1eJV~pLoW_Vi^s-0NXGsPauL8+Y59L2j_1Z|QR~ zcOJ(j+P$069t$S2F8>(wr_ScaW*N2EG_II|Do@=(n%CC-8hTcul6tAjE#O2^N)O)q zK9}erWu+__tviB`2!o<8zuAdbo{gQ5&eXyo^BIWkK*u)oZ?pl(jtnon#p3BlU}EG& z+Nry?@>Yk_;XZaws`mRHFLFQ=HFccu3H$7>@uWKQIGV1TsZv6RzcP1i zL=BU9(NJTCG_+?gb_HzYn^e`{nS2Ze*ck26W+y56^6IOHT(=t<*IE@z_2>;ch#ve9 z`GA=wwLe>tnQXMZrheZ)98tFbB2y5b&Ypad=5TKd`EqSRfazoWZ0DlAJrH@czvvzu zd?}@+V>35S%1qyX{QeosLo`JBc|$_|iyD=;$yL1YG5C4@k<}SSpTLL_Ob{&fQylL}H@eqKj$DH?@lgZDWFu z2PrHW10F%4LL7Ys(oA-3l$JLbk|c~;m4CpcVZYP^Pu zL68>fH`*~)ZXM)?>sdSIPO9lrR{l0N?DDA$KNt;8lI=8Np6NQJ(h zfiz0)EpwFn1MH!eGs-K9HdGB!!LB$+WnFs&lxCgbH4Wo zBKo|^FvjKwHEUx4!XTn3xMl7x-Iv1>dS^ 
z5IO}z9C^ZcJNiTl?WlTE<3fqHn0l=Xf=46n$?=fUIp`*+f7m)7NbM|0 zMK63c`7GPfMDX0%5gvtzrCbn5^wkma$P>%W9TDG`4cG^USjvmVsLAI*Br_gqU3FSd zKs&?3#kf{c^!~jQC~HJkQ6t{eQ-UaI5P1U~-n3rjd2&8mPQ+riAT;0@xfhe=clde1 zvhLt}D+y+;?N_mVCTV3>N7L!&6peIcxHHP8h|L1~ZF0AGTZ`PHV%6#UoE)?V3h}v( zp7CWoRkLlmZq!c_a|IK$msoznlcqn%XrljS+&hfraD8S#a^{kl-j?(7@mB$m`!`J> z`EiXAb1x38#>*T#ee-dy{wVGEQYz((pz+8jp_I^{5p{Sy*YL|U#kim%MGy74vcN{a zNl=BN?)EPWo@ldP9l0ZR%HB8RjtxI^^*8S9WO5m2Ve(mad-Hhe4l9FNY)b1@3Om1Z zjyj&D;MOCff7iexaYA3XUwKBZ9VM>UiRg30caF2=QHk>}|5Gx8stw0xlIc+oR=-OF zOZlhB*9^l&eHL8<2@E=uw07I0uBi+9a#0{8*>3RmpY0kj`4!djjfKe%YyOyIW>##~ z?JJ@Jm=WdiinM+W`kD*8J)dn(GjtUsmP^>dI}(#0^7CN6&3u;W^vP#a zT}==N-)$k2D@wi@D^nQ=%`qHs+2|pFkbPNw%lEEO^piJ&tmY=YY&<=mL{Gkl4v&@c z_s=*>-`o1Ds5+}WD}y=-o{3N58M>Zj@92w7Y*t2{7G3N}IUR!{djE)Uj+~nHuK9j+ z>Cz8$T}cu|=S@MNS4(`I0NM{1`!{kFjD7qjh=c$HRJP)&(w&VYWM+zOLEwGUNyKNk zp=jx|^~^uAELjgl9BjwUb`%`Q{2d;i-Yk@SCSMUPiseySnmSVQm2pF0I$XImB;C#H z(Y8_8f;=8fFLyDpH@>dN=m!f1Kgdv=x;h_bhrV*T`ebBGp+=uYPwJRH({|+Wb_ddB^LV29ob}YYE6$i4$mwo_cgv6;i`B6VS3VPRJMBqH?DAK9tg2=Bk|>Nsu&L} z0R@k1k*w%DOid`qYJ|^8(5LL6Vbzh(_a&~I+cBAe@sps02OEhfEAVMv^r`vg`!qBG zpE~<<>TJY=;{vX}H{?|6BO@tMPu_JLOEL|QfT%T4=ad$>noP(fJy=Yive2)jnfAus zI)8GfBvMeV8f^-GPJDg3X%iz)Dms5 zFWGaE+u4l-YtEgbTZP|KGR=A~s-llX1IsTjIM{%di!W zyjQ-ZPY^lRZx0ZG^HJXfiCr`5TxPd-(ujxWDc0Ovt#bu~<*J~~2752D9=asGH#aK_ zP9r0TToEsQWNxk$g0(=>hs)-eQ<|mU6PO)KPP=84b~G+HIxT+8sVmR}MT|}&YRR`b zvGI2Q$aKQ)n^l(O{*GMf%Af-k&-MyKJZ=LY$ar?(+9v0+MB?d6V&7*(=X;rdRMotQ zq}r)*r}<0<4b${jZa?nBRtqV;e@-gIC* z^y+Vt-Z(s_z)qa8P-*jis$sPm)kT&p?$bMfs&@NT>> zBS27_YH`e)pc1i0ZoTt*DR;{1DI%s7l}bn4*5mjMYA$cwuqQ`)+;Y}LiC``^SE$*n z6y(s+Lo;tYQXKrO^Ks|dTDDSNJ1J|Y1y67ygK*5B1K15wn3vMCQtk*(#yoPaXetQN z4*@nqd1`C>Nkm7R4AFs}r;$cw zS%xBus)2Bo$#Lqmka-U$A~hF+YkZ502T}-bLa@NU4J(&M9COq#E=L~`o}|8dI71GR zOaWBXbPnwhc@w4d7J34{vu|?{gGBowR>~0w@HgJY3_(Vkov7M1)bHRpoQYzHEaci3 zGNHkd(?io~@fRs0RxY3#2Av`_?V^%$gpvGXNWp6lKMMdP8vzZFF5~PFh 
z97GW@9$5Ik!#yq?QF|f!*m{NFOBO+jcqYr^4RBp++|o;0^|x-=Hc5b5}a4TLs0#RFCE9XW=H`{&UQ9I<6In@|;;LiZC`p^lZ5SaSkPq-dnb(XA3Uh zT^o=HFM!ju6sXw`Hw61?mU@HikhO6h`SI(%ZahOg&In}r*C7EV6F^0%7(%^#IE*?G zhf%{)_=8bbiM-3o@?`t%XX)wbUhc+gfE5^^?Neqs`cDOc4d|`yz;2elC-n-faQy#S z5=)PX<$pZZwjU*e3x9J;kTyx`cmQApq3MjII}X$qV6Z^0Hmd7~ALlIO(ye}$;%#*T zf<8k0YN~~9eg|l2c6T`|@@DV-2_D=;@>)UxZBtP3$gN#Pa+_zyTjKC7;B_N_JZ>-W zx{q_p`Dq}Ib1jDnD#<=?bH>6N*3kbv4Fs@1h#UM3*du0;KYIcTjKQBd_D^wu8R@U( zHtso!+v3}0NfT{sVfWxovAM1##eUH~b*YmUkn{_#dr&x6GT8O%y6PR8gtPp&mBk0>ZpTns)|Z_^!_J-Jh{_8{!7y42 zOYlqkBF;XrWn;05AXN?Sq?X)6CtVsL06nGZM!w zpg!XEKQ%%}08jXt@%s0kKl`sq3>^aZCC_+PZ7)<*AThr?*Bv(-e!x_U%cTjQfJiwc zrl0fCz!uC?4q=fjj|Fuvf^B7|A0Zr7OeGLhY z2M!-sBvbSN2SvsABQ`VG7uhpGyYauT^nug5wBL6;)9xN{>i;p-y&-BqSgjNS4}lKk zZ~cHw{MX?jGeHhesTdr+gSooZ2LBAe>N$0lz_-)2W_Al+<5UB3bkuKCv390`!8_|A z=rA84*&Xy?tsB;DLhOGy*FShnJK1u85+3~$V+`!-PXM@EES?9ZA&>-w<|z?2_+5fK zNoFZaN1xM|0`;KCR@y*?Jsn7L-!SZ6v~%K0anlJoz%KL4;2sEg*{R5hi7Z-3Ve|#E ztRzW(=Wxqu!vYXNHc5`Z#}TSvt~|&RyACEL9Gn53RK#5tdW=PUN+t}!k3h(O@vCrgUV-87XMhtT`IX8Ho0< zZH_I{>L7RnJd4iM8;=m3+pTkfhoiyTr)cw%!{6QxQEOY-g5izh6+CvF7ZJhrUCsY* z$6C%Ic`dMz3={Zy2*Mv$lLjUTi4y}2woFtg(1odSu3M(yGO$DpYd%@C2ImFKJ8(BX zfzBJ}9BTQv(jq2dzm3!(sED}KDu5I6#RMC78XMn<qQGE3l zuQfA3fUlV$OI6~G4fXF!%qpUqZk3*M5l@+JQi=)bXwc?P9o< zR*<5mJupe)?l3X$4uWOHkh$c7Vf{X0G=(bzkmnH^Dz!Y@S1{C-uI$3S3Sz`{kHml{ z=3`f{z|kR%hbw7?s~=$?ru-p+gFym0S#+{k2UrG*pp#auC=Gs{-~yIQE-x{*K)N z*7PSEV(CBK8x#+uB#4Sb*zJTB-l4MQXn`hAVIxz7ev?#ggi3iN5n5M7MGtkN5f;AF z>deG^#l9<&CtD2qMA4$hdJ}g)Kuiv774Q?ncKH}D&6^PjOJA#4r;4B$R?7#gu{sCs zW+ZU;?}kc7hb)o3cxW+1E{~?>6Pbm$avKP&PeD>RMH$s#8~=MsuZzjIbm1Kz@hn5~ zGr^o86tr_m7uXAdWL%^#vgiMP{m%pIx5NT*Dg2RGAS^$mvLUTIlzFbP%ghd=pPoJJ zNWSK(SA{fhMZ>O67T|ro*EH(}ArDDo-AIx_;{n z*2VAUn{Sa=Wad0v@4BEGq*xv`vV>@Nwu6TV?DtX-pA2qyB4yN!IjRfC?tJJ4?X_UU zFg%1h3HM)+_iBq+ljtQ~%McHr{Q#}}8n4Zc@n3sB@>5nllBVV8vcSepmee~)aDeE= zv^OCe^|>!91uoW8yHLN;k68GBd;kAE^S)5Zg%6X$$q4TVL}tGazJZMIClJ#eIo^Jc zmeU+^JEX9Yu>Bl5fqqvC0TrJIW6fSLjz9Cb&|1jh2;hyE_q%bmbd`8xv>_p{9;|P` 
zdkhtHil?Ntgz*r!9cJWcUOvy7!`(bb?l$o3L>HN(?kgtjL$>duB)LF4xY)r<14_8_ zo#TrdR<%@ODo;+FAGYgkZxj@aB_W z-bxWx2Jo-n)YbNtCSBK1j7ruU0~&J0mbcD=`I57w5y#|&si){7?DG#urVzH^iy&gK zonQPIY2%?y*I7UMb1bcX3XnfGEkrke$2PXCjd4uuggC%RgoH)meq00h#c8gzSJaxk z)zzQwNO@RICN#Td6LfM*H~@6Pjfgj$!EI-$r0!^;pnA7*otj@?cG>dv`rWD{Q$Q#V z23EA`C;1DSt*(ZN}a|;nCSEDED2@v z5A6;Y1m7gNEAK!8p2O)PYDytUahqd?oRw1|VP&`309|}A#oo>;7;x$pxqq3Cr`<#Bc$ngujOag!aAp!4!UFObaOVF+9#{EOrCBaP=v)OP zqd^d#FI?xb{13yyG8}^;*$z2LAydi@xq;@8;0SZa^IAgX{u^>VcZ3{qL3ZRum_;3j zFf0pK!8$*Gt+atV9iRMJhOS5a=N-HH6U1Tt*6;A49D3F(Jx~L@8s z-kbn8Taa=C_{i^Hh}_&!rSoAbfm4v|#GMO*hm^j1)d~-$kEaaTaB7wH4uN9)VOxT9 zK-lL(4>X&mCBqoY%K#ejg00{uWC>y(rI8ZaS154MV-M^9h8_?9QWgZp-9ofCV2xOE z!=rT{x#nzvlI=&AXF}#);dZnTga2%*4l3SjJB4$VrHDbFRMxGDwI~uI$ad_G2riyq zx&fG>?f>KJzvHoP|Nn9P~>UQgT$@zRfkLPhb9{0z6kcjfR>VpoDvf6rC+fUUW#2*9%!~L zk2WjAAS>m%U#h^{IgryVwc~m|Fda5izj^CoYgQFn5JwFlsbd+CgjFVepD2(Rb_7y= zp9ES92wr5Kyl(ZS)Bb+Lin69G>4w?0dsZ)X32)Flmh)A-eTH|PUbsHz*a4Kv(aLXL z?!jdHF-EE+QowNi<}oOxtf5voGDB2Odvpi1YiA?7ygf`IX5_Y4EFYyZXhIuL;f2Ub z9acVY=WY_9gW)u@x;-A}%{`tXkNGz|IBj6|%2Y?kd4@=hQ{vpcYqIHuH8d!%MDUn zV8c89bDtuSJ3eUJ9-MRGr9I@0F`8!Iq%u3SsQq20;h>w2q@M<;`rz9>y{LY@C|Aht z#n1b~OqyAGK7mo+ip;X=I7-sqKGA+V@}y9Shs%HulZoH+2{1(*9=sfFFJ?d>v2t`r zhx!=c^1>vFI#3uoek*w$P-*lpMd|B@-(!6v-y{VG++pA$EC3ICmxs8BcOOd5y!Ms$ z9-}E@Qt=zW6_LFglNYwlZlDt&E7IeScZTb0zz498gBC-ymff@Q`y?a2@Yzg21GrEW zEI@^NY8>C86AP*k_4*qsti=ZYd;IavXIy2G&*|Q{oc#s(0fps}vR+2FUkkK<0$Y7qp@UKDbvm8nNIO)VaAPXG@(r(9sOT_&pL}E`D7%CtlSm7e>E_Ar`9zk!a~R6Gd8x_ETo_ z-$8tN0Oe3?w`VWlHu_w(FZ8PT1f>YfL~3k8dXa0&4O9|e+%;TBGk+%Za319S_i!Y^ zSOv8HySy-PPR)9Rj4FU_4gLYXU?q=`p1(%-!#mjy97DH`sFv{FA&{ez(q|%|2&Stv zI&_SlC|c-ZO4XZ|oXEBarLT1B*{zW7e(q&?rIO%FE-4v4tMK@QNBCqc^(@{#j3Phn zG)p@tR!UY1UAG!D0ya9-AO&ewyr|ZO{^*=G(H9jBJ1=6igxaWw7S*Vn2A(Ib}BT)sUq|9Jy3IV+MU%thBX%f`VA#|xy z)dokx(aOyp*1gs8dVh%i!9~O}yo10fqt)QgBdRGD@aQJJpYqP$6kmz@OnvA*9el+U(Oc!$+Or z>A2z^IM`4)K8JyJk{g)*^+85p-{=di+3EnssW?1do53}N#~{mNTsLSJ_OQ-TT3!Q>$ZyQChVIVS81G8-QSuX8byy 
z*di2`11;E=O_-cN{V2H2Nv!FgOow}HB^aOolys~UaWMbjRmvO^D3T%1f3tRhQA6u_ zT?Q`rB&^{7zBO`@gebrQ*={xHF{lD#a*_yhmh#rS+usmN7`e7Me7(eJ_ZET;xPt*D zX!&V^Vyu;uQ{45A3Zv1+!>7$CPy$JWP%e&gK5i!q!9C+!vd{$F%FvvlhNzRdl*J~A zLC+DfUtOLLbI-#SIR3R#;=s!5=+>Y^=|=ylnjv}&jhK;2=R39GQzfh4PnD|tt5yuo ztvvgHYjcH=D`1e7uX8&~sq0GqYrq@X#Yty1672XdVnajF!UdD8_U3cpP+}cQc5rC<7*~hIIK*D%> z*YV20KeHduR7O%LDr2WO6koDSH2mcBdW zc=(wxMD6|ZCt}`wFaiH!Vd&a;?nnLsN)Ew3Va?;p#ehuS-1D=MXJnTE|JT_c5<3M& z;4*~@f4%Qhpba{8=tnsbvn|18(HDDO04G$_ZE5aWdLZ`dWXbEPE6CNFOvF^m3ifD~ zG7*-P82QfmiAe-jc~i?9FjZ%ov>%tCcG_C*)PTlelXV6FawRuoPo&MILKumsH3Aes z;r`n|g?|kw6C#rYz9Afs{cwAL7=lQ2L$QrxOw_xVK6gxZST81JK&z z^w9fSWIuDq0mhw$#lLz!^Cp9$(@#sIc1I9sCw?LAutOzFFI#j=@)L)8OB-CUg}1e^ zE|l}f(^@V7g8#_T1KXtd8(CSbV`tKYFsGaj<*xf%xK!{d>0g;S-(;A6^Cog2_1t~! zaS#BU5(P$&M!+7kQ;w}Q{Qa9znMqUV`Ym~t`g9AqMCqbMmlv%AJLrJUb z_8YfO(YJS|PC(h)S7jR|I7wq)gDb(z@mf7G8+$reX0LB ze9hd|hZf7u3=H6fFN$avxCiJJJe}4qJWNMVrWL3jH{>~DpW~F9cv!;KeYv_9$5>!u zM|KCdN!$_++MLqb=}A-n(1;o_;k9nK=6Sx z*hchq8^+B*8k*LNsj(MuuY`XC)YBM@=}cGI&(GAIeKg3(2MXfl112?3VXTedNjNBS zhTfU6M80uL2$A!n{l>Pbf4>OX!wJC$xABf2U_k;yy{i)-MevOL|J;|o4R7zy zdUSNXH)Qu(XrDLy-+RuzuNFw!RWFEn?>%xYJzAe7(oi^a^cD<)wtgiR&E!S7==!2p z^XOj)jlxyH|4t}kXoG8!Cx)g8v8;p3n^}Nz4#5ZfLHQqz*smXci=fC1gvNNo2bVYN zcRS}nC52qRZooNNgZQz_*dYTJy6gs<(zP!ZBD1f+z;WQ>%a<9ZPXD3`QcClGK5=_E z*<1^O4AuwEPh0rs$5INHRKbknlO^EB)Q#kF0PK?q$hZn!8K1C>t5 zZKwegVL#xw`1g4Y`^=-Ozr%q+>I-l=;=G?jS8Y_Tc9pQMH_v>r@qqn@A?>?9!{w|Y z#Yf#dz5`}I%o)+H+Oq@lc9Y#w?A*9VXgW2!*G`TD`Uh}H7Z{}pEg0A^9PS*Wv-IB< zb~d{d#k^wCxtBdr)`(x%PP682OTj1rtxqsW<0%AMqeY}n3Qw2_(t!Xvo_fnd=WYjZX%WiCqZ2S#eR2o(@b2%n zz{?iA=kXm9I$k*Yew(iY&p%whwdXU4xk8;l!XT`$o#=mJH*F+~N05{j-LJxIN$c=~ zV-`~#X*Q|1(iS8mGb@mpZ(K}tG3T5XspYc_cNrmQrEl8R*8t+{fh!wPwHn5~>!?a9 zVI8@-t94>**{e2+k27vR)bG}Dm%r#vnK)i9Oujj<(1HvKnjc+~ZV^)}nyv&K_TH@j9h%{^ql^EIic}XQ=W?a+X=oL{y zPoV;w3PxQrxeW?+37uAtWI#GQB_CedP+pQ$SucqA1%js4^R!XbX1SL%X;rHmTmh{- z8dGpLc>mBlapfA?L10>M9*9urMudsTiM(x-B&=A(RLwNLq{w?p0UCuW!qLm3)WdQe 
zwO5Y2>CHO3>1@@GK(>Ij8CuktZ!FM15FWw{2%F?)tg|y4(649ObOMZ}73Jbs~ zC_JKE0mtL>$Jp`QA}{o%CE{|tR8Ht*SxNG*AY>L_N$E~cPKq;^x5}c3Da*dzllzU} z!4!?Z04$9}BTW;;>Rgg+qNNSPh2uVk-+qf+&>xR*(KXQ(8q`bG^_jn9D7~T*9Ppjif{(?(&nJ__r#ck(grC>_wjAq5-qN z%cldJwr39t))R_ox023inz=R%TlYkDCUBmns|bjNl7&bQlLiD)(0r{ z6eW<_dY^23lh_slgO6{DTm|2x7RMQF=tz=v+?$wwSCEly$R#k zo{+~*jlh+>I{3)*2QMPWlmt+U*~7^qOKJ}UzV7QLX5}89zEaK&@#+q|gk@uKNHUqMv)kxlqG z9|6OvCGBE%bK!U<+ftdtA{bPzolNl}XCAQpD5FtBLzP{-fAE0$D=19TNZXWqM!-iV z-)MPWoOZS-qKd~woPdEx=VBVS>RkX9TD#f@tP9*j;7&0Kr8_=`NzVI~NQi zU=Mu(lFSYX;_xS2?%81Flk;l$=?XGkA_D(=bQWD=S%d!F!R)y(Sm-!xul;bK=Y>;2 z|8qV+wb5QXyTl&kM>&Jys4W)LVaB55n+47fsw{L~i$BC6c7DDX`o5zq$;diQI){k~pqd_KUH+{?5itq` z0J}jpC?3|-ifMRCP#hIGP@pn6c|0xLOqC+Q#wH!;OSFX52*_Rqi~xD9<;+G&@n34dKJizwwZRk`$@5N{zO5{ z4;Dng!of!!t&&*^#UJiAGHN>hxN=cp%RZ5_8HIgk~*h<==-UM+EK#%#H9wR&}m;i{AEqZ=JwQ!)=xd<%0Rv#L>tGE#4 z;P`$$4G=eBDAcsZNYPx0OlYBJdczR?ye<$pSK(<-nY;7y!e90X-~iR-hKHZCQ+$t7 z2lrh-vC2mYz2GTv=+k4SK^FAY1zc#v&O(r)i5FH@7Z8qfeJ7%g|3jRa^nK+G4qdlp zc?3;>@+-jqAp&n3ApxuXb8bIA1Oyfg(l%9zN1Z(aTN~i2DDnkee^N>JuMQx+U6tmC zd0=+!0@54z*9Noob4sjT()_i%E61nYqT-#Y0fJK2yWA~RG(d*;G68X};JH{QT`jn^TQA(ARc!uuT)|tr&TW@>Tc$nr^0sN z>|?SJkgj!V7e2EfMUdmJQ#9(`vc5(&~DrU#1)07 zfru##?Xf@__?;ij4eo~f!C5B(OuZ|5dOpQi9SiUR5{{xlmOpADc;JJ3DP#ACWY45= zDnty4Lj;Sho;-GF4BQjBPY3)c8J_>R`rY?tD@f=rD*4KjU0)TzL046a$VQmq`~yo3 z=+aCueK*XOD!oSxj@=|yGWH$kl=Y`?@3`;1dXR_p92WrVza87bOnH#XW~)3{5Q#n4iFvWs)r^%V3$lUUoG`TiQ1vh%ByHP>+^Yq zM&hT4cU$Gbz(9xE4HWCyLMg?6E`K69%0+fyXM1I%bhD?0rC`{1N(pRAA*<4a0#R}P z;0*N#L$rX)xzZZ9-Nxv;)N&tOrQL|0io34YHk`qMRDTdf3ii)bV<>kc7rkeH`YwRo z?cfbh`=%PWYzQcae$NEx8N(i5Dm6D}6y$CVXWzw*4QsWd43W~F5F8x!3BLYyB%&Dt ztS%gL>QQzWuvNR9QBXBP&9^t^&~)_tnfrbDeZ7Tn_yL~~cpdrHrCYyjik+52c8rD* zJf8}V1ChRykiKlr$y%UH{{TEh;Vmn~?T3FLIpDZL)^$I!*J}~}HPzXK)lUdT;Ew&E zs{{9iFEFp7=R2O#kYU`3lEeSddI6cknVtV%{aE+^gZ^r(;?8%(@}_2nC<%y?#>wa8o4?ZsIu-q!3qnuYwTA6PO`y65lYZ>B!bt=FBbN z!zJD*$-zo}iCYl6;o{t7U*XnayNuvLIKA)Z-7^1mw0k~o({PXNTJYisyUJihXxi<} 
zPv$*2bsr9R5|UbH8#evw)gWgUzkBd!2S6v>#0c~LE4HD2cN@P|fp#<#6Quw6-@pnC zdq<%LoLU+`z&NnJJhiot;>W`iaz41X7uRY34mL?>NCy|6iK|Ov!E#I!G5iQ`e@APf z0ULPM?%D=7I?*Zclx)o${YGaP{SA=vktTf;b(~4W5UvMr!K6?JA6*&&YSivZL1b?9 zlH*?VtVbg}FV6eEJVz`UIg=jA8w+|M1X*^gS85?AE0|bW0~>n^sSOFxn$}KXsD6|7 zh-`#YM9UasPRa~fQ?Wy;b9eFmwO;Y&dXL2W{~zZ!LWSVKf>a8w#g2}$NT|C$Q!qLL z5Wkw1s^|0c1bT~+58GcO&p-_Vb`;UMt(G-%u9q;sWl@imJ@FbKh!8ut6?IU)1fgMt zc2j+Q1vdSFiV-gZPN@%m8Hh&N(TQG-8(4O%)(Z$i3;;j6GY{s6v_JX_d@U~?{9rLC zEPiYd>B3X}c3+}rCtJ~yvwCjJ=H`qjiv$QmlbEZzL6T&+sV*DFP z1LW|P!Y-0FG_QNT{a{($RofDs!=E~&WMbamZo9pl1mPt(cun6=zFK){I=e|Ep$^3Z zl`6JJ@aej^MQy}-K zK#}S`y+b1&f)@7-^csLR3+?c1flL4~+CZ30(I8>KMvx)L@Z&nDyCV-bqM-vWN(1kP z$auVGE;&!4@-SZ8-wzon+uH|$(u3&q8%2gX% z6#cV8Nobi^@3oVE2F^tf3L$=UrEtYSiQPK_{;pyIcW$|*ew@5n@N?&cv(M!wmUyjF zx9%(EJNmErC5oL_?r&sw6M-n0TF{S=+?A78;f5*Wn40+FcKiEsme5NM3L>t|>=j$fN)a8r92t)5k*5`UGATj>oERW%dnMp3hvWHP{;TVg z=6VyRKX|dyterZs&cIl?VfOxO&@WDVk3;H-mD2mZ@Bg+a+DnM7gaciArZMQU=0dV= z)w3h_!7Jka_evkPsHq;tLl2VA+F=zThSoFBQ{+1G*3b7S)D&bSA-g2Ev%N(e_*$pc z-3{nAPH|ixBYs{v?)VR*UtrOlhPHpVOQ4N#i)L!Q%0KW!)Atf!khdRyDn8trjFaEK zKlJ1Yp-eCHB-6wj^EmbP8iO5OvLo3WhwO#S-XGE*D)aaf5;tTq_ zxiLf}L7O%KXn23Tl&AG*QFl}XIm+#w1p~%K*EJx7Z|Ak%yb;(VSk~Eq2l=h^E?xPgk9kVKpIwygJFVsg;T1iTYBr zd`)AX$IcI!&)V>QoPYE%fPbqG=IE|^3oy#*{d)TZZ(BR<;E4(ETqy>^^Zwlhr;jj0 zX2Uqms?Rc8Xm~_cI})K(lji)xSM+ZX*`!H>OOTUN)HBkr(oxkOrKd6ii#lXm@`9Y7 zY>sR-$_^%?r)0F&-okC*=EsyNEa;NKW2w}VFiM`VJDOqbGMuU#ccbrXVyYvDaZAib2VTOR% zExqHwyGn`u@iyhlBrP$o;xoN9?#F+t*(8hZ9${ zp1n-eB^J|>{HHL4X7+=OFLeQD>;vcms_AB9=3ERp9A7-VM6_}j{Mv;hDq7qsAAO=) zc*NwVrL%r%Onm7BU;H&(FzwaqWS$$~m5P+&*P0?fSGix9JvXVQAf^aNC_lA&f%R)= z7~zOrGw^7HVDg@YP~%YoQ9j)Z;>@O43I38#=yt=?9FmizTufC}zdA{1F7KxmljrHo zo1EU+!_qYZM8l;6(Wmb{5(Nhn8=>V~VQbg;@~C+>UCxe-fQg7Ta%1^&*?KoU-fIk{ zf`aYyY&b_)-M2ux31>bhj5fUY!H<}`V}tt=l!pin4jmGBrIYoAloopxuwl{$42uE& zEATAPCt=mW_&p`PuMHCRM_Q*1?Fc(w=*^`L!HCePygQ(;hu+-<96a(7+XY~N9+8XHWf zWq`mea@*bt@11j8MMS|>NCf_N`h{Y^!x#zzAgvW5A%axC9ImN zcp6&d<h_A~#u!PlA4n>!i>n;}4ez$_CkynD 
zKd3DRvIo1=6Am(weG9Kl)lh%>t?j7q%coPI9LPYuR0v;h&q0Gf|Bon(2mp- zYT=m+s1K_cdbrJq)T?-u(N7@e)NTg$Ua9b3nTSdiBSeLPW(Y;IqLRC`K1eD{2JlgC z93ST06#6lGjOu^o;Qt&#b_ZbU+Df5#EKAcY?oovF%oyMs#oUrH{T)pIM_`HcnSz=? ztrow&1nBT{ICoPL8RG8zk@qW&W261nSjgtmWl`_y0y<{Ol}?DG<@_j^uaw4(1=#6>5>thmD|A zg!}6}l7T|%hHuB-MOz?9Db~)31f>XNud# z1z}k@qQJHQ)jeit>auQkmi~4@4&uy9KO%#%@U3X!bRE1?TF)Qh zl06i;^Lu4;`|k$6?#h4ESNT@%mbw+IXB+qv)f=4yyTLQ}KJBadC8y)~(3BIxpc=^=G+H1{1w@*#`K{vwA5nO_7*OP$_HfaR+1g~W>CF}l;* zlD-bP+celJ;0+6y6g7gwOr5@^L6dxXxFg`&5LE+C2iY$R@Jk&GylTX%vrnr1ypmH5 z+Vu%2&DB|G+5?SjU&0;Ugw9|O9RPO$%Z!si^5@0e`&k1`@YX?BHRlD5*BFmW#%3rT zYhgec2s5TX{tVa|e)s**`%4AR64rGYPH%koT}!Vh?^4tVkn5%Yc=!exB~N!l(zhMpkRU%-l-b4o>K<^V#< zr&u&2>pD1>)dUe)qKp}F#c^qjbS01MfqR9=d^m8Jl^)RQ#v4}R+D|rMG?MxfF0R1B zxK~%OeZL7D29xIu0TkGMTjU1N<@XtlRP9*}3#pX4BS==WCkL%b6-c#K`+I~ z1F$LBO3amS`sT$v8z8#?b-peZbNiV!PjPF`Qwht)Q zDe6Ehxbr$w1=~Rc)iw{97 zx>vR_?jC>ee;DOz&pru0+D_j)I_x0tzCbAmf{LLBg!CzI*p8n8Tx)roidFX{^>2d} zhD;W^lqGabBE9E_Y;lOR>4!M7mR~mcd$!{+%GlL~hmo?c%=C6@KmixvME=J7M}e2< zL3S1VoFZwZL5SyYb2inM-(4n26pX&p?jbsw<63W+|MQYYF9$5HHqrGoG@%2I>iQZh z@hEab@K}J*0M^5}l-|>FFiw58%R*jHX=D8#f&lbwn$O$@rkQjFjH1whQZSYXcumNX z6L!HWU5dH{l!{=T&z~0k_5VM8%one&jL`nw?R?8uT6L(|C<1O9O%I3q|69#TEhGkt zPr9wQ#ubvgj$gdr@lK7u;$*r#yF3S-G}*# z5TX2KHwbtKtdsQ*g%GNO-_skUOu%j9=K(lR56?u1K9cx;_>bA&QppN}YGq*S?r{it z%A?>DOoOrRN1kNk~ZuFlLED`v$X=_&bN4f``1ykqfj5dRv?hvE>F{RsA)Fj>hE_ya}I%E31K zcWyHBfBvigLViS$jC&njRjaO?-Gj3&4!!1QR7-PJ!jJw1ApcdI0PR&x;w3n-n)(h$ z8)nS_X1j3FP8HQ)(ie%VC3#LwUS63`i)j)pGbWMekU_bkgTgVj#k6u!yN(^<%gEpp z6`qNHEOeOlC%i7@#eg`@iVm(R&Dwtfmj~o_j6 z#m;YMO3ieut^zr@ciZfj#|^W10!GaSYGN=$w5%D4Bl4pW12e7eh#TM9cz3feNxZpU zM0k||qKC3VNg~cKO}2lQ@XRvWig&x0s3x)+0Fh3W63#QL!+{P6{7=c4td~t87knU9 zL5P)hcOc2f<@nNqd^C6QeCc|^z4^f8O`I4QlX*2xT)AlklBi#PMsLZo^8ry25P58Jl9Dw)asHg6&f2Nq-K zg>QRb7^FanqOXiS3u=Y~C1$&3mvFhRv4yT1_NAv&VnWKrzh-bmw2DZWL zb-uD9v?;?cypc~`8DYzvwsx=>qWRa9e*1Ed)Q~aTpu#|527;HGC0^Av{$fi>@LCHV z)hH{z(7iO%s|rgXHEgqXseo^B>Bi>d+f_w}$68`nZ(flBVsyKZi$C{ju=7p&WwIaz 
zL$5^^LMr}uijX=ASZ)mRyjrDJ^mgnDkp}#tyaOZ4v%8A2+Rt` zjJ+62IC0m2MQl2u?onKFdWGghD z`~vnDJ2{hqkm>q}l+eph%u*amGNT$XW}6J7?xB)EkF#fM#tn5j-XH?y+pe;EwXTN* zVsSyM7#Bo0wF2<)eYxvZvA_bY+Ydb0&WsIkGRpMEn%#!$mi!0c#MT=N*Q3a)`(^?@ zL=q*)k#Hrsn6?H#cPM(O^1l3Nz%MpJss!#8w}!1pq}5D(uF;^JI{F7Oj8_15sQV|} zx1T&`(;MMViBPJcfs~S(KAW6-tB-UfUp`cQ`^H-m`@JdHdQ3vW z!jYLy>sw>TMK)fL=K^}ve6b=Bl$Nv3JwXxY!k&^YvVB(8b=U|V{UB7DV0KPtER0_G z`cF1fl*$JZM@N4z2g5?SWv{T~^Bdg>G6@lhTpjO;tlH_v{Ap|4DVRp%DH334F7 zy3=A3?|CvrWcf#4L+Qr=dcoz-4m{EYiq$Mo#uh@?!41ZsI9C#}>@ zNDB44cj)Uum)y(P9o=Sse_EoO#iX>S{>YLai{;rmDTC>Zg9p)Cp}HCfL1 zBQe(R>$x*_dgq7@=OmG`DId{r6{~Nf6@G?R=|Dw2q?zUO=~h!I5jx&G${1uvFi;#CU+HQS?f0kyIqW-diqu++%!*ldd@p?1KwgsLK|i#A(bWhW4vv4 zW0o_gs^5M&Q-UPKjHJLEb>Z)Q^{9sN{k8Eoi2>HgmB!pP)+$3*1gr+@_=un9|5)`= zdmQ(SbS%{9FyaCeqwKE4w@-J%QEq|3Jx~i@&qM;er@Sxo!ZNQlT%dwgwCXNoX8S>_ zIFHxVE&V&}AfOxMz#k2(uru_M)4ink9SxC1 z2Y!;lq_IXwBU(ABVpy}LK$eMre~sjak_uhP9bn!&9=hUH`%Eh#k%3Y4Fnf0KKaI>} zg!hgnkz+_%_8tEkfY((!g-R6E7^A@%E|MMeIlx)qmsmrdBb!qLwd$p`m>m2bBeG~h zcfQjGpFXW59urBRFh9h?SsnTHrie*v-}J{|e^pJ#suX6TqxwBO{%sdICP)m<*8KB~ z@=@9`aGkXZ3i5Gtnr&rX@ukN<0#IAUd->=x7rOqy&i-3GN2oD z8{Z?EiG^~Ad-&Y!%Ey%lt9oazpvd%V96*yp1u2 zR^f#U46vKwgxGYtfTc)Lj=40L1LITZ9#c&p^!hi+$4a=}^=8$_?&WyFC_5*PXkkB= znuM>0^KwBdmZ5=&j)ofK453hG-6R0y?q%h9Kaudm^2r0OjW$DUpNrmTR%GmX>IKd& z!4Yoadb2V^(;~P8@8QK_t>SIhv~<9k>}Y*o84ASZx;PA{-rfPv$?C#&)rt9hP)4fC z=vc}dngSyW$4rQ|8S&A6H&(JZH#z| zeEcEC)T{+0&1W*B3pUnO#n25bK`MO*_X9nQv590`k z8+qfQTBW*o9s+h%$*UhvD+Di>mOE_=;NmL=pkiYZWnd3ITA0n{7L9BrmBSEm(00Px>V~;fuqIIEfW3y8F zCp084llO;w08VBNYZ4VR9dYm?f-EoRNch$1Adum8a{q+ti@~<_f4#Me2NIw-lxNe* zagpl;bTi5*sm0M0P4J0p_lizAa~4ao{?!48;wg>UXq(2Lp{$Y%yly{<1tAY-I+zHjpP@ z!a9x-DSG((-wL_6QzNmR`JsG8*K=lIjbxrG?e^$eqeDsS4ra(lRt>lmUNIKVy}cj% z&!0Y*!4&BR(xa?Qt6|xdGz-_y*zGH)3MykD&?M&81HLkXu0e>#2@s?_LhgNU$OVMx zLQnFH$lXh3@_JpuPE!nySJaD?-D;SeyW{?Oi)c6Oc5`jf)$t6r4q2GFYQ&1W77~3M z@i@R6>`J1*#Y;{1-U!Pnt$MmZn9*ObY4Oephy!k!0`m5X9YIi^5vMz#3C8{iO}IfK zr^*spOH56lsY_tD4*cc093kpqh(?kEtOs%OmYRQ~3R!606Q7adHvKLw?a~2b?RVw* 
z&uyxsW%9zuvim;p4Tb$YY0SN_75OhZaiuhYIP>~?OzCNA)0Iqt@<)q{T+qoq!x5{h z&z}(rihdoGrE@>rA?h};Ev(y1>D}~tm|e(l&{h1YYbAWwz{efdlv;ei++$s2sa*bu zp#5NTl?AETE)DF20)zDMXa|ASG82n!(`Vhjp? zbIqN_GNaq$V!)f8(pBG5OZUrypWaqFz=*6ECw_jIH+@9)?l(q*k;D^0egp5?_RR+~ zvEHBh;4my&+w`Gy%iLlGMo8eEnzJaeG_s*aC+4(dM6Pv-p{3M_z>naD4nzKBrxy3Z z%9TY;K|KZ`j*5>@TLV|D6bo0$j{0={aBg8{$Wcx5@S)UyLMnMZu`wzOyM)aAm}ay4 zSzErVbtj7UN`+o6*?~!CTob2$L$JB&mmPVw6^a?`1RJTenZ%uLrK7b~O(S)oVmTcC z=PRD&=)b3CaF;e4n%4h(_^U!J7jX$?3Y%lORQ0XCky4Ey65EY&If))+O&;5LvIPyJCy*d$NSeGW*M;J$_xVND-yp6d{6c*@_9wuM; zj`oYYxWulPsuX!O8QS3Js~M&fD%egKeE?4G_q-p>cjlk-y?_)G#(NF*w?p}~T0Hr9 zhu_e;11q8$22B?gF|nt&KcuClz8tau+kda?rY_Y-$(p16Ulgo3o;0u$u(??sp6ff$ z@X&ct6$ThS9$<15Y;iCPhsU?l>H$u)0EJ zwDs%DS!`z23btQj=@t3f_+89Djzc?`+ecMKi26}@Sq2{VmY{Z-{7avGKe`>=8_Zxz zbb!?R5Ru5DCY+WoWqhH6@#obYvY!jYv6Y%xd^Ws<#dKZ%zUKw-(CYAE3NZ+O-no<5 z$k4Ubp5fO1ka;b>k3auPS zPMizV{BkAb)zvpzU&BeGTQJS!EC#(+M}^Nf(Q7yTDyHk>P%P^eXgpg{VZ1P)B+W09 z{azc~4#dgn+Fr*HIjMBE_^ezbs>5ceJbXLXb}whtX8bY}rpk<0fnKdm($qrQ{G=o|H2geE_n_@bH#9BM#7&DXByW{ zF3tmbf-FzHqZWS0&b|~gIi~V>B^eWPiKxi$eFbq*>w7k(q9F=~9Nk>vjx3k4tgdIyCh(J)X1rO3wsl=Eb2C~{(zPoAkVEPvoxAi8Kg*0V|iHQDDE&T(ZxsN!TsRtydm=K%;H&$omPEu&dN;+p>@yT^W-e}sNgUisggLckvk5cGJ@sOOf@b9+1$*I2RADU@ZYT>&u7lNB!%u`%$q|*TZ<1p z9-Zm84Rw&Gu&S>l`GK`OtkGpct4*||r*6zaQ>L6C>|tJZlZC~z4rYa`RVUp05P*|0 zJ?f0HpqozRg_zH5S0PFAp-)3$b*#bl7&FP~Cp4Pl9{Q9gIFA*C&dK` zYxua0c7IO#(F9SkTN`WQ?xz|rx}U*_MC}i4LIwu=@8uZ~>Kv}B|J7sWL)JI4(ah;t zsiw)*#Be{qTnreLZt6y5wuBBI(P3ihH{u>{uZdrOVcxv+jd{B~tJcZ(l5ttd!FQQFl|LM*?Hj2tS|>L1x=#fXSt%-#|mZ95lh z1G#3>O~a4Dmbr`Ywn|Fv+{Ov@xa=;EHzty&%?X-5a)iY4HdI%S1W(Q%8T$tP@w zP&hapmpAxHLi`G!x9n&gGPGFlh16TGjF6mnY&)zsQMO~OPn(;8k6BL($hW2 zTvn_*&O;V^!6 zAuJ2t5s8iNCNv#)vE>VXA^ELz^Kmlvf@9ae_pTl5P4|#(+JZ%qsCha zEkA#r*U_enr}`p44EjZl9U996B%B%JjW$+3ezGTW2&A*C1XEL5B(T+GIv4?^8)e5c zY_8ohn<2f%@rg!sTse-WN0{YQa6U%4p0hU#>>1j(MLj-Ay5_NSUZ%BdsNA58Ta_k~ zPd@LMcbXXA=!8|nPcwa1e~)irY=ccJm=MP_u>F591O^&~fmvw0G%eAzm_c`KVcLL| zH}gQWteow7TZ;H->nnl~<0+47vt`9ffwj8WnJ*pXZO7_qQ}FC)`qd6ei5)BS-n3@M 
z3w?`~)R|UJdGPD3-MbQIzM$#U)E%7hml`asodRZ~#g|fg0~M~9qJne%KX%Llb#I+8 z-ZA@$jN`& zh)`lX3<@ShLmA&4u#L|f?MLylEvF~sI-M}h%9iJpOho67T3D=7``^5}dp@^J&PVhe zDmZAS>pW;5$QpAdluMiEv~TVH1Vua@X&wB^u8wu&Ak6&n@Y~C6$1}d=VEgc;@S;DS z>CJkvL7<57wsn(~8n{xq{}*0}yVO_TMEu*IlCVE+ceqhdDCF>C1besP{YZ-8(n`2v z)qrtDEhgRd}%ydFM)frv`{Ny zf-+d(yWUj^2WnSApa`K!s1|EXw>5)N^mDe;M=^oLcY+k$x^)=RFf-D^Sn3wqwCr^c z4QlTAIjx;#t&PIh1R>qk^t1}LR)kgOQp9citLZ8e1{R8W1Xo8mB_=+kb6k(wc(L$$ zTd$?J%`^vZq0@6S0jiVzqn8|2=4e-IFt2g-WATY1!n%ci|BUOqj4__iU3+cjG)rV3 zKT|%4QqNXg4a_Cz@Kwo=UnP0bCnTnsl7rWVeNWH2yt;$+`s2|s!1vr-*SZU z>6O>06&FDi1=PrkKC6sF!$LTTCB16!!j3zE9bBJHclJ}qQ&VqKVteeZpVNVr7Jp!n1_7V~0ZH zvh;z8ArVmwVO#cjmdKOuy7?myumlB^(>j6Zsm0TR zJpVO3kwh9w(>y)Gk&x{In;%ZCWplDDbZOoO2F~|yHC%UtENtZ(>9s7&S7^^e;p?2G z3a6XhjUL#hiD5>hH%&bCzHWluQ{bK!gXCugmuAhVt`8ubIO!tL{ zYlAMTIZ~ODMIBFN*ukj8^-H40RlE|D)pnISutie)+?@DaOQWO+e3AgAvy~pMdKV2e zfvU6_!O@2c|M}6`K6vMS>`u4`M{ty66WX~9PRbJw;IZM|kd)5*g>p<>_2xGQR>W%r z!tgxH1SI)c$b(4UkJ9CJ*XxNBhY?gZtIhMM6E?vj@?mI+#i+V>fy>wChUGZs1<26D zAoivI+5PTpKcGhn>Z#w)J3>g7E%cM}>iL!f?ARRuCki6j4!A2iSy~HC&cDB^*76U& zuTgenkby)FowNnkZEe#~VnZ!MRZN~R)@^$+&f-8w9pzAjk>YglDOUUJH+E&5^KqsN z&HJemdG+^^m@hqCxTq&I>LWn5+C)8ScJYgFC-Z64RJzhBXnaM-F-q~;B z>EvaxlKj0=osabDeY{6V!c?f+&PbkcOx~ur=b_>5;2_Pu*vG?~-2FXzroTHjTHbt8 zaJ@Z)`t`uI?Y5J#*)mB&#)O57(|4%PVJf;`Mpymge>6r$lE3GW57~%e3MQxAo@BKR z^FC7;XpVX7VFJYWR$nQev`+f%(BFYHc{tvY1|GY>_U#gvg2928R#!kP>KF&mumo)? 
z2#e9Cw5v-gzs>wZKe4KCA?8a)gdv|{!`Ry|gWa1Fz_Ffc(9&h~h$IfjV^6eY5*wGm z@O&Ywf9_Z+##Zoe5Ud9kw3z^ml6>;DtM>un;n)qxz6!<`h(~WiSA|ux(x{c*n^t+oN=WBab zM&t`*k6XICOILBEP%cwR0!?s=J5ydG$M-^}?-b^w+Er>R1x>Dva^@&< zvVFa`bW5GQIm+Bz@$o)&A1tZQef?m(bxMLpsminSaq`-%J_7MgJL6__Lk8~|x^BnO z4AgjNN*aHa9^f{d#ETD+A&|V`ZXyt*Ft~^?LQlORM}eTtFBWdCyf5R;e4Lu{gmwNQ zuzNi2Fo_MWZNna|kTldjYiww0p78Qqv!8jZ?uWeUa~DOM4r)lAc4cLPTNTL8vcODvo|JsB{CEiz|Rh$^c+*CEucN(dT2zMAJTf(Rir3heN~h`Xicz)noBb8D?FZZ8E|PR6MMdWT^9#$O^907tk9vh~ zd}O?wPKO*~v{m}-HSs2xlzFLCnCzdy?7%)5FNhOg1rWSl#OMeyO}a_D!vuqhLqzJ) zQH<-}QfrSm5!DASl1FspiMws+m>q%=+%GFMsXd9>+u7f_zpvm~NyyeX==|01Or{xr zkDD;`ZAGgnQW620ICZ|D8+AGH(mhj;7!h=Muqk(qj5od3Lh2u5;k^ zio)|N%1=RN6h~XY*Rxh)C3Zje z(L1q`34O^|dottX(%>=Fbgg%0mj2?(7)(YdXfpc9!qV{`v{MFOAg#im2!pmoM-Z$egsb{=ZCmd1o9(*;oFbkPg0f| zdNvny5}pOX+qeXrWc^bXPC6vB}iQ5C+uGakJ*9I z^m#Wc_zPLw{^b^$vF!7+r(hRcBMF>rSofJqTUYGNgi?F}0NUF*{vxIo121GiLJ zme2sSR2BTK2Vc9X@WrR;jG_==(!+#M1ZWaIc=}ob8{4_hkEb9?PL@ZzF?F!Wc1gDT zuFk^ukp%-0K0rqG_Q~P3BeD-3WSsn!zzgFCJ1cSpxY*V(-yi)Hl)aWsYI;D?Ww;0(85O-00v{u^$S)oS_hpc zOqgk|p}~2s2gGJG(z7q(l*yaw2l9ewk;5^QYTD{0k9bn}12#@R4XhL@-R&Oz0aR;M z8CW;iRes7<3_Zks*4J8AV;7HO_m`wV3oyoik~z?n9nC@yC-8a((s(lMIz1Gh_5cu> zfkrXVH~9i!-M9z1P*DWZP2~Hmy^BmvAlS_tvd@Xq-oX*z{Mo1UGmnZ6p#F|vKqrHH z;sEe(HM(oO1>Xx7_R-%!L{>0P0c`n?>L4&9<)xP9j#Rjt>b`~0yu+e zA;J)nyn+^xoxF@2D2BisC!`T9?cZ!ID58oZvcqAGSKy7|vWzhdI4l}ao#Tg!SwAic z6T*}!fMJ?@-0Suz2Z4qdv*dEQK2KeTq67DP6T3S;deK$`!uMhYj!?WdynY)K0r3lF zP{knoAr*Hv57pPGp0=s$-Da;N=(AD=F!UAl8 zL!z&oJAH&8g%)gh9;@CbrY7PKudV513YP+$;qMW-aO^JfHCeDW^QZXmraU}@O*)uv z5*rX!FV%maCWH$P#ShOA__e%x;k99V+$A-!2;pb^+!3DYV|FYNtVzbf`Zrmdq%cIN zYBEe_$hDQS9>j_wEejIBO7uTSLFUa+8aE5J6$5d0oZc6bqbwAsQ zc+LHlR-D?BC^S$sQtp>9_uB4bdO_Y}kKNU#UnRO-Q)xKuuclv!mA$u=j4Q2)txwi> znw59>#c_gr0;8*{@^;4HXCZ!QAVFu+nyVoD{RRVhbls3v1DPY=^7rK9XWSBvO?;wP zI!q|X%(Y093`84GPAE@aL)a`(e7FZHAC7$(gM}3nxJs_6hd5XGE-Ejv&7tABLfIy3S#LTgPVs-XXwCfb)e(9~TpC>L8+j@vCPirns1>szk~jxiDrFqJE#}>fLv{ zABwp&3AJH(2~VI?Ja@o|*fIHlV2@A&Hbgv_ZUfOs+F7ylGk$VX?zyGZQq4Zp_D8an 
z*G;4_VdtN(U|y~jNw9oCi+3j5=IukfE1@Fo3Y*}}4 zKw`4vWU-H;USwOgz(CNkoySlVMIAmojcZ*{Wx(6@<(@-gwzdh=Db2PsDUX3vgAoQU z{C$LAh8@O(?B4%_BX(ihBLp_?Z&Niz?so6c zY{2(MKPQ_(TI;rM>UfoyYI2II+jGIymvFhyH6WZM&an&MDTj3S`Y0=zLM_t#?QUH8 zE0L76he&7IAAHm{@Mqgr`*YMO5zj=bB1kuQ3=!<*8fmlgg0*9K-?YZI5PY8MZ1zKR zcFRZGDzjRmoh;ar*=<*JnW;Tj-M-N_{m>YJM{yB=ULyg2%!o+6{gS9m;E8<-DeME^ z(5%({saCK^`7JMfEFhKD`fkxA)_W(!IDx@=Vrm#+H!`iZz=(y(eFV$Gl}1 z*H=WkLftm3h>%p=H9vA20MJMlTZtcti$bkrft&uMw+0!3y9DtANw|F3+C!_K&ppsq zykE3=rKR^(?lUo&P?r zqg5X%3z_bY?2!Qt$?7G$2X@=j%;EZ zX`Sj58-bzXUanjtr`AzlNpk|FUeXK@Gdn)Uc%^*|)D@GM0vc~Ir^WDAN8kc6KJzz@ z^X4?55~@X@1KGYSntHk*M^OJQ)xLj!u!)6yfkyN*grtQ7lI4A#dEeqYpl?HDRCo&Rd+zW1h3OfsXJX}Ixw}$Wcnd2*zCG=Q{B?xurD%Er41Mn3o=;oR-=&-h65yd4WoKEmf>~3H7C2 z>|NlpZfV(zlDDYF21dtT{kRfAls4P$`dcI_3^FWPrdYXm(H;dNwpK=MS2PnH3Wa;e zDRsl?4@SLVmCK-4NV6?hSn$Z@M=4jn>x)2+GX|e1kaArEAR}*i=){4=hN$?rn9Y=6 z=E|UZoPcBtq)Xvv0T^>BX^}>cx3bja4SM+%-8NIh^FvC-TG~U(fSrj^G?0Y~9Q5RD zGShH37#vbcDf|opl+})qmz`QGV}#cx06cmAWm_zIb(!=n46zYvbV zAABS#oEEh_YEG4d)|><#dzA-m{O5a7qUs;$-k56trBCZU&w%zyKxV^#Rp1_0o=AH z(ONIXiQaB1MbU1bOST~L_>BQBHYlD0F7}k)dDwRPgX_T=f)5OOdD7fftM7tBVm0=~ zF>KZ5nW&G&NQsVT#8 zw3sV-OR9soq$R`nA^@*o6nn+~uS4@6Dgwmp0+5@3UTr|V(BgaX^5bKu(F~`#-> z>aDaGpn<>GOvC`vM!%U4d9^_QDq1fls;*z|<)xjid?`5db3jS%zhAa5LhFE>q8-pT zujzVP+YM=Vk$6HM_AMEb?19(@|9(-zcO0>V@SkUEU;lU>SghR!nN~Qa@P3#RJo{2Z zk9K2vdp`UIsmlE$bOO|r|H!pLUBfm7I!MauFGcsa3ep%RW(N_w$p@fB69Z|nO#$(y zhZSiH@V!p;oNXY3g58 z-fz*X$%%J=r0Vv1dERU40tZ2e)(#G(lo$qIA+#kg32D(&|R)VEJB>3>- z(2b3I=C;6W1AGYpiIXje1V)nqhiDo$e6&DqbEj$%9fP@7;Kq4Zes_Z)8?-}6CNas+ zjz=}@8bQU=Ir+m69K|Z&cH(`f&IYh`CI1TILzAtT9vpGk@fuHGh|M%!Z3D$nbD#tF zj~5_R;JkfZqJf!BT~5yB6gA3jKC|V)>{<%=Nwt9Dz<*(v)%dqu1%V5p`TSeMkdZq? zLk%L`Lct(~>{flg|F`b-XABXZKrs5G>;9d*Oz8(F`gpkg{g3 zk(``$7lQHqTT|3(~!6JS9pF9Y6rrzw!4r$kgr}am&Cl!_TKLcbDLoZk2&ADyak7dN?L=tc!!5FR z0TA<`%NY1NATiZEppu)x;N5eF=-=_oy8b+Xdl$cB3uwNP_w=Dk%->fng}5l}%3~9A zK47_iGT!St5-? 
z1Ih5pAnA@uXbUWPg7}}EMB_A>03u6l|9D%JLaP3M42FSu7V(Tk%TfW5RkFG1 zaQg}w%YY*0+SAa$Qoa3!A7;Zj3Z(em0HkVeAyXJB+6FLYptfWUi1`rE2Qnx9z4PFS zuc(Z@HSJG*F~ikNWdZc{|9OHaRB|U|piEb3xh!c1SsMTfUY2e;P=oykB>rdf$ss2& zfE>EpPA?!hA6^EK<^sY|N=1*Owq=w7Q2`08@m-w13lIAK`w20T$65e&Q@&L)b738W zv}wO@9Oe9fn6q7$5RLS)oM8AZA%Mx~ceVv>Max&vdItgJR$MV!mjVnq3Kxh3;5EA5 zui(>m$74+}DWvq~`eM|j#xYiMHUHMP%lALsKaKcGJQy6No4WP~uQ{912$)4&9U#F& za#1@EoeHdrc?`F;$|@kjcX&yO4j>OW4OKgUxrY7MrU;5B+KQ0n*oJ~MQpb`f(-iQk z;;RI3Tf1fiyefbtV(m047zI}s`|nr|;me?QZP_9VBEEe#5vA~Y9Ap>eM ziFd^8z|vce(=fDtd%KXjl!4U9Ut=mT(MBMpfIQhB4~v0+P6uNF5Gf$n#)RD4YAs-Gfy4(oo3>){0Cg+3kQTkWD@I*!dF@nezvGrZ$4@y= zbLD-k{DT7$j=fbvgX&VER7HR5{h*EYpJoC0-G8_nxC8AzU4(w1s!0xn41vJ|kdg-b z#x#&kP&ov0EJE=+cRJ=(YWSfT^p~~#J5mxK0X-FJ4&Jv0K)YfGM3YbPY{t>Gd*4vh z07+0-)G7(w-1C3$m(UGDTzQ-cvhz(~aXrC^tEFxasYgS&|8W(|tKmOT=7oQM)Z++^ z#0KCjA$$hn>#b|WzqIzCtmTia9xia2)dq|Sl9Ji4Kn53JoFPluTu`#GRk(rraz)I^ z?7jxmsiq}}_W{EfP)Cm-bJ&_dqnPAyMQGFc4@5nb&2<7P5lqZy98({>c=np$ry)m} z$(7JSXJ~tT{dar(`@Op`K!j@a-F%Yv^#SBa0AbxB-aX`kQ*OR~p!$cJ8{mJ2amB9t z?kELK>mh+vaq4fWpmT=dVz%E-6`rbk#->Z>f?gbuFfe9Ei96eGfIdXZg&sZtx~$c) zne+i!AXNNkDU_z$Ne0%ex06?@5y&WPaEZwnH0uLnz*{}!*ms!dfV=)55ZvDe7F~D3 z2~Kx`u%1~6->z&H(Mt-kyT= z@4X$~4cn?($P2OKPK5;!hsxe;3IQGIte~N;$_08z_nrD3dF+K5Fl9map5z4%PQN(? 
zJz)9%DejwqTZNR6Acl1|C1HaBBqee)0!IITbjlJWqG|+w)~+h4Kp~(SBs!Aw??CDG zcPK=8_wN-99-!;ilTg*0dcv@}Vstg361)CC%pQI+z$PR&A1`FRHU4OXV^b^~bBXGfe?xihp48U2h?&YXCEagcJWr zr$FldAkekH@BOPl+Rn{OjU^TR!~(RuEcBBcGI>@$Yj4bqU&^K}zPdHQ?)ex0_*S4` z!=nH)9eGq79>27V)!P*#qF z4w`Wi^Vy^zD0=^Uhr$=P<)=6R_!Y||Q$f&<8RiC|W`_QMoHiMxfQih~_6{`2!PUmb zgN6c>=JUSkN$vViyndjsnn49|q88tDmi&MTOCH;{8>`5) z+ntFWuedol!!{|Z@s?0ve`d%-8HEvwL7l@i$!#=7PL?+lEP)h+>EJhRCJI$P#3e8` z^iWR)2bG9e{V#zYPwRXUA%|H)Jda(0n`H2Ps7iz<8UvN2BNWJ{!Vqa_xj&8Ht*qIEL2Oop=xsduRw^3(a7jtwjq^6J zJe^=l5_0MjLMoqrno8>!nw*NXPR+>Vk|E07=jHqz_&Y@2i^Jyothulhyq-4k3aP*3 z@+Bl&1sEq#zT%3C9Uo&mC{w6haUs?n-503Q$DH}ne?l{152OOsfTA!8d$xum8bYd8 zRlqX7;+kK-yUDh>Q3L!nwza%2${P|3T%;9OkJ%rS9kfbQ*1n`-(HGW>zjm)E8s-a7 z8<@zxTonT?qX-H@0L$BX^?V3c5=+%VUnuKSbTZitx^wLW%bV+<+!bw`B ztfF4NJ71^ZP(A=3ByOC0Kv5rvL_5hF7oH@#95sx=nxgizGvH*E>(p$A?EG95=~vNY zo|$g)L&oynrLNB0aX#Ol=lkOIXak~G-6kJijmyQv8%D2(W#r~Irbp=CCHv3>yAx+z zV6lZ|4bq4*wx{MtB-@#k2t-32<;J7jxcR!K*vqdkgl5UCzHbO|8%PHPhLlZ#r`oY>uLcE$ubMf{t}!RX0 zt3M)9ObM+f&+qox4h$)98(gfZ^cBjyJLA>XnXsAYO+~V8mkvbZm#*zms$csO21GZ@ zE)0CUm2U&&=){h}-~`YiY%{m1@=0+EK(x&m>5l2-yu#N*kW^ z+Oa!I^SCaFCakeJ{UHHGLfuL_x%2dcYG@7mCRJ6aN*!#zFTlZ}Q<#<|z}$Gl%)dj_`-$ zK=g?mjsi5Vr+1*UdM|_VN*(32?Hz=*m#2nHyy5+|e3S(^=+Y`2s+ zKFl?}jzFT0H1HBFlW{<2)tTcjDcR;mFa6BU-+MCdG$8NpwD|*1~GjDU-lo`z4k4HRE|BiTaD^6opvra8WMefinll-dN-{{>)uS`Rk zIp+-FFjopvEnZnJnJqy)DBo$$t z@NBbH)TT9#b%P7a&aN&_%$4*fB{Y7iH^9?eobE4@@Ti5(N;@w+c-8FqtGeJiyz*`E zbpe0xO>GzA3F+eVIW8oJ20*mGd@2DIW|EW3RAy%Sp6=1-dbuBqnqrdExzwR~*|QH;Pg|HPN~rLHymG)^PA zQ_|xY$J8p1*?_SSm2>|Q-_+`gV&Mq=(cHD}rP9&dVWdOaHErR~cSL0mm3h9ov`8nV zjff_la5!RUlOGJ2@b|B=92XjbvM2G2D>Ub~fP79$yYxaBqg6%OO+3jTs3@kOQIaqn z16ujw>z&@u)xb=k%%p5^hLz6gDCMQrx%~lF1Udlx%5>Bn6MDHI8v-+DhMD1W4OaiD z?@V64@+LYmvUz4eVq)o$)xZx_5mxYv=S=kDkhc`274mn38gLprx!QG`WdMgq>F|?W z&NeVj$X0biP5X1?HhQmC-7d32vaf)4ef~ho?#ekvHYb+TmGY1Ghhnp?#Exr996Lq( z!wUIPJrz84nyWdiM?KD1~(|`Y;DMI+-6{bo7OGyNmgE<_>3uYHR@3!lK(rzp}~k*@@fD38Xac zT5jD`Oo;vTOxVM#ZRcDv5 
zq;7xBb<%rNH{H)6oy41QKwK+4m8?Mh&|#7{fnn(9YLeP^;yk5j@IlxLy&~`O_&_oIm8%@0sdEztc6Qc>H}(5IvlnGd$pp+g+41tUCt{?oeOWd`+5KkUzWX)Q8)un? z?JBB|FKy!4EL2su@l=dC8eRo76uinGnxBt4n8=5dKQIz>xaz&xFO1?X{F;7-p)SX2 zzt?=>;s89Un$0%K)*I?3BTHpM`37IFdu{*^_n|x_3O?!v>%UI_&BtQ%#_LU z-{fL(?aOA=RhT&Z;89vhcFin&(3n4EKIB7C8k`e3+9WyMX^!JV(W|Xizp=hyMNOY9 z&|t~YU4FBeK(lwY%701IW__Gi|n=1_>&wr!7Ts>Auv_rX=EgF}8#K z#tzVD9ciu^DyJ42K-I5iw&B`QmuM(&elPT z(#2=(nH|#1pV?)D@JMI}rsA_#{sNOVsN8H_@Y`c<@)K>d^5?!b#u|I(L~^l>^J4F= zPqkfD+f{={&ySvxzM9;+=-T5p9cffN&kc&&Z;DoASDf&@nhuMgRGJ`4Q>|AC3~*0m#eiCdbOM(N>fC5ivnTBUCdXy=SiSMmr5Snvjb5vfLj0N)Euxn&r1uorKEo?$=zt1 z;3_+>wz_e1J$tiQp-9}yd2Z2)o`P1u`jhjDMmlk&hi_L6Xf%!E&=?5_WRKZIeqA|o zlPB($E>xZ0QFqoy(8}8nMo=e`-S1D*7-v5T?5wEPrt7D1Dw~%-osE(h-thN24))hx zhynIFzsOi7675G~&GvlfYvbx_O7}t%d{)B?(sA*-1a0hl5o%?joAD(`eZdWazd7;nV@XGOIPZ*=BcBgLo zB=P1FXo_wEjgOBGG&N^g$@7`==P(tXU1<9BX^!VhG|>W}ta?Iv;+&yTy|y4VUEip> zfWMb)KfuG6P0XaFgV3|$hI<-sek}m9n%teRo8B4aCJ6->i_5-g@UMtfD#qID?zJg` z#19ypmx6Do?qIL)fPc|Dk)lmEXNfk^4m>=irMvYuKJ%&qRTnEir>b{PW@hYW&ywbf zUnjGF3eu+4F)7u1ou!3pV03uqUCb=AKn3Ss&liJnIu2jn8S_+KJOH;RtMygtaiSHIA)(zz^Ergg>~$Kyn;rt9;iqOc;1DNRhq(c6&C zco?%PiH@!1=))App0nj?-@|ux1|9})JUt$!_C_-(zf{N_O$Vo`2ZD5{wll#vV%8VG zG@Kho`n343gNQ~3YssuLt4sBe_wp68NQYP4fVRCJ=cWr6g5V5G0*PKn>L*t1R(ZtB zd)babaYvhb@8!A4-HnEGq?YOAb<)rZ2fs0m`Cx>)=<%G?y6YX{$5$9Mqoe2>VhCR- z60)XFI5!*sacK2{8QXwEo9zu8kEz3zo-cz5B?m19(}sH?SF~#9sPaBG`K{Vw(eZnM z#~l$_)*lo5KB&D)Oj#?#Y%a3rOo(6Sw6;2I%(upQFjR4%I zbH*mLHtQ2!z(9qjpJM-=;lf{nx2K@ zs*uBjMtwYW(n!G@$H(IA8<=$&e6vM&zXud34Z<3Zh@_7E7Gxt>JsD-Q{jWCE%JThtg`pAkg8V+Mz%5vs= z2kU_ktabAH9{krGs%fHVQeU++dF=@(!=;%c%?Hx&5!sjxuDUd&GQ%!EaY$}ZQVcKe zUx+)a*~Nq7dvEeprj*@fWL4_5RQyxbxhG4H=j;zV#HQDoc_d-Rew}9Hu^Enh8@M94 zZ*Y{2$*QwQ9xT#8GtS&_<_fd?Grl+cJzlU}y`yaeFujr}v~l+0b{QL%9ili=>XgY( zcAoMoRp!;r@--ORoh0M*ukg<}Qc{XJxhmm!70)cuB=jhl_-dsVmU#Q~#K4&+rE*Nq zF3X+9?v!oDjblS|qv6cN<;JvPZPzsO#%upJ+$hS3!TzS^#y2z0XWtO|h1FF!%v58F z&9wH=_$DfhrR(`OwI?3-_up(N&}U0!ot*I#N2QUI$=#5-rREIhs1RPvr}jnU}) 
zJS(<8D)L1(LvnCV!@GFVTC#04?%Xf?%x=~((YQJJL0h~u|E@5JhTGr(o4d!$Bu@2$ zam1kbK6ej+Si<;2pE>zwJ0Y{{UHr)sA_t&r`5pG#8Q=Wbv?A!89Ny*m5=hJ?d)*LH zoHhqU;Hv_M9Ew}NBA+qrbMUXYs;p2`)qL(c06N5ijr`* zep{Y~H!3d84|9h^y+;h$jK)4Tv?J12jW^LGm<5BN3`pN=sn=%W!|+b1M8*Z7LzM5b zBRG$I+}`ZHqR(J%)Lm8ZyH_|=M8|JJO@=7#yMp5@NnGZ=bj?Ndy!~@K!6fdE10VN4V%=**C8Jdqo~j?691Y9NG5eV#Yc{vJdu~N# zBph7izq#Eu-GFhwSS99l0_l-?9E%&1ScqN)>hhj0Lp)n_WHTBe+m~KPw};9MQ|RBi zNPL5HG_nxY8f}B9rQ;u=YuF|fJ`_Mp{bF+Z=|OWK*CE#dOZG@^Obha{>9fuljK>eZ zmdr1~a+>-G5zt0VmK*illTYgRXCt~x%i8jQiFiZv><4;b1$BBddfrxTU0$m>h;*b*t zX{j!0laJEpJX~r}6;J<7=ce9o3jAMwuAy)OJ^vJhr zuB#r}&7B!4P~+)%jk&;}Za(o|>}{w?qzagm7;Yt$^y2JED$v|yEica{{oq@AKwJQ} zV8gv=5)N~o^EaWy&XP-bx0BZpRuh3r`^!|V9T+e>QkVArQgNK@pLKAYcrJ}=473uM z(}N!eosFCdnlNVFgcn;cUr!&drrA}$j*c?W5i6wOJimZ1SSJ)HLcR(?!zC!uv)CF) zX7q#W((BC-NUlTIY`kgavyKNx0y%acNk+BOVN zzlR^uXYLjl!`;&}YxwI#oy=x&XnxReF6ALAO|FWgy%nkO`ogD-%(Yeqo=2Z*5BWgk4h$8!tQv+06842d0(1oU2PLetx^KUK^v83WMu$`SvBnITkCN&gFadtZ6^a z{f$y-{&4Imo)glNNYM$Vw+XpsWUl04nH<%wRXF;z8i*a7>iGGtm-}MpK$-rl>KQVt z-V~J&>iM7)!u_9|MK>qmSXah96>lO2J&jS4;Xa@nUrLfxhx}A75*bwXk>+>ckj1Su zmwqwjM|`GU zpE2vpR_pr9F2YX(`5)+E(r%s?7@q9YBoET7&c2$oLTI?iWDFwW@jm~M-%^Q;onp_F zCw8N%qwj0?wt$LG&VOcWJh5b{^XwAkxH9IshgYqg9B8d%@v7S81Lrtdn(-}<#yoPU zH}a7gX_5N%qoLq(_$P6ENQ@h+FyZEwE=?VB`s27=VhZ}LmqN3D6w$ZT(%8t zj1?k$bEAm;{0T#CSw$YaEqLP%Am1Va78~zHEXVsD-A1Ga-h`aH(cwz20I z%H-1M1=c(DXd%U6W z8YB6wkfp(b{eGOM4x?M`hRJ8)|$l=q*X-?hrDdgh{ zJb4~kZNb|%F0X^cZHJ%L6+RY!3ocbzh!Q9$s+rcxPwBUHkM zhC>xU*o&@G2AMu3)0V!Xxmrz31)P%t8PUY_U3M zivgiyrx$v76KkZ0$dADXgU6Rkjd=x{{sI53gB~X zZm&R2MyQ1AYOCl4xPPPF0P?e`YAfs0z4rtLj!*l;dF;#IP$TrWQZ=`EwLM}Z(k z2cOOey3eBE>C{5km+faVZn1bA#SfmAbP<%eUzfo^16~acf`fu{1K%K#ulRVA=_>Q3Uq>u~TfiO)&TkoaVGHmfa!Ob)0rk_e-mNb|JL)4OO2 zeJrMujC?%B&B^cHyErc(8i*ep^X+O-@x00N?x@nYkM~+!P=hfvzxeCD2x;G01l#Y? 
zu%br)H9XQ+s~9u+9BcQ*y`wA%XhP#bo9FH9P7tz8=689l-;Lkl^;GuA!%1!^Tc(~+7~nn)fI|$c*@8TsObPZ2Kdc|Qz3cHw zV1N6&*~{ z1;9}CJg0AFK52U}AhOez{!~1#a=EA=7OS)0?>2mQa10D&3Ry;bl^ZW*kA`}_^8ASjC zqILSU1xC>Y>jbxy94A0Hk5AM2*`^K{JWhLFUQLG8Z3|Z8k-+GuYB}WNM^VD*|BTji zXtcO(I>Rl9PH5=S7dabzci_pnfRQ(H_`?dnO!D!M1`qjD$+NT*(UcxSB#_&2=4m!>W5#S|61#x`h+uC2WBm=99$j^I8-{*w5(6t2DVg(;LyMA1J7rf2o4_pt6Ht?>)DlS z#Z~^P#;!wBaJE~l(|V=X8oc%9-lpWy`$rG*NEY3%5axzVL^+8=NLJ|CPu#Lo&(>KAe@ zaVR0#{1yzY00sHKE8u@CBfC8yZbJ{?fBArzANJpy4*r)u97^el#-S;i3{udY@_xP* z94J>z1QX&F#W~LJB$-~kl{bxi{LlmpZz=!V6Oe}HfHZUy%13j)ckLAJt|Be5z)Fb}iG~$@B;mLeXqIgdeva)5;K*(6<#b{cZ(eF(val{ynO=R&|)Axx_9W*UDv$N*PkYqHS~4xV{O2mE9vM&ix0-#-mBBR$~N1B z@Ar%bik@bO$NJ+ksL_rpD0zKC+d5d5!sdHtEgAiU;D;{(OJ2xFr<>UjvcYBgm}Sri zY)u6<3h&wwAjgigL#Y?|Ig(i+=b1o=@!m{*t@Ual5B4J?Y=1HV*hk`_5SR; z(;3Fe0;k17m4nzA_Q}-=}P*80grTCVEHQ^>hvv?abjBh|->PNQjQ4?j7~^GjZGw6v96YyT{?nYdX?j()O{-b2Vnu*)U6j@Rw@6A6Ta%SO6=vg4 zp5c#*`GseEJI9ZO?cgw(Q>TvGcfpHf7Y->Sf zx_4!tImlS*RtJ6ruxd|W)4i*^6E)Az%a%88Hd7}7lHg1)B=k3ywkeM_zF_Hg#0?L& z=_GqDJ$rO;iiprbx@=K9_`94@2co%Ro|2GuTX2*I;sIYHNLE^fYR6?mO|ZdX6FM~i z-;@zR7(J@8UdKA#nU4RpGbx=|Yn>*W0U!8${cRPrfvYoXAarDL9n5`OXNi0&M=cr8 z1pgugQy6`L?#6|9(k--PsaweK^w`&^mc#su)M~HPiOgf<7eTl|T0i_=84hG+9Vs)* zB~)3cH8{?scYJLBXgQoK)5$s?vLWb`6-=7ub}Ha3z4fM{Wm)RR1-6EGnQQT>7!CP3 z4f0I@E+65e*Kv`@UE&gQnJM@;Mr#EjDyX=OiF{&)EV}6*=UH35x8#H@#b5YRz*Fr` z6(_QrP6-+gW;_tKP8I15+Va65!Cur4PnXCR4VsALl=v>f!=OG}Y>MQEwb~ ztVZ&A0+lnp5VJ&rvXLxY1{EKNCl6;~lz856kwx+giyHN1TeMGmo2u%M7N}>?%25vl zn@(oi`-RJYPVDZCF4VNwFd)(sWDeeL^ALDL7kscikgsnr{mvEbSHL(+)?oOffNbNF zke41yhpWB)-}ki~uQ!kk9of zlv%G!R5wRH6IGcR3H~fpySA&%*7^o^qQaIWszBB60GD>l()pplq9JJ4t6lA6Q(ZQErLgCM`gCdHAL? 
zk*2U3{gtcaKYJ9Y&HUo^dk5PQ{w<3ihYYu`pJ1=I1zGkIq5wyBYX}2|GM@A+tg>G zDr^FDmIURW3>Z@^;D0TcAM&}jAd?Dbjnx?v^bTSpcB%Q$?CjVmr!N^!MZ3)S<_*6O z>r4pebHe`qdsx&d*tAJmoWJgbTLiY>x7^(!AuxgqPWrKbPn?v;o85Fqob2TMB{4p1 z$$b0gz(H2Zd|M^7lZCD=1K+aYJ`!!~qp+uHj%W7bp@I8TdPS}`GcISuC9(EKeMGN~ z#iiaAy_rwh4QQTii&L8`o9~%Lsz0$}J;;k?StBr>_8ooH?j6pd;rbfSHDGkSa^0Mi zMO~^p*xWB6fsoHbhnNP2trd$>57kew)sdu)eXZ@9xIEsIyY5=x3|+^dnV)mSA;J&A zwv0Z8vFrF_-cz@I{A4~CY8Ub#wkwfxW8rmM=zxl-hivb_hq? zlBR39Uf4KyMsOrMgC z+goW1xdu^#!{pSm&QHwyEv~Vtcp{&jfLO#e{il>0asls7xr3k=PZHMD<6y@Vn;wva5y<7ty`WH3q2cK~9ha@X?FMgrU6Q@}1{dh@|EG7^( zjr7!U%49E8zv8U3c#H|1_wHOX5(c6kQL9Hh4(a`$>>BIvC?cv^Gef5qc6ogde)^M! zO}FW~DqR5;XoDLMFT${`9TA^H8<2Y=9O{PFS)3u$X`~TG-OIyDpmXdUF@*~^<}iMa zAcdX9Li1X3X>X85x;mIxMz%m9hR>YTuQSSJwP&9h&AuRcUDS*3sX(M+xJX)AD? zGC9Pfe=f#Jfp3M6Dpx$#6vL-MDtTlcx$2Qvd)JUuE<;L+qfjl2-el_B2Ti8e{<^iQ zWc8<%6Fy=uDTfDEF)U`=+Y}Kd$M$|iyq=^-#AF7x$n0y&?JQCL# zlKa=$vl=EV%=osKwHXw?QXq^JN@0vWvp;CZJoDQP{wXsOTcV%UXf_jmLON3$!>rpS zi8fJ=i6%|T--(GKWMyT-jWTx;5$izIAB zj}WzN4c=SXJka}#i_b7qO7GsU{xUB^XrQbb zsBZ7XH)x%jo){r9F;mj5JR#}?%{4R?`od8xS?|EuDl;u3ek7*voNnhwv^B2VgnW7i zFlltZ9@b|qemMghw{^lzb4pSuTDkZICjhDt@>1QUy7}SKg1;+X8x$8Uo}ZX_(X`tZ zLV_dAPYx^V^X&}=%=Vf9FX4YQL;`*9bm((tm|~$2kZ96BV`=2-{e(H)#+G!*qTlnf zd~;~I+PTV^ElB`Y4a4fycle8weZIw5!O?>RJv?+t^w3KH^5|nw51DK5WW%DMp?{l0 zHD|H8l5*54elO5lzlg+n*75sh267bL;rSwe5I%pT<-UGTA`g8i>N6yllpbSi@48;f zaHo!U@5u_aZFi<#pB><&NJjDT5L1*NPmPy8Dz~{nFO-@dmuU=6A?C6ga8MUWMP~@C zspdMz*H&$`MN7Oqq%xf>i8UI^dvr2{9F-&faV>aza_Q7!H?X%Sq^}wC5%QbU!#Og9 zYcvA(aLE|j2hMeEr%9DIhtv&?A9r1rI~5}*x_rfFs^0U>HpWXZtU1+@1~i{6 z$X=d3|K#r*WIz%^3#xUzE2;xDtdm9WNk*Mtx2--wVNfpMcizp?h#0|qS*KCW~wYANMQUw61+!&;dR;2@%Z)Ool5t5l|^!!{jZ&IhFK&c z`C%hADxS_Izs2>TWHjuv0{l^uX*0G8bG; z*V+ygB=Rhs!@B=WphYpp7F4Q*r-HX!BvxP67G6n7N3HHs%zi90MDv<>^?)dg3V~-s z4c;-4`p+RB>kXDoi z>5}dgB%~QY1cn9yrMn~~2MOu!?v##w^L(H8+57#iwbyUG`yUptW)XAU_jRAwaURF# z|2>=!4i*gC(Rx=X5oHWwAUs7H21`7io1 z*ql*^->vv8UZb@CcvH%f=`%~!@_T{&R4GFuNNJfB$}*4>QcBcY2=SyklV_{3yh&ThAZ%=)IMX%UYA4#;2I}x{52`YT+UJ9# 
zVx4NIpTsXcJMnb+cegjY|0OBQ)=*5*l(uI*8c;A#EhUejSZTd_?3N2i#!2bz5TgiC z75|J+9zLX?t>%Rg`4qkkCEqytpV4s2=3%UKRU^6%ogv#wFLregjaowb>)!reK&5tV zKOu4e1l$XY?+=O%7EHq7Fcy%q7~XpUy(8}}9};@HU$xt8e~BZ%;6_K!wU{s|ll>(d zvmAz@9Pmc#H4la~^ONxYp)f#4F?dt-Micy+>0hdo0aB66f2d74kNQ$PQ(F@s5Xwe28U6m-OTHSE9V~jGc_e2-lj31(g)2R~I zjM)uEt}CbXV&;8%-_zi?#wuXF5bZ?7-nw`}8bxx-m){jFqwTzreLLgMP3%X@lFOCR zLM{J;G>UBaQZwHYFwi;a$)WbDIYqln*7}OhZ9F2+O>OR%Ivv<+DbQ(uTi{R#2mi?M zZWIYxW&OJi%SG>syz?i6z)o%MTJWk&+8L)?Ot5wmZ_ z*tH8T??rl#ULI5xQN3sm`!n$Spm=A|nq`|fL0jqgRNjjLwAV{%^oZz>KLfs6ivJBCZHnr*d#?m0^<7a*-8GJS& z#5;Lkcy*nqj)OeMZKpa>5YZHzTAYURWI`oL&Y-fJ#Wr8h0_)K_7U=Uey4e;#iZdMX z=X5q22C?0VScIc@WI&Sx#aPz4Si6#Satn0>vuoI^lz+ASdG@9AhC@|9X*7xwtNcfo ztzq>Ma}G1nDx3ZhoB#CaglTD|k6D`C+trb;+W9!vr65CJ?FiY&~5REbHzyOgNY>6M93xa^<=CC?tPktNQJX+FI{Nr-NUf!br6z1Oo+G z;0;D8XOu>30c!<*VsQ1zdY^e{!RS-%2-s*26yN!|HLKUHzdiO7JY4$r8q|E&QID=W zKp#J;K7?6=ERZS6E~%3Ccy{kvv{2}1EwhB1B{8o2R+Q`S&o-x%w@1C@qKUf05AIi2 z_FspLPzZe(M%-QaR>|rUvFNcY{%Awj;>Dey;9AvAdWscF_z0J7+j>EZm|->eg7Xxn`&$ zAdGnM;KxJ+MmJ3Do>CtI&PVk~*ZE6T2P&SKk%Y9t>Tw}YNMRucjqZEPk2bbYl|Tba zlOSqbXb>$qHhyg64*mp_qUY@UZe$J=&v|bDJ8IxYU%JBYFYI5cM#Bn1PFk0Pap&iMaPMZ6r@H8kY=8( zE0fW`mn0JHL#3q>aZxKt3c)4AdEI>pv`qF{&-VW&k8Ao=v06;G9Qo<*= z31|fl_%5J#p?3X`VP}NQAG#qvsPrfF#9%m2)=~@v!&@lDu1I3i%x zHb7|R5G5FUI=nUjFhdj;N4@>{t>(XX_lJ_Cd0WlrUi34FOgN%rx$k8a=E!SC$>Xw! 
z$3`oUYLjs>QAdT_Ewx>eL3M<34B+KHw?e7-(TWLLNhC(N_4-As`*V>KI$549Gs zd-W^?FUecM=++AyAI~GJ!oreAv49pE8LUv)OUUq@xbX7K!SsM<;CaA-JpaM5uro6o z*~_^1qI?s!l+)*Ae)jK6vJk<74*agcDFSr{PH#k7x!v}2S)M<6r@c)`Cqfd}ne1$u zUrC-=?$Tu&O>j}mpt3gs?k(Fb_lT?x}U(cL6+?7L4gO!rr zt2KW?>7Y2!m+&q@>FA;`$cwa9I~xTaye)pW5K_9N{95nXP}k>?_1x$wRspH+yffv~ zrwX`!n6%cFWBxk#_9n+A<rKu)votgxPTj=_`G|umDi?>n&aYM~r5-&>Us1 zlEF&A9=7_q&C7m?nT5D-N63;nPB_dqu#HF^5=I-s<93{j{eBgi)ApQ8tF}5% zkXu>x&vIKqbx-$NfE6Rj>=ZIoaRQD1M3*+OeAV+FBpQfwzcs5}^BARv4;i8P@)1&t&<^iU%O~&`L z!j+TthmR0Vw$Fu= zXcmU^2>j&sv0-SKPjlY}mI->U?{(U_)p*fX)%z(8rdXetFYE?ALo1rZRqkILX)1B< znssVEbWiER6?%6iyCT262A?i?T>mtRy*U#-aMViT+l4uN${9dgCI1DYE_wl6AkBe% z6xLcyYopaf5LdD6SOT-V{c9?2{(0GW#>lmnh!k=bs@4r#h9m-QmA^o-M+-ehTeP2P zNPKb|RbUE{&SZh%?Y#1szklfD-n@dCbb$w#Jtl2JIY^xa77GbQD}c@`1<4tRZ;&4E zGjKAuv9<|Kk(RtgMuS7Ki;#WO?3s_o6F1ux(OZD>Q3MNRBD^w}!q>vy&A!hvEH$JXwZb z<`r*$SD=94bY@iejR=-XaY|6VQ6(>J=IA8jrF zMuOn>&gfE3n9Y_2p&^|rA?(S8@Ap!k){DCTqGvT|CVyi_oE8f@4lu~ThH<|ANy?Ef zq1o#4^GmWuWxpZb3E0m+yE_dID1{VsCKtjoP&;Cqy5KVP(qavH%6Pq_6x7u_!|u zXu#==Frgb?w7fjMKxX}kHy1*f?N2?@rt~qoyN|o2Bq4p{_ovC!!rB$9&ogo?U)G2> zx+z|%1*{foGL9^@V=>H`yl{Iib=Bv*F*sa>jPD@UDR-jQZ*nlmw&k@L_=S(hGfC~8 z{7q%81x@gE&4+8&D2fxZ14%jkM&Aw=x*+f$*aXC-$B0zK=lR@E=On*7D{4YcoycDF zguFWP^2i#$IZ_eFWpvl2u!>?`LC2InmQTzZDb?Gs*D5`Z-i^ioF>-J{Si=O2n-9@* zWYCax`PG28ci!WK;$t+-&Nne_@Pp%X9`a`Ol^Q@BKE;FL(XF79x5Wh#FA_yVzphB$ z7ZG)Cg#F&2zWfLw10it9#bjyWhHKT&vh8=AMmfM2FyGtu+S`khequbqguBDp#FQPH zjQ7xdE%LuQRqwD%5d*-lEj{tt^tW$Pu7*SjY0%|_97|z(Gvljg7}IB$5pf^@0cN*5 zk7D>w(oadk$@8rZ!?2zJ!Z17xZY-(pjh7Ly5I+<7nA~Drp%G}4pKO|u&xu4)>8-1zZyrYM?M zt|C4MEeT7O|6(62i)LPSdJz_BbA0be{^{0O|!E1g8}Q>YC-8^HP*O^^stL ziATAgl#$V|7?V$s4(l-(*;`w1k#si$6Qs3|VZ7-ID=#87xx*LhD$M)4=0~lf z>@VVYAVozzFz0jO1Qh&B2r9~pcu2ARkMC-lHrU7lUH83 zVF>35P@?Wil|?+okNd3vAvI>o4>M>b80duTxjiWU#>*Sj^%7*_AXEI^S&jgYlkZ~* zU#U*F`E6LYvOztdpPEPKbRxuYEwFBir}ThQ7-zf>`kdY6E~@ib(427jbe1?YrydPC z!D;wy=+^)F6a6f6SDh9Y;faz)IpI1+zQyx9JJD4y`(Nr^1Joo|&G|PX;dc$Ly4EUm 
z!(R8%`tK)gr;h|Mka$b&N6&0Bd%|Fp9hgX3@Q1jHp($@de1dqU==d<2fK_$6P-l^$ zo%c`Nqf;b&w-a))ijnd|?V)&z;;+aBvp;)$Nq{lQ3Rhdrq#GfEv#xLPCBW%BN$dt4 zELAO88NxUy8Yy=o%R~x>$$wb-1=l5K=}_S6-cDyl$xq8)sFgIagtB*A{ci&=)?+wD z<@-&^;sDdWb67oD1;!ZcL(gF`6O{Qakwa~S^sx{aZfG_6b%awWM^$pajezzuF12Tm zY7j9ZEr-f8Eo5|jZ_05$NQFv~^?iV8?%>^Bsvnp#RU;@Qie6k$@g4Q>>d`_B122hH z;!nqQTwMB=ww9lR|tcr|+YuTTJ3+X?PCd zA%nY<&1rN$u5oVUt7=4JZ>~2DNsjjuHkV@;U)W%_z9i?OwtC?g@)#3I?m2gxqYC*Y z_{`Pso`Qrd`^yl@?KO8MmrhZgtoeN#p279`UgCjN0;h%$4s6}zNi2d>+;!`wSE7#EubGyJMM%%DR;vyo7;4s9Doz)*F29|f>(lMrF7v+zN zqQj!-QF1bZ27^So!R>%XO^iR*2WatQA{SsOABw163I>?x)~Nh0y9pZ5H$5jbAZoAQ zLyjlf%cLf(Uy4}OZ6{M*>%L7(x7j6T1Es(SuymXQgKf4KfE%9O_Ed+e<%%sf+ zzA~;v&lsdlRp(X^^c%K3~`n{TPQOmrtc18X#k{`W4#FPZnCI0 zvd1SjXf6LSCp|Ry|LxO46B5Hfw&lxF!ZUv)x>&kNIf{PJayxwpk?3t$&J**b8xlyj zOyR=ao%;**^?tC5dPCX^P^h;puP4h;a>P+&9Jw@Fg(Zm6GgL37Db%EfbDXxTfxP9f z6Gz?o9S-1TE=v=wah7d0LDi4w&7{{Vn$$ZNB-lv`s@UjhV|fqvNJ|;_JVfCR8)ti) zMVpUVw9%yG7wQP^VnMLUK56|;!GdbsTF^1Sj$}_jcK~uNL_UP{Xf*?*qUjZm4I8X_ z6A-KL)ziayUmX!|JhW+jjdmeWMv^F4q|yPM_k>-+LOeqgM#Qd;Flv>xdp~-@@hU)*3Slw7o+%RCLq>) znL2M+HS1}WNqOKle1_SK6w{iDn7?-FqffH8_zGS|Z4XNGwjX>i9c3EzkZ?PsZr_)G zUGQ1~Gy}A*<3)mi<^o^tvKuBckL^vuqYtBzA<@J($*od~U8I}{k*Fnil(9t#DoLLX zHNU1&eYa>>NjJev7&$78Q>%m=>MsQLI=H|-C7SrsJ{k9AMsp<0^sQ?dW;i%DG9fbY zhgM<%bDv>!i11dU{Z;P`eyo_+=p#o{7Ni=)G<;Nc*R~^=%c{2nd-)xRj%A%*>}yst z-8Om6QrPNJyWe`tPtj8eGF#(1Ihk-INYV!?6LQO1=Fw3`gIaU4+P|@kc&1i^$n8!S z^?vd5N(ZWdy9I8cFgZWV|4*c$8885Nj`O3zjVJx1Mx<&j$8@9^uL4Hs0}OW|rbTp#Ne2X1H*8)-Sr`sRm% ztkVpAo{v8K@?2TlAX~;@qT~#0-On%#T zlooi8!!-8J$B3T(z4XkWeiv#Hh6^Ov%V;DuoiQuIngB6tVPQKoIQ1!w<4;GED z-c}2tQ)%+Rrq}05SMoZ$dW~?T05r*X0<)ACPwGY*!F6Oj{2v%d+gbj4M^AEgN5Av1 zW6zhJZ?vclSnboUcnnK#BS0!af(4EEjXZ!XucNFBsKi+>;02cf^$-#wIX}t#GH?~h z^IC*fqQM2p7DXP>t9~(&KIKd=L`Wa|8qD}l|G>(kbdLWf%}p-vkZUHdW1kC#dPY24 zhFbV5Jw^$Zm#dW$P7O=wUSShLb$Du`#MC6N=>)|QjMT|#^N@pl*=KaLo`1g+qJAp> z(&n+jQxMt}_BF)Z;c-uEHQZ4@6GV@;3*fb)uJ<@y>j~oYs&#M+mB8J`dTb!G-^UGH 
zwooIBSVUCD1PHza0p^ifq9DUKyGWldPLt5TEz%3l8OK>70j&5!?6$2_OU%6NK-fzSFFo?){L>^O#~ zNVYFrc^G>6IMpfJqs0~|NR)io+?&rkECQx7P+5Ky4G*h>$(lrirBNG@1Q?!A`Ia!h3s?M`24Z-;+;;$oErC>4FrH zlU#zWfDzv=`JrR`-;T-K$mz6RMW5<3#r1_X|Lhz}A`%wh3$S&Jrlk~LgeoQrMvW@T zS2^^&V1%tRFMB>?bhh59cnYH6r6q~Gk0gl!@&j~!hOIYj!so^!qlSJYxxMcIm8 zM?oSr@^VMFhLuFRM9GoPd&h(--TecPT&)#EY-F98V2Cg$B34+f{cP^ql= zjfAP?vbY7TN&^xbVY`)21|Mvo)kbx6bFRgo3e9;uumxWrOdaZpnE6@jJQlAZ@m$$YHSS1&B!i3UunPPDSjJ($K z%X>Sxzsw9~8V$v#`qLorg8@!7$c{3Ff^4iC;In46Fv))>OHU_$c5J4^F#K?ChEF$` z7dO2AoJBuuegN3c(hPQfyS=NHilVMBhV<_*v?O`)<_`3#SC_f>En4&oh?SJgd28r# zjx7T^9M0C@AQX8~pRe3_rg6MX8>Ew4<9B>&e^sW0KQ15#{0m)9h!}C7xu4y!M1Lc= z4s~1^gWinlmAq*z0ScowU;kEWlH@!TcpM{X^=+&OX?4*?fAGolPLm*uViK3(@}O#t z3>N;MnFHTe6iGHfC)$2SmrV*3py)w!U%@|sckHZhtHQ0V8jtC505D_nXHFLnIq9!n zOF~ju7wKaoV&J76WrnLHlv1-K^b@Nl=nhyO&^bqh4FBx)H-{=$DJktC!crDk$m8`? zz$QFB^{=G$DCJ-RN_5J!2#n|-udu;R*$=wwP{MAf2aooHkN3aF|MLkR>LNOLh7wK7 zX8DWehNtt$3L_ z3WqOp9R+p;iGaCLG7mpV0--%wD(dCk7x|f@&g)ggF^80+MS*W6X(yK*#Ao7nkx_T$ zmQwyKn%|xe+K%e+jWk8UFk$lzp3mL@PD43}7nytrl)M@{sOP5Md6G>WhYVxdL8m6y zWq()uWiP)hmK0Nf1+gW$NLvrefv4=7*1C}9F6VHO(>b^_ZzPpP-4|w;UpJ5OeK$Xu zKyxuqAp?K-3G!dbH;58X>^nD96%=8AxI3(a_O883=2eIiB<73l0{>mw2Ua>HrH+@8 zIC3~|@Ci8uv4$jY@FU(RH0fqYIeXeZVBu4|HS$iI``K?45bq51})&|pt2$2%DO;oi{qocf!-Qh*o|{&_ZqR05;7aS zlGtd%Z^;mL)MyqJm$A(ql;1uY!y!Vd<_^d&T-O{d>hyMF(4);V!V=UL&m~kmALZ-; z4wbx+lc*mPoA!dt>wvlB3DPHdP(UKa(l+;-yxsd;(k|~qkzp$)3!~jxPKV-^v{XSr zoMB*@x?f5eP#4xfJwSs&i#M-*@Vma2=r_<#uuB==e8NV@>yDyQrOmxx4co{UluhUR zFBX7p0f$;PF;vBH{oG*rT>e=>*g>=38n2=5QK4+G6FEcrFV9CI3&2&i8jD->krsF? zKbgRi?RY#K)*w%R{_Ig?n5ir?9WR!%C$MibDB2G0&+$TOOfc@c7$fh4oJpm@$~{+q>a6&X9o!LVG1JXgDXowAlEullCkk{vFFxY;+-4WFv~X^+-P)+giy%(- zk);r6rr{=!or~70)e_**E7hx%w&tB4V#-undu(=bW15}eN5Y7w=yp1jI`Zw8hvfEo zxlgeJu6KIIuBDmc^L6I)nJTxHFWjJ5AR7)SWV|JSi$DGTeP4iSk-kOw$Y9rBF@NVS zN-;Eqai|0&FRP0i!8=p^L51R;==H*+IvvSSMZa#%m#RSre*NAx{6)P__53|rhXDr! 
z7M6#x#!eZRhM{hw>3pw8OLPrs-Q1nl(?3f3C~=U;zb)3HBc^xXxh&u9_VOL+!bL4r zkC@WL!G<609-XC9Y*pix^3|2l4e|txNa9j5+qnBskvk++v_G?^rMuPhBH3NZEA zU@gI!ExztPo*mAa30$5I6g!>qy*hVP5>HLZpT` zD%{hq-1t`HN-zmL1biHd3Q@ZP$Aj|F-b>Cv7LWAA8UT~3Lrb{hW03Mju2}l>;#|)mhWs1S? zb$Ev)H&L_|vyON`v{nSi_&_t3yhc9jXZt_L*9yR=98*4LExKUy$LTO7)8lCQ!KPe6 z*Z%GQ3_Kf^hMxQ+EQ=O+F>at`OH7QlJ7X-|O9Hi@J^okvc~I%U<1?ZFd-}u47TCVV zY)1KKT?F|&Qu^^*67Qm55?-s^92<|!Q8b7TGQ_V#sa{h34)T!m!*8A5PW){(n6B@$ z62{5>m<4IX{&H1XMbL>FgA7c5;T_6@qMkTWs{f4XsaW4Sms-Ghss3*kM6FU4DfNPh zU<0)>YtzQb`tRGCZ*(7Gq^HE9e?ywj+ir!!<@3&}rkJyN7ki z&@VWRZ-Jor^ou%Klm0v7$_;~ss36VEzwBoYp-TmLWP`Eq_h#!WpSFb_K(F47M$0rD zlM2}2d__kWab8bo!dm&&0(iPK0ki;pv5}?IB`>VYgCN*@=L+{@0ZPWv(~cysjtl+N z16CkI{CCz!B@#G!)}swDS6fdwf{;Cdv9xlmD)|d~;24|9vdDy3l%Naun2qy3@-fYH zo5RlKOAqY)CNbx2zyp*S;v+BUX1Fv8cfdo)mqurK|Bv(%S{yMQm==|W!PxWqgio|vX$oj4|N)R^4d#nFMUPAIv7Y;%4WF3+NsExVUV9#l7G4h}Dzg z*U&Ig4nCXMn}Iiw4raZJX){Jsg9rs}qhjEMchxaE4mjYR1V+CvF9~}T8ed5l59A-? zM*h1{=kcv$c3X)6*+t;h0?-)>5+B65e-wWpLCE^&A>qdVe=w=2h0*9L`=iOC{$WoL zKCl$p_eqh_qvIsPaxcKt-29MfPn9&x!Mg|gUPBFFb?yzaHuj4R=0PEdEg10*5?>?Z z|L_vnQuF0-Hjv!}knWYr??Tp?xy2sv6> z6qj0u+>;AOlLEWOi2{NAZkcf2+~^-j0(|ayseHg_z}L|tRnQ(nMoW91NkfbJO(jhh zIwggI_LB3=4T`%ctk;ec5~yW2t@!B_Kmryrd;4pB*9ab*Tc}Dj23tqfZ1g-P?s;j` zj0cw?t88Aset>j{L*qbjV~6e=bY^f6?K&+~<~3gShqC71X`(T?sj~bpT2UgmxhLC6 zm?4e*x3!s+)fb?&ax2rb>A4cdT0}(I{DAh=6{rcY{kXze?|XB_M@qX zW$2e?)UC4bd?@nMKMbMZXHwev2EGb~!6uM_^kdzr5z6$n%c(}PE!FZ?g+7+n*kPQ) zuR;|<{sw2H_M-(XQL7RtO!I(!%R?HCml}gnY#AgA!h6Ai;JOY^VrQ``UaJ4;a`b+j z?aOHV^e+dmlW(~2GrajptsJLcXz&SM$RL2oBEOd;kHa-WWe)VR1FiLzpg)d~z(C70yfG9ai9y+qquSxlUVF9}4h&1kQzgR?+w#?STo zSj?f(#EN#)HTm+0alxu@)WZC4P6d8EToZ4&d8I0Lm<>waKK~byqllWMGCk0A(7C1M zqiJrGd z%k<+(Kc5v4Q>Ej*X#qsD;z9sOg7Eu)A(;3O&|9qPG+dq#@#j*ZQ_JPrGK6$odMvcw zEhsdUWDoo;BZkqAy~|_$2u@XcyeN0YzS@t9kiR||eD+j$rofSpRDAobN}FrMyw8TX zf=T!#7E4mC^R~z?)DV^tj%AT6;mR-DP4lZ*x?LWQid(pnB51=>s7!IdbIm74O;XrM zAVlL6mXgc&JB=x3Gl|yh@n^O9D&wGB85wknz^41v=?-8ZkwxrGN;HKitrar1 
zx6vm!{HZ?JWc+-=uYXI&M#DzM?)EaDitdE?_Mq53&^4u3pgFb+k9(335JyShap+bj zhM8LOtk-hU_V* zN}p(TSazAncC=mtJdBQ3p)dC5-_Ps-%hr{Gk;kJ}I;}4wPheX1a(eS$iz#38r7)gx z*ms~}xr8Rzea3BXZm;Gq5StiZeSp76{=dLqfWlVEdG4Et>kjs=3v-L#oKA|MYX#s< zBE#Rk7oS_xQHRwcK1r84|L!bQjT?V1kQmRT+7=S@{;6_ggI2+_;1o-sx%6pNz%N4D z39}^QGwJUOJYr{3A*%t`!4fBm0zzr4AHshton_L1eAzK9jX5{|!*ErwPYe)KyrHCw zZAxqt$$sY0RIq|Uhn>m0?Jq*xM}Qrv^QW`U&q7dzD8d8?#N^2m3T=d~KI@JdCM`j} zxlasPHj7T3x%2hj3O#84wk=@81M^FixL&2p_79u)NtLeHJQY$-;a1u9skR(Tk#=&C zv9_`F2p$On`T!w6@0~w`0VVlKw~aUCCuCco#&SyD8nJ{6YY}(pQ3bE+TFO==<3J@% zbRLjI!~Yi~(H$mSJq3jK-B@6BOD=8YpC!6AhunvZWy{69(Cda2o$E~9(*~MOE~*)H zXtr0H^WvB-i2bE-0>Y57;h*evrJXKLotkF;6^48;{`^nU+W|xb^h8|Rio+T!Vwvk7 zvR=xeXVWU5R0G&tKZfiib?5pIKQR#%^`vuar@erT9sj(yvrdpp>bMl4ND?=2;D4Ak zYTAXypPc}(yk3<$7SBUtC`WGUD+>Xe-s}8%w&xi%dxo|4zb#@(MZ_mw8q^s{?9DVn zv(dxL^Bc!r7{u!3XxW{Wlv)-rZHt9uVBN!N>q0DT&Zq}d1js0aePBkHIiWGR%{;@w zsuqX3=_5`;iv^|uo31m5>Qp1*klpEDd5fo9WV;&WT`4mm{9L@sg zC{tP295*YakgkAPK=<#4h>#zWN_n1i;f{VC2~EOgR)Xz>rcAWNP$~dyOW+W2iGDS7 zfPWyFIU{7Rxa~SBshY;g(YcgTb$&1(Kao*Jz-FqHF7n2qcsll)d}e}E{~Ol%L1k&X z>{;V*+l1~4>fN1w`Ojh#)CTsk!Q0(rd<^AnsTcpd^VrZxVr6${C%W8KsH{QrxkJrtRrF(K<%`1?w)d8Cd%9r=fiN0AC8o-S?-q~&QLx)@f zTo#yiKK8kz8gA%R+9I7e!8CHKdJc~J{ial2L0Im({1$8TI2Z?!EkQbGh6Ojh(V3?W zW|7X8tG;Ar5vZDnHGtY01J?Hc2en1-_f1$@ znRGnwIk|MRmAY*?8Kc)dQ zM&TI+O5jJST>c=PpVq;n3MB?Pg6*5x(Mu_JTO2Th29G$D-G|hf_$CAl@X>aJ8;BcF zOKAQI3=pq$83#|}3q z6|0bQfHi==_g$-0y5OKD+_eYJ!1l(RIfj2MU*(oXYJnh5%znTY6WLuTUo4WquFb1c zY0ZGPL-(A|{FTD*w16Dw+Y=iA{<>s&^ev0x@jn#658%JXX$!1s0n5RlM>2c#9T7wT z=)fH^CX~n`XMeVKEj);lKJ>kw{{4gT*=IMUu-^*9#!J>{0K=0El`OtNA>Q; zCy@_3R&9@_!8=S9H1T3pZx)+ta146!Dw%cYm<2Ev=UksDJ@+2L+hcJaAV2kuiRt7) z&5+|cutCrsFDF;NPN#8M^KxZ&9L}Z0mj)8KV!-tz@OkrpS7|4J<8rXFm@ClKbiA!W zs0uR&U|?wny3-^IRDhN)*`#mK!DO142kqrFLBqIyA@zgFT8CnAek)$ZWY%R(9Z^gI zZE6xhYRFE=c&rAAK=x230V0I!M2AZxXOfhACGxzLm@vZi z=YafosBK3eBR4lf)c?zf+dTs&2C*(cH>CBY3n#%Pn6ns6qsgUGK|ok`lr=0!s`Cyi z4PZEU#E6uf8hgDK#u<`#hENU;1XB}MX&a=*et}Yi5B|>K{A^ 
zykfu{tei-+Qf+55;VgTM_YewwXo(U*0xu>Gu+*|N7?}Obv-#I1)E}JwfK3*WTM_-+ zKLe%r=ow=sU##X+D^WsMtU%0~zu!3F8l@ zf1V)!Z$J!IEoWld(Z%VH&yorYP}=l1!Kg`2?xjmKV?wn}A+X+P)~_Rij#oYjbUkr= zZyowQM%BbT99kgbt3Ao_-0lxDmxF4kBWd;G(bS~l`^t7DC&lhHoJGi|3EBph|G+H& zfhHNEBkS(_(uMWMD;#27_{CDGSqaT_$yd7gy05BjJUDz}CK?3_)s|?F-LQ;8R@f1y z_r_RiVyQ+H`ju{JMzPzQqS=1(FJN~OZEK$#P1fpvUruI2W28MN6^Kct=)|sf=PFb0 z_XxTtu1fizt3?h=_Qm^aUjW0*e9`-fq%{L6fB0F64P@UYJAi z-CJ&eaCbrizCQp!p`|1eA9pD3jZ(R10De%jnL_>b`1bQu)7}8=JMtvF8ke7c#^{GE)#oWco0=)W$xU5G;S1TF{tVek zpEB=;>;lNDiX-q#{1z0Xk7F5(|+G6jv;M@`bgn{L<5p@sVV~iIPT! z!Y=?Q*kKT?AMjUtiTF~U13&ZhBN?v7R*D@2j~{>tE`7A2SnRW{1i))^SWRufR6n+9 z|3zxo{o$w_7qq_Ktc>foAbaWK2=+-J9%+SK4t|P}F2t4Jye>)rrh=75k@g8@j4Ki0Z2xH@X7P$dI6ORoQ#l#((zCZ`C?VX2k#3n7lRGz8-o=C0 z&4>8Uj;7;uPg`Lb)zMF3x2C#;Xo{y5JYvyF)Xf2_{3OGgL%ew`m{2W*up@k&Q|A?i z;`4mKyp&Im6ql?6OlK0lS88~ef6U3oaqDWoV#Ekd9)6a9J3X(i}@u1UbIy^9%y}% zULNWL&`AGX7#f7ZVtv;8NRY3dIo%z(k=3BGVaoWZ%ASAR75>*?3atnF?*3kuwii|O zR?Gra=?m>FA35$$^6OpG@K>UBrR;_cp8#elO*FzGbNBKnzm1wCL@mKr`=^ueLTW|D zh^+-0RJ8Osu=20KcoVNh#!D7j{!>tEWQ9sV zw?jt|-x&}m>s%koS|J+_%~=S$Apss(>T`|6xj6uGMZZ_aY(pT&Ip|pr5KI$xZpHN^ znQDGa$+SP$7cy0xV|U#iF;*bL z2zoqT-*z-NAPIa4(E461j@VY_;6n^rOH4ZJu?D37jP{maC{j5z1w{P`*Nhr7`TQ%s z0Z2*s6S~bE7QIvzZ7rnmoy36O%HacZ8zU0Q(kV7NOH$GAzIlOS(_)0kBHWt&jW94cFK~Y#SVzHi-EKr@3j~ zo~aeap6Vwtbg@+89x!~5Xbec#9hc;VTDA%R7Lf6uYv{lKz$2w1g*#JC@F*&5HGlV0 zY-ElfKnzuU%c+;(5|X$*nmwGz=J>kO0*e`-76K@CC2A5JK6Z`%KvUv0Bij)5UUpcG z2qq%-U+Mo57zi1PM`;?g!S8i8$j8H4yHdb4YS!I!d`3o5Xq5GGH2V!%yL>g2e0xe+iYJo?vF2bX*D?931UCkj_IAdd-n56?Tjl31+w1>AANCOe z=tF>WYKKja=0;0cG8JOMG`m9k9se}OR!)0TVY3ATiN86qB!f}sdxNyV+{j^8&k5su zmQLXh2oI$ePx&rhDezs+kL`@i12@CgCuq!7qSCQmmf)U2d_uqbU4zWZ-Gx4FJ}~9t zjpFgwvH6)nIZRlWr_5RI-GaYTz0=0Gbq*b}5VFLHH{9PT7kB9*Ui2l|(rl2B(*3Of z9tI7lRY5td0s26PX|>sRwTFgO@(iDKkf6R_w~F_aOq~o6VB?f+1d@DGAjV=p$B$fD z^(G$)<{ULJvyuhBAlClrG-r+y8f32H!RUTc6E3$}aW>F`-IaG}{Zf+`_!c#68;4Hf^7 zs&vKFk6s}OEIJlzq65?Ru{>v@0)U{SU>D_uymsHsm9^Ca^q2gmO}(EEJZYk?;W{n1 
zjJW$J44qL#86s|bueK^pzvW!x|1nhwo;&WpS04)CeHs^urI-6Ny0b9xgR@R^C6lW9 z%l%TfO2&jQBXi!?1Jw0^i-x=t{(EFbe7{}fF{V=`{;b158v+qdSKi-;Bt`%z4g8{y zhqv>T&>&3Q$&0#8$tYOJ2@*JcZ|+VxKTj2aDkJ|$IrZzfS{}@c>tXp(On~}&x`uQT z1Z--IRwB{pTaLy>;a61M{wX68cVd>Ax!U$4XKy|mooV6t%)j~%MYNCNUldVj7fvN^ z`k_|j3b*)T@^HAcaFIlb6LwG)Vs5ns9m$=5PjW-2U+0l7(tP)gc)(#0)~903OcZp@ z()tS^brF*jedEv-9058I1?NP*z`EH|!=}KYbv~Ck;o=x^F?3LH()G+W_^4=-p-Ft> zq5Z)1(Rm9{HQ%dgBSzHT8ixi!;qoNiVBD-hL$QIa`H|c{{nK|g%-8=g)Unfz_=*zh%dl!>~bdF$JSc6`4?n$U5>5B z6A)Iv+Bn>wrc6(JY*|^` z6PA)^ce+#-mk7@)+i`O1TmQ2Gv_Ij?UMpC+=kuww6kD+JOj_#j@W`BaKo&Eq$lP9!8t_yfYA{E(KE0$qwWo6_EG6zg#z4m_ONXOllN8@%i0sf9mLd z@Y8$0PpFcwazkh>_u1-@_defm;|ZR{i=^RiCRitPs%!68)Y3d=hupdx%b>^UE9aFH z?dpQpOC%MhLnlrh3QUSIe!eH+Qn3gQ)nVylck__GUzaC9t8I$64-(1~k6jr?2$N{* zg9&;wnrQEs{Whrl{Vba23CV9A3pWH>EVvacg_{5S%5`VAD2<}eDdl>%F=I|CnGfFX z#QR~F`D2jF{LMyQPV&+oofI&@p2FN5?wYXkp=k=TxQuVxA32zCG8l(-(4zX`zWeo* zV^3Ki7Swc_dBR$KMO9S#>e zFSSce+FH{}Y*t>27Uu>b2tH(=`gir2Tj{=ZQ0U}Tz@%DWj2UHo6qO%yH>H0zO35907J!H_^Kb6icc1@NMDB2ri!=)~ zEqZ>~u@W1%?$A1K` zrbI_}dO65fa1u!jJc2v_fW;zoQ*-?T#p5zk?#hza)H-+d#nePx_ufy;e+07hi+&k!=))Qhuq`YNmnyFZSL7D(bge8y*}&#UYdsi9rcLQb0OI2@w!c z5J{z_LApUg>F!WKP`bM$2auL-hVEv__a6NFf1dMv&pGe;)_T{wzHhCwX5kt}X6E<1 zWAAHU``Y_HIeMg0_%tu zv#hEeZ^O)yViUHOq|MAwVzwF{g7*Nzt#zyF=`F6NCvk9=ozwZi@2ixIn^uZ?SUP84 z4!$<|^GTs)E3|JtqFJxmX=T79i*sMSPh_lYOT1EY7eg;;Nh_k1t%I8yCCU-`0gD}r z*?Dd*3iiR2@p#QOxvyYd;ZZL7V=yE4Tnx8xd%s&7W!^p#iC%)WaPKydJiOfP2(G~# z?wlT_>7p<(yjgOMi`rqad)g)~lHqmZsp(^Poen$AGOe0%j&yIb-JbHDeLXKX$`>3p z+?n^XUgdWzd*a8&Wm`+EmQ)_$$qC<=dCkVI%6GdzN3kW#u3x=mqxV;0)PYEZwV_d` z6rK!;m}Lf5AbOPuQ|IB6$A@I?c!5PE7M!(5Ej8c}9*=UX6E-HGjnerjgnog5mK6P; z?Os8g9>}qK*J&39=$DeTyf;_Lf}U8-b{H8)GG2R#3q5*WWJ)JPB5bxV!QJi_(=}an z)XJ6cp_9N8Bn5;8zJ(mwR@Mu+9_nnx1TP6RcdY9y^EQa&KetYWzKW}%Km5Sh^dzsvT= z7MPTV?tX+|8>_1NAhVHuLCnv`#ckd3+g79wn2gz7Q8D94FOfSW*Adcj)9S9r+Y4? 
zC}PXwqC27-b*L7-VQXV1g=~ihqf|7+8@1;(W7>KyKU7XfC(p%3N?6vr=Aej0sEbyP zu&H7t>A#Aa1zpL_kbrptbLmgde^ClkFFbs<+7{Dp$@ zYb{R2LXE0M56A_K!S1%j=&9`3W zvbmE8a2eY?g;C3h^h8$e{gKK;=h57^owQQH=8Br--NEB&ewF*EkTteLR^&Kv>N{lf% z#;m^9KdIYjI#B{=cyaUzg4lF5;Xd}HDdP6zBHWx<(UV8kPP$|o*E*B(t@%$zR3(8c zpqOsgP<_YcG=f(#3RY^W{N$NRX89}exkG%=k*f?7v9-H1ElDpmFmi*|NT?SnIq$EN zP>P9ojlas|k8#8*;|yMxy#h=IoxbobGdMR$Xcsh65upFLA4AmBhd|%;+P|!a6WW?f zn26yKIx;pxs!+`aN(tVHQc_3-ij(-^#F`-{>l7zk0r+srkCh zr)1IH@BlHqprM8R$hsu++Z5k5XqsM(nEbfIZm_xy*=kq!C~gYkNokgv zs?*32qxMJ5kXtQ!L%Q|07A0x89WAyJW|6(&kWu1cgWKdmPaC+w^c*HAp zVB7q$(r#53H!As9z60mb88WZjzIrFn5w~4s(yUES zp+}yhIqRsPdiXhZW+An)Hnz>%rl=$u0@})NPlGAsMc^*LxMd50T}&g3e)@Z!J&?vi z-9c??Wb&@s?fRGqKCbN^yW>1y_c?uma|rp02N5gi9Q`CrBh{Gz1y-?m`tBAn8l{w}w~S7IqiDial|2UNO#Gh5^6JvFX1&#Wb=g>u92 zTxBN7qAviNjbYtELJqyvi2{ecd-jgw{DqWFu9$msrC-_Djnmar9g3HG^A#m4J$|ma zRyM6FjMpuD>E0vU9Lak_LsmL@MitR!^8JqJwlem25ktMYE?1kxaX(U~5~tIRTwSl> zyGH_j^8AOv4Zy!>6lk4U*+}s0r>SkX$#`T7Y^ob$^$vHu*3ee6vtR8Y^B5kZ6tKQ0 zt@A(;TOju32k)2ylocl6(;}{f&Mqx91U(JTDsJJViz>l;ARPt=oi#d?DG0%Cy&PJI zcJi@9oN$=gbK~TC@yFfF+H5Txq45VSla74I(|kerkuQ!|0ST`xB?P@Xh@B;Z;{|X4 zj5KPHpJ4(KMK~M2j#AZ3D&`KhfVc1dU!4;)dv#~I8D|gxDj}?g$uc2BwK@yJ!Fp`_ZjoYuMgO=bQV+=!?__IN|3}alea>nU28_EVthwv7C{vML1QC1u|Wt)mAaP_Ush^ zZ&^Ih^>?wtYRO$%eUoHdc6lnMr?uNtrw0fI$IMW_12On87O4@f5X%^Yqz9)Zi6hu?v1%Y`=f9*x;$X%msw&M8^b1Z@Zkhs-*ML z8yJKQ|J4RIHM)eDbUw(=dQ~&F4&&9DqOKHt;I{b#t7~}frsLkadI2-d^P_<{qmI=% z*zB(yJVGS*vL4`;04R%D3;RtVnPO&7+Z+x!im;ZvN9b%FCfwE*=V91hfme zCICMXYKv9|iJVDsI~O22qEF?PqvzZ(*jC6zecYcwoXv7n)b(ac?t7Nx`IzK%VZ@Ph z85oSD?(1r7?EL%$a1Fk5kUWoBBwNj4hsWF0^2hB;B$nXZU%k7tj=ER|n08l1HI*{n zjGTbO`6Bm14i4^JqNO3cS+&_jsu|63m`_>eR`0gh&Sf$%*2(Lwc(As&()QHsWnWi4 z@n~xgC+|9x$*Fe$W)vp|4mV&2FhhKod+Xqe_9HtZjx)P4~nt}tMIn1X-DVHep zh3O5_5a?}&d-Vbj@u>nll>Jd?r)QJC2^*M7E%2*kEI$Mj!EeTN}yzOdjm2H?Y;uZtA~zL^HL`ZW{y6>M2B@Bj8PaKtS00YkGJ0!q9dD`h#U%L&Xr zCF6KsM)FBN1Z~4v$iys7O%A1S1*;LSXlZf7?A$Sz!0>f|w=sVs3UKFXEc`i@|A3L>btHy#;$ipZ&u5{paTGzxB01 
zc?bjF%DC9@Ki?XkEqPCL8^o_FM6_$i8?~e-zcQ8C?3lJ!z>a4)%v4B`KpJ`ChQ5VH z%;1ZhPy=-u>Jx0yz3Q7{x*S1qYp%4kUyxmLi0H%sI6Sui zuO3TyX1X1F<^}LM)bFrr!GWwTmI%72)>xj&(7sMyMel2r8k@x@Z-=jfxHTJh0<6|- zn8?RmQ3xnoo)qg8*bp3cIbPEy7Z7cy#S}D(b!HNyIq^G@O=S@{CzX!0Z?7`mr*;9AeE{)&T(z6uwF^Em~GDg zt|EAlY?ARZQz>i3LcxC2?Nnam^ac1|ddCe;xt!|h$RUgDEYj<=l?x}MD}(JHpv}@d zBrLiEtM*t2X=AQ@LL_C91*#fKO2L8#-J<2k>y_Gm3k{?J3+xa>QjvlnyTgL5K6#NQ z7y8E%-MKk_G$*ay;Vye`)tykk-mnCU2((g;6>e(2D1y}~H#Vf@CWHfZN(;}VllJMm z2Sx+e3hr%Z2HDXzVPoI>>|e1sy_}o1J*=M;0gxH%?Lgd{;$FTYQE_aH+{sv6pQ)M- z7_X9VH>W)m*T=>`WSwh>f~DTeNr4YO#7}LS!!!3+L-Y8!!Xulk4B6o!H z0a0M!A7lu2{X)D|;QRG4zLNX9l+V&huWHGh6Y#e$I@X~GN=-y5x;rcC#xl4Ad1o$z z#%(cVTk(G^;V%h$4`k_tgUqc5?}+kkuGR)Fz@so`^IlAFLqZ*Q7sY}p^i)%6JdjeB z5hz^`B!}+DU3xJ~+VX-LH5IY#I(H~Cr*gfmH>H~gSn(yvfp_Bghs^3Ht-{+)QSjkk zNo(!7coDNlSItcxABPa)U|!=A%~*A@X7WJcIMn&f_72KDnfQ_JRa6$Pc#X?V^XFsy zMkx2p(LykfbiTVg_U2Agg82*$%gd*9uP%j7S7DBW6FK;dTdo?=ZZ2?C&@*zE6&15y zQX6><+%BE;Gd`Ner7`E-9!U<}HX@b$ZR<{{fycD_XrsII2UZcdA#*4N5J~4J`T}8$ zIS+PIHmQkh>xh)>MHA2sJYQLJ%>Q8Jg-Hcs9KN`x_Elx8c%Ga_-5A)mBS~(dRSq z6voDI84t1V@0o-AMFBW4ZB9?S@2hO@S6OloWc1Y=v@Jxevm74CYxmz$T?fCcyGFeP zi|+j*L5#4ogYgy^u`LR0eb++0pZB<^Adzlx)*AaENVsK;T0~=7b`L2=yaZ(N9uYgL z8y!E0${Uci1W}ApEGqx-Zl{)nZ@Hag>DFTz9OrET*E?iawp;An*x&5-J z$XLJBbpI8JZoe*_VVG3*XX!2YkRMH(Z7QBjDSI z0Wj;q$A9oof4ryw2wP;^j{m{eph;w-QI>iG;3$qvmJ~gZER02ST~|U&vw9+H#Dc5n zytZBCg5x!l)RL~#8tFP6ngM9SV?0C+L`_8>$8APUbbTmoy03Iu%!>zcp+cMwe_aW+ zenSI%O7zQcmdz~9+L|-!A2IlDaYVo)^Knc_&aO3}9X>UfRfp7XP|@J5osutIU}6^_ z|L@!?2+@qB0RFiB8-IlG(Wn?DJqi}IT~~|YxBecm*cw8k3-Xcty?lVxEoMi@=kP*1 z;9I|_o5;qd=|@S^-3DUDLogsS}!BknH=0=AJDQ0zsi z1>WPcXEtT%G_4IJHG4(5I5EtrmFuV^Ve6+}iw4 zoT1kH&1mwEWn7cB{MCBRv0Qnct1Dl=Jp9{A_W)*XLC;wG-JfxHIgEYd%?CDgMorZ{ zgAQLy6I~XyER`D_AtnKb;awM)TjWWW%Y9w^j+s2kc|T^&5zQOC>(3x;_co%~Oi?g- z8qF@I*Okcf7x8Mo;oP`*q`YI24?dZ2+>oAa4$vM<-X!k-{*q~b&kpay zY<`t--5D5*dkOs3q16XucLyiTWme0S(8zt<>I}sEx<4>xQ|pI= zXgKc?^PNmxDQIa`u9fUl?A0vDeG@jpRuS6)MrA7$U$Y5A1<%#lh;m-P)Z0gTgG#NhN{Lm`C))! 
z{+G8nQZA-vmvBQR*ZKHO$2(U1FST}wsN2;D9<{#bTR6d;BgKI~QRg6nNY%8sA^RNn zLc@;t!t{!b^qp>Pm=|N%rgtT=$sXKdX*t_}Z#eo*TTr~}UFJ)ztt$Pw0?u=48gEz_ zXP2NNKY)z_EQ`^fSp#rGMb1yGMi1OlL|CP40-)F7znU9A-nbH~9ThwCkI0EU{H~M> z&4Qa&AuYqD{Xua zHQq9D%yse{;X9B$cL#cEWBjX)@1t$imiNhAgtE?gVQ08!kciy!1K-*37HYQ>y6q`w-5q9&+#tene3Px8rE8ZN-@KBLqSdn3$H#MIg{eLrD zuv?yy+$Y3848gdj=ClLoAG@!Zugt*KS#{bX3`!l63WJlOJ_)N(h*&bRW1F&pB88CXMUvq*FIbe?F~K8jE-!0gtGM&GdoQkL+{oD;t+ zjXm%F`;_iCer6%Vj`kx;@Y<)U^x^{)HSENcUS{8^JzkmPc)O}mG8>8gGurzHk)7`J zo$`~c!nvIQc)LmIIDLvlz`dJ^k$bo~1X0xG%rBW*9RCj6*3eKJ^-oZHJp_1w11Xzd z!GJ*X5Qeb+18NuGy8&Xi{Wrn)J5SHRxAQwYqZUfH@AjMCbpvPTx)>>|S$K^HeU+bd z;c)1f@DP$9*K)Ps{O)(V_R{%-l*g0j3nID*Cbsr?zQVK|j1<|9f)XR8+{iV4TW*KI z2r3~*X>CqV+V$-lDDB`W`Jao8UBc&2CtepyP`Mg=J8hmF_of*s+k*|9`kgGHq;wc+ zcCf#e7B*a2{nfqQ%RAS9-l$_Q$Yh9`EoKm$IN9pr1HYFI^4k5FwWSeEn`tT!9kOEr zgjed9w&%AaEKe+25cw@sOknd&} z#IRP>>C!;850_cb!wCng0H-306s`yDE4E9$zpA*SOgeh0BN*#c4wus7bbrSF+Ky3a zPVrJ%FRkpsaXhKk*Rr%uk>r7n3&e@x76UK!D%O=OxT<&c^TBN`mGO#RD^XX&o)PDm zRS%jvHl?ylNpagndZ?@4R5MFfODA0{o5}Qt=2F!o(TT=Q6=e( zN3MawuC`<7fgs?nCm~N=krqo;&ZPJxkM)F5Wl#Ce)%cLzG{~V@dGpty+?)?*_eT_! 
zay94*7g*Rm+4vn*5Ez}lLbjtSu`w(*XZr^NKa~VnI9xgQx9s)LCz!1clhs@sSUBDu zjQ`}2d+>I&t;&3H%H^y2(bGg%i;pISoFaaXRZhc397~(xI<;FkZNm$R8$}dv^eFa+ zjZ!J&xm|nwdP~$D*-&2lu`)40-_9Pf|^c-t#ZzzniB?rjcK?_xT2XxVq#jYw5Y?fknUI z`FxyfdkQC2NbRIe=E+%>mCmk)c1rBf_nKO*vqH=HrSvvm!pb&JeBqk{>%wTh`?brz zP~U1J{HpkabS3Ce{oiHKH!(u*TQ0AaPDUzgI=@uPeoaRXgLp|OT#96qFpsfbX~3p1 zg-o$-iBe*cd#$p4n*3&@rMHyUi$LmKXNfp5PgthqHMeDYgsg;EZ$CTanAF>hQb5HM^pBFS%NHUND2(%L@b0bvu`ZPVK=YtS zl6)@D^KSGn#)&XPe4#te$NRS5;g?u)YWMH0JZgM*JL-Jzmf^4O)|(2egK(F_Tz|@H zpDfK%In$~+?%@Ve?5OG>4_^?LJm=dgOOX{iO%dEaWH!`yOAj*dKsM<1{J^s8?`c6W zrk@WO%73O7PKqOjNd%3(6b5ZGsNzsXyIiU)FS2jd3gfR%XqS|h?RJqO_Vr#yRc=&I zj!Ek2lD4=BH~J=cw;petE9R81OORD|)#M*MYV&U!DK*ie2^Wnb_gIWmoX`gLlBmipon@j0k#;o?=s_n4JQ&Q8 zQ*SD|-2zo0sH@dqcMm`&U=A993E7IgRHNL*tgrss^?e6KbIS+kI#Q8HXsgm4=GIqM z?*j|RYP=2kp)lhS$p8LPHD30UbSz!7LW}v?9qK73)b`x=$6?zFASKzyw+SX0s=I$S z!>b4-x5$Cb`0zrjsPQbIZO|DI&ZGhgQ*7AWT|o-B4STyZxO=EW$80bsp$)2h*?-cy zIKS8fsoNfj2$7^>d|CR9Uh&K>AEH(f{`28BJIf>nvV7~v2-zuMS{kh1i_}Jj_jQ_M zTfLf_vDJ@%r(7yJYJt#IZcR)%XnSY$Wtn&9V~b9k{laCfhP-Kk_Qz3b^ppZNmL3L+ z+8TCeeA~N<@*|FmV>@i&4R|8Ymx;Q)=l#x&WHgHd7eOw3Y}x+TgF%_vGiZDpeNoli zS8^nFnEqsR32I6lkF(H+E!aEY)|_7(iOBJ$CqA-2%vDzpFSh{tIj3SDyR!K461SP3 z_Y>!7piDz7rJsz9G&z!$t}k3fKgLpcQPJqP)t3%g4{` zSew`Ey2fo05w7Kpoo6jm2|G{YlyY58e_VU9ZT7j-;dss3a;eERwA?B&gj=tBflNP< z4oCLzF`e?)O{D9=cx(1i0< zGzE_Y*b)YwkZ0`4jHa6a)HZGUPlp}|*BM?H+a4>o(p~PKiNNdT0fqk0#Z=}lSmJ3? 
z@dWjeAeG?g^W*1JoTkW4ia8ppO}fwc8_PkK#UV?|L0`1zZNoxIJdhn~grlr=%F@AE z*7=>7Tn?-O5kV4tfV0&}DF1_FyQ$0GLwftCK*kcIxm${h-SMCqW?-jI9-BI}dEfHh zlh=7>2LPiZVTCxXkt|dX-i814W8k)&BiG}uS6QE7$63UF#Hy1lX5ZW(`Pyj+vYBf} zVa%#EPCWH+fp%!=rz_?paZ|;AfP=>n+kI48XMd7R`(YHsjJh=5V z90wjvFAn0+)1L)BuUx9%G)RQZd?IE!Gt~3%;z`CYbRx(dv0iDlcKI^5OhtB)s@<;pR1SoqLo^(f%xXZZ14onZxGp;l1L%tTe z3PCsqQfXT=z896J?jO{nuJ>cB(ds>SW6^Fo-G0Owc4o$*v&c?9IW4fp41216osNk3 zlonofUeggL&@xY2(rhqVl7Mlea4za@A}x>-jz&&rnDE7eQbQ2){j9b=NU&`U2@5RY zUWvtUm!z~(lXiJrj&JVjY^tPW-#I299T8}!X*SeQEmxgEu9jC2UkS|x0gsh{U3iY}8Z+o!l~8hQRh{(uaiA8pb&whN zZ8E=PSj)veXn#uHMvbliMzja504vlN(bc=MpYQ3hHGF?wSVGk;;^mkA+?d*seeGRE zZj+-&8n24FnT<~KR6Zh|frydQ1kU1Y$fl7G^W4xA+|Fa>I!$oiAC^RCbw?)y;xv9s z+JrB%`Yt+fNEFb?gXGPxX~(9WOx*YdDny0T;bvy#ky&Q6oC@N-*iG>ioR-I9Qz_H3 zpTNdbzl!GeK(Yz&cU5#CjdSRhD}1YpVGA7!k6CpId8mXART9(4Vlbu*b>I7-p95oK zCZzRonDM3N>(@5IMr%ZhlV<=cRm@3+gU3ZZ?9ZC>`j?5fMuuEPAOs|s>_$DL1s_4{ zTz7kX^iabGtKx&5*eBMUO!xY^+K(j_2(0@vbF1z!DKXzTY5wd(X{a^Asn`Ayf6a0o z@`**YgaaD!orIZzid2|8n4D8BkIgvUq%>@zxn7`4ly5i6=NcKk^YKAx@a?t_9Y9r_ ztN!Bv^%!^cJNfY~hJY9;vue~=uXq|DB24HAwNelRxq2r$q0vuw7*_bavPfMA8j#4b64~$4FY5 zjK+OcPv(5ogQW~@oj{D@RyJgxspIM!%@j8s!SVVS|1`0DV+Ym3qf7fSPK)lxKRb}; zty7dftq?0$63=rFKX)?r6VHVEr(=ayYg5zJEo)5VB8S~skb;gyH5m!AV1EokaKW`_ zdl5g`uHCC+Skn#^s9z^M*>H6=DSmSInw8$#!5&yJX&oyRiAc$Da( zjMBop`!~zecUOm1zYqGBylr;{j*MBo;*o~hiAOQ#FmkZU-fGnucS_`4hX!};moc2y zTzS}Tg7)@G1HMR}qbb}F`)(QL&|v-)aDi_83qDmc03Xlkqq zWWkZmBP^X6&@JvkklZ zLUGm@UVTCRe2{5UU9n>Eeq4-K|0$xpY#S3SV&ZVtYOeHC)hlT>~-(L(7@Pv;!5i9wnXFW z&$|6cjW_r>Uc^_*Mvmj`dWe<+(uY4iDUKLmEE zGG_1&mHW7w#m^I$d5f>ub|RcLVlLCg1eGxkI`hh3WltoMjb6aC?)I@7V`0?7aEw1& zz;YUxwsCcFb3uVzQJsbs&ra;aK&v^@953ucrbmMvlMhcU$2HxtRV!*Qk>}mO{P^qNdbphdA2^?^i*?apMkXIkd0G6U zE&Y{H;zZPi{k;;}aMH?uM`|Dx6F*3{BV)g;UQ)7AQhTmJ0%+w7HVM=uQqnG#5Xkr! 
ze+a^6d(wbbHFjCgk9>4fM+?szC`LxaEQ7CSVQ>DxWqXybxv3Gayel+CU}+)uc>`lX zXKEeHBK-wA}Ku02v~@4aR--P)|Lkl{Kc`PB7Uj6Rw(@iP-heW_NcRpS8Mt-{gBGpv{3oUyccCE`nL#H9 zObZsKS}juU5Pg*Klw+E>3JQyy0iL##xm%3t++y|-y6R~eiyD8}ZcGp~HR5y&*_O*P z>?XTguRxkS7D?PaAYl6U@*wnA7_Xp+mAs; zGX~WubfgR2$yU-Zh-Q2_{Ex7feiDa9l0-R)QGP7pXd`)?VPz3Cb8F0%!KfSefICnD zKAqcrM8Pkv*q@oUL#s>`z9;NLzL($!Yb3KKz^-Y6seY1tRyxDeHgbexShw@wNUVGr z@o=Uj&5r0T)WW?MavB6?Q}D^?<&o3NF&xMk3MkCF7tI`0 zM6+l?!Yt0GHR?6POA6i#I-zS%vhOVm72S>xQOlA6!Dg#+ks_;$&Mu3Vrz0)tg);{- z4E38$N!hvAx||Mwsh@)Kgf}ial%MOxeJ2HLqz-mR}idv_R{m}Rolnew0DbUW;e9lNeF$p&)=0QmG{^=lr*WG0Dw)Ujk9}iVLQj*s1$A zA&;nHKb&>5(Fc}_WfCqU2np?;DB*0MYJvIUKt3@UKlOke3T`sH*Wa{&I*HoZShxp( z1cob7;#^*v*jR9QQF0d4B^)agSC-rfD`T+d99%*p4M+rbDv-W~AC=YP)cu8Xx`hF3 ziA@gjKO*QVweb8)#Zp0(ipUKt!8pSvt>up@8JKJE(H#Hf3O3%(qlC}(G*WnUVhezn z?x#U(!jbHn7Qh?q`Cc;LigWpr^4P!X;r_$-^8M!*^SdezO@#o|xJ`2dpvJw6@|jqmDH6I^ox?!&D&J2g zRuBpVr|q|YW6O>`;2L`eUMs?3iTIaS8@$2T=mH%q3XBPm<>M9C5nklm|51!b06-@A zc|C2m*+a{Oj<6fbVJwjxs`4&?AZUP!CJoA^N0&Qi!%>@G{<4cdz5$0edQu^3U%9Uge_9f)Z_vUs@e^kAe%uu(IDC( zQJj3epRscIw}pE>xbaZSp{2scT2ok5t)SB~ai!CtuMZn}MxR3G45oj@BSF9l+ z-&gyG#>RD=6NVYE+ZUNrCZ~JN>I@9k;5_olz#?c7Ts>N1J;l~E6hl2yCF7x{%HamZ zVLBVcQT!c}g5Xww8+c$e|6*W^*>@j7XBPGr3blG&WH2315>;~L0B1MDfZXoPg9_+j zKRLFK=<|kKM-@?1N$0ui^Y*7Rz%8J?9wpZgWX6Jt$Tx|T&wD^tvrIaFj)uiyDu&3} zx<4HdV>geMS?7S62*wi}1>eqNqu(TWUHZ@<_fSL)5oHkLZ(trUYl>scF}1* zOK(sfDzlg^vmD|rLlX;HQ2W{KF?Vjb>ux3~DNZ=7|2FZOvgEisH>tg!^cQWu@@LfXS$gpN`APZp=&L9H3Ly@2R4Euju4>G61lwcn)g7TY?Hva1a! 
zOxbYfSYDRg|15d^q;)e>@>^lsJ>gc>sG=A5BlNFGN#H=F#38kTieM$qJ)t zBL%!U9V2%QXw(#k7lqBnXt+g){~prKN;D$PmWj3&L!v1kYtdXJ$ics|v}pbUb#rV& ze*uO*Wt+EOAq%6SMF37R{}GjWbEEZgSu@4~Hn;%~em>L9H_$+nT>-VDi0&JMmFVw* z4D`s`E-7CuVhjRI$s8FWU%STYK@UeT5ug=_uv!~`B98jS;W|ubH`phV5 z*&fN_`Ida;7g!xy&@S@~5w$u-%%^SD8@KQe<+>*-+!PHy*-WMh14A!Jo?^CqNe-}0 z@Ga4rnc)Gf_&Zc~-j+*UbWPanJcl0Ot?PW@oTNfvp;_}gd9hA$UEKCz$mfrlnv@n6 zOB=ICVC4^sQa-G8uqegVqC5}=33i7=kXw)SgAe=2kNMPwy6nwZ5}wG^IwU{ZVmet~ z)UP}Y0h<}$Lz zYS>@UpK#J@xSg3o)u?4gdM=F3_hVzZxAc)<=oC8N4tKFuhO>)VCgy-Ji^S)59vE^< z7SjVcBe>u5ky?s%fa7pb(sLnYdhbm`@w68`CPECs;6nFb`I|av$zY0QfQR0>)6`*m zAfMe3r2zlpLvEt~1wXhH)2LYBs6YI1))zuqh#DYLgwZY$?U&G(=E@C3qBpE1g0ql$ z3F3k;EF>)RS$})N0yzL6HZe4vx$JcpvM0VM@aj-wyj*49|FqXAMEu^ zUDUjnaaTR_2HX+)CV$`POKwnMT-fzwW1@O(+e~-u_@6WJ3gF0_BSx!*&Io5fFh3&j zKwfA3t33KYb=Hy>nG}9A=fjXcy4`;nDgZ7lK@meQjVvrN2wio_N{vRV1UCa_KM^xw zKFiVXO6&zHqRLVUB65DL=@e-3uL@UE$!?-`(#32r=@*Un82~Pfg~w8*7vk0@}XH z06t(2%_uPy09NvUHz9u%*}s`kbYGAQ2#*80<^6%7K%>o9R%FcxDz>Lki)gCz(o71# z_5-mMHV`cqbU{`FAU}Z(MK2$z6`)j`1J$^sqH{Ooc*Ov$(#IeW#=Qh$CSdLVf~g9_ zq8wUVy=gLY1FB}kZb>!H$8AgLNv4l)OAYA$St$REmjA3f02aMrn(2c>3j0D>^X2K?|cF0Zk?oK(xga zn25ApDMe37Q0e%4Z2|M2;8WhICP0`6GUsBJ(c)lxknnzgN{vbYvW_^@3dH)_M2aLn z-{F*Jnz#?1$K0W(XfOqX!pW1rXsj39!T!WBQ1E9XR*~OwwBdU02wJ|z5vPH!?T{Hz zzJvgB%QeyX>iv;+{`~g{0&v5h{uP%e_Apr*8ja-|oGWF$x(&Jjg}Os=o-}yFtZ8ut zJy}l5vjRaDs7daQRHA@w4rQHR^jq8_P{pwXU^AGPAE1E?n8x313aI#aL-ui|3oo}9 z9UcXRa>4^j|8#r-LmA~+Xwg^3C2M}hsyMSb&wIbeV;{LDFvKJAL2f<7kV2_7iIuml}+{#&aYiZGCw zzN}hhm!Hh?!}tfN9F;;h(FU_i7q|v$j|@KFiLbBsq8nxhalI&=(A^6Wx( zs{xySuG=i&PgyWb6>?h+oJTV>sw>7Pxxd#y(q-egi`&iBsa-5XRaBZ@-mcr$5x;IOu@H)#q zTqz0u{E7xIQVj!~DGG~5LXz)Mm$z!6V9YN6OQ`sQORAZ=VyQ2^-3a7lOTGA(Qp1el zPg)A{IKf|s1(B64(xlXSoEeTy(NM*3gQ5`Q#N;j{M+^upqYU6K$lr09gyk!VFNcK< znY$tQAcDjP#lry2iF^8dqTac!=KR?$tfTq6TW%C8B6Y~^1}0@Dy=-haS=Ymf-4GIZ z0BAyFcuU$FV7cCpU{5Zjpvz&5FyX-Ddq;7G7N7l_ZD|Y+T5ryp#lj^SQn6n2o4H-9 z-4aMbo8tV#()m>hx2dQm-e6^2G05N(0mCZ(x?2)kRO0oe;A_c#WJ|AR1mQ2R+0_g6 
z0;j`v@SelEkOBOoX)%BP=|!fa|fd8(qAoKG)b_gAaa`(7{;W09lM~ zGT6?kWP3kAC7g+MJ#{>1Cohdr%yW>W?Y)g=v_;8V$N-6u;kwD}gJEJJ{#;KvC98q- z1BQ+TbXj9{Ww=1My^MZUXyFz<+;Hnn${jI|ne*k$+!5VR7L)5$JDpf;ifT&p_1I^? zhUN2@9awBstX=`;OXfB3Gk~!e1Eo1=Ckn97gajZ;nHo6JaNr!&xJJWO&^tY}PFgG& zO^WeD>~1~;h^|w05kt}xc!!I43E;k;lAfe#(;#hiw7_q*_NXogZ5H0({k6+IbW+Lj zLu4YE^Mef{D zff4&>-U)5qpGZ{R0NsZ|FS<{#t^l|OM&yr|Ervpj&;{-k^I)`qp}NN;h4-S)(isA< zPcmL(16{I0&G5JS;&P2?+F5_(vxL+ocLy>QE-%IIgZtukLwRp(zA{pz9a8MggXmai z`iGYdHa*4nXiBf)b~hc1sC4hXV9@StJ48o_-HNjC1N#(&fDDRf=~Xu!xIKNH-3l}F zP9tFGum!mcx~X%n2)w$J1UtcA4+8sdAZN21fEY4PJ^yOgWU@=Z%fkzgx+lK4d?Zh1 z)bp%lo=qeoHy{N*HxzgtfV9!rH*BZ}Bb#utXEr zX^#hK7DoXq`HDQqd>%Rl3@EUC{^){dw4Fs0PtJA!fp}7w8Iuu5vpxD~K`G7Ex?jrP zpo0rYZ2|}J?AS>%C*krCy0Z^o2i*m-e`ym;k%6(mKSJ-OduMiOmuPhGi7OfA3rLT3g@NT!b4ay2ay_zy!?P!Y| z1f)48VybGv*F;GVMv^J$n`6(p-*+P3G~*QQ$0pagUiTyx_u^0R<~oJAn@a4kAWLDOV^RJ9lRtlI3i z{9<$~$j{;*MEpkr$Tz#aeaJ|0ltCC5=HcC^+i(56*TYBreU;iamKV%M4gOqxcAADo z*to98r(m69`PP=G8w~?shGD$0{IM_X8t(yT)G0DE{pA}z`6BfX!qtNF&Nofe@C!o6!pa4?%2Z zAH~heXecJgzVpE9+q~M|)0~6*2A8$Q)9`i(gSLzEf!7yUxha#7lEh_+l*$~9=d)Vf z&NNS4FKB1&E!3A6JMGC*;Y=?G)EDa)VnqRR0))a}9{Xhrv)I`@B(pN4c>{GQ0WyN?WL7^PoaBHtzf@@p8qe`<52A}pzzL*@k_0;rw zI9aI>1G`;}A*80SeC#X9O1{`N6;o+4TC&fk3BR(2J%W8LUNq_I9Lqa5dEA=&Q43B6 zpUTRRCeMwjDGe6X_S|w%+om!oOd&vz9FVs~_)VsC7uDVUSU?f3-#@o_z6x-r%}cz+ z>F3Sn)tRs{CIj+CWj5COR!$Ix*YdCP_4}|M$b1dZPjXgTuiOwO7Yk1S2U8l$l*XM3 z_DP75hHudou`p;GZe`MZWbH~@^t)}g1U`TnSn;yS(5V{-Kq~Z0p+@brRd`p*15O4~ zU>3wnk~wd0m0u6`Z^om%6Lqzw8&8yM&k&)S=PnY}*6IT@aK{Ug!P=mq5;ji${BUmd z3nTv{177uecJM8*ir)JM^RVI*Na1?;LAe;YyD>!PR<|Kw#WAG#`uKeX?}57UmZ3vv zFwKjQf zC~r&Qj5zjt0pDg^hx+IeEpv|L@a4>+2hdSd;7riAv;dhWuL*+6bOjj(AuA9Bt95jc zU3e)Bc?`uD)Y6}%&Sq#olHL)&SXL%@J_T4SH#k{W z1gncRGBZ;|pbr~622_ZHivFTx)jNMp<6+2~yDTo&v^{g>L|T;gfWWeNvo)5_chZF=ZH()?C+Fy@VOL z&7mOt6AJZ{fdW2@C8~rKfN>e~sTxV~*-OPB2+nyhjp>ufO8H3eL-Lj_20oa#ogkbS zQ9dttExTFEu2o&+ML;|~eflFU8$^tt&bpW?+TA@(MSU5*yYTFwtZp!MsIIpOqRkq} z4L1}^BEY*%>Q+fdzA}&f3gJW!0})g|InD~c5VANEVVgDo3VD&0-@ zBZ$bmFL$9>omW#+nM 
zqvcE+dX5=)j7~ScO}57|EHOGlS(gdV7-jk{M~&t_)Eqlzb-s^Y-2q85_~CB^MH>hp!cUrn_{hYw^hqRHv-W0Z-vOA3737X?j>EFM5pFJ~)lJHxkH zi`+4|hcwCm7ja)1SJk%myXXc{5s*|sQMz*xqJV;cx}{U3q&pX>fYOL`iiFZ#QlfNs zcZY;@-Z7WDb$iZvfA_ri!~L+fY+|mt#vJ1r&p)1L=+wBQ*sb`5(zI9y8_{(PY`H!^ zl~RQy#9j$j1w6#lLEr9cZ!UpH>$2~T0Hr8eR6DfGrV&-V*GLkxY{?y_>aakW&ky-% zwwp(xUO?Q+n`}P#C!Z#a9ziG;eOZ_W>KKBk-Z$I`?RKd67p~}gf2AU70G8N(>nkh^oDAjH(C;_eXBvsjUmA)l&sw$0~&IU zC(jY>ksP|-%$gqu3Evq37l<|m`a7(4<(gndIp-S2<5X~gFplGFnX3JK6zPTO!TKHd z7Z13ZH9BZnvNyrjqDOVG@)m20WGAvEG2_-WF>r|KZhKh7le^R?LE<}?^_|rk(%u)W zYkojGG(tnbdK0iqgYY!67<#f{HA0%tQ zG4_M=_|a4hDkGPA2! z>V>51YOX`19gM7(CNhtj4_&N7uS|}HH~7*%i~2N5@bleEEp_!sIkhD-1V9PdPyXXT z^Z5Lxj&Q%$+Nb52th;fUi5Sy%0)T)_9inF+ClAos(b>uMDAq70X~RE`gK81I$r_Ij zmA;|uH5d5VP~OdWyBOW*O(mjWTmfAYGH%r>!S)|8y4++!AOkXE@&-w{3BwdJPXjdG z=3PK6=_>C7%BTMom|LI=BR~Q?b3prsvXJCi<{OZ@`gi3%L`ESe9l*46b%4Z0Lq#@Z z1KbqIUXnn~|BqZ7%KdIqNQ3mDNZom@=n`|RP4`_+kdqc=f~=2~2K*xQ2bcI!p96lC z=L*E}boIkRy~w~-88Y)0t{*E!^7vN?xMR_7fkI$5pek9R4x=}D41dW1dD1U|_E+iu zKky=0oJbw7BsS#F=X)6}o|M$TDW{_^zmd_LU_3xs(;o_KP%~y|7pWE~(0M^AwEnR` z;~Km5nbvo1SprC|AU;CsGsG&H+@~eykQ!mPo|lc{F;={LE7#8F`Ch2Y}K2{H=$flaQdGh!>GDjL5Ip>j}{C^<5e+II@AN`+CD2sre zZr7nd8Z@O~Rx57(8WB(#x;Jdh#^%iP^WJ|a{h*kzk)d1dxcz{F=h?d#uPo+4!sCK$ zGW#99AkCQt00hUU1G0hvEQfH3?~NsEQnLKGh9JSZOu1!J3lIqxvHek)EY4%<#F!T6 zwS2gpN35=P*azYU91QWzKWEdgx%-En-_Va8I_2*M8VSus8N`PGiMMIC>2(S?OaTBW zV+=j2&Lp$3@)VI9#}}(m7smk5|HO84=RPtogRI&`cc$5X_5%1ZP!OO+R_;Md4!LT4 zAPVK*i||1JlVu|f34Z5^m~&nb9{|=Y46Nb15v&^MEg$dXB|XOO06qm}h}Qp1{NH`B zPOS>NtW33%c2-@u!-lreVD1&5#i)ADT6F?i*6%0H`IE7PBqq~I>L#7j!;L%OSydc@ zhfycohIDFK1VNDqsgxkwQG|p+jt>w#>GwvhgI+ZK;O~bScK(ey_CFXp{$*8wLC2UP zS2X(cjc5@<{u`E zP%wpBzSOur2Tp{+HJD9iiwLkg)Cz!(!mLMl7$0N4_ert%gOzt{`QNbe{~K!vFlBHS ziva)_A7BU$d{|qy73|kb0P9-=8Z%%P`TpR$`~T*a|8UWONI3^Mr<5N%5s1QKkgfAF zcsT!#Zu(c3${!K?=Scn?T0mHPN$dD^w2aes`7eqGCLX*--Skj{J@6Z3;6eF(kIzt? 
zg^Yy%%8#9g;-GX-C_OL_0!K#51@bB8zm+rLA3m=?JgAUs``@^X{+N{iLBD{+PKd5M z+@MQq$^nzCMe3qkaTO;ZszETGI8X0$5Mm2V>w(`fF3ZNGCfM~G2QW$Lzjz1f2_ySPY;+kOLJ26NIZ);9E396vJ~ZUE ztbY9d;g=ggtQLpc5Whn8046Y%u#b1pgb;@|jxzy#Pe)5F0c%h(NWS;LwkT#k# zVv^0w!t~>2g<-NqM>*t8OOyhFNbGPkz!p5z79xrMeOYRJ#njoB8;`3x&Bff#?Kq9X z7SIlLSl3=7DHF_{B$Lw7J!EG%`nyEEI;#YT1Fna3d=sPh4TnMhrJ zv1Y>EpbU)B64BMzgJ!ku5T@Y)U;fE)DK}=o>vZ%;38(w8Vx$LcMxa**g8@(xpH0nEI;br@H3PB9D zr1wShs}5YvMDtH~HkfbBp*bbW*)L&TVPQR8y{TX5h1!(ft1fsu-CyBGIsHwjGaI_y z5{w>cf5N49Tp32OV4-Qe0KiEI5k>lB*JRn^6tc4c{G7TV?0W}2PpP@p>5KJf*e0bm zz5oX%p4-dxd(CuD6&6b4w0H$C6jX9+4LL5vtyNzB;?2^jcRW*9-;pIn-jr*8Aniv) zVWIifq;#9AQ$N|i47C_^Q?Pg?h}~XZx~8sLD0byVGR>NaI*T@)sZ*lXayM(DsLxRw z#s;uTNbbyNT9&ONJt#0nbTS!<@wZw_-D#gIgC3j!7QqSWjt~n1?X|_D2e055)qW#g zwv7*S%(WhJ z=#zJ^v|ohcRfny2AogSyUd@gUR^y&3e4);d=x zMKffp&uyYDz_m?t&Tdf0$8-2h_6|@M(lO)X6hxqA9985qjkbzh(^G;M27L%>F7(o@ zjg$l>-P&Y;Y!;Z^)3_c}$G4qF0NNgO0ei-I>3aayMml#9#(-)m3RCu&qih)KN^`~d|~-c$)tSmwDut4E!!+-XV>pg zJ#l6BCKxg%;vc%mZ;lchL+nsL+q;i|8I0N!FIN7C-lXo5fPMyc`cC2-7wOH%KdOYY zYTR_Uf;;bqQNx`EebO0uJxe28ZA~3?h&JnBbfnk4kpV{BO1b%8a)D+vdM&q;9n?rj zU1|XZJ{p~?SX#8?)y@lX_V)l*oa=ogAAtW9z=tiA>r{VdX)p?Ksr!@PAZI5`7(lKa z_Z-mowHK{p;2)_4XjHn2HhCUo{2|w>1)-*DG*v*1L;8C1+m(>!8IVg`4bFA4-vi#e zdMG39m8ye;?fiB~*#poS(D^m?Q|Py_!@X-WM;LhvasQnwyZ2KR z`AN_}(4-|V@XYzRbq^9%Ff7QV$xZwe`d!CF4Co^;LQyJVs9wRwEqFldzLM91X(jnP zXxVTCn(%M#;otUA2vyO%j{fe#{iUOv04)2{WB{bnmRj;(E`tFq7sxRLdBs2>ST6wZ z8uefpeqmE-CTUGbkOU?4c5ckDd$1?`)43(|yK`Tvovc&1)~iu?i7(;I8ILpc zb3lLixGXIc6c}lqn@)+@*lHDGFd4`BZBoE`Uq-|j14>UNfWHKLdOSx<^oV?1!eOURX^j`3fPy!S z%6>GDo@!8+Yy=b=o*vopVg)8)bU&Z?4E$)A4!7`*MQrMfwMR~CE}eos!5)1{Oelav zctglnH$yRNRY0G90*HQALwXT+!D(FR6H;ogCf0!8JK#d$IACjOX_A%_#l2Qv!E>})Qtn3VEl~ns2U*@++ z6srJDsMLfElS5ori@dG!M$9VibSD?8TUgXq$w+$=fV5KpID(Lq3!0Bs9#6%d2T<@0 zvkd@wVx79VYR{SVWWyqDvvg?#~L3snJVX(5?`PS#|%b9~_4=KhVd8)&olg1C7c9QdDq z>q)7d0i}lSfFN`c(?*)i86}XNTYjI5Mi@=JPuATc;_QZ-+EKldF$4MEmAIXq z+6AeDg4njM2{8)i8WgO?&LoKxsJPu(+0_wrnYx6EX7aYm$NI_kVwauU1n^umLXBoX 
ztJ%WUvW0l7PH`-ibR72pW0a)LBPmMD+qD^2;YX_lRv;_8*AWk2`vBj+zkNlap{~mY zXt~WIN2|I=7pl)r)>(4l-&(m?3^8VJi^WnSwjUf-f{I^N$khQ%VR860BBP%)_$!kz zVs~IUooE~6)fg-aZhg9s?R2skQxVdd;u-G@;9rNKV-ZE67g2W#7R4<&2~z#&q9J*RlqvkNqPZgS+lEzVD5N!2)-MJS0$f0*^OH9i02q0d_r;5iU^6N6xP8 zUd2As$|o(U_5wvU{j7_05XiW=!%l#~qN^tpj#z<5ryD+^jg296YxoJ;#hYg*6K6w1<&jCaShfv; zR4g?gftP1Zc6KIENX~BD(I5&?#>Z>sZUqG@9B3p39vr%RKmjX8{lrVz220}`vw2Z$S8N8kGDQS>R!?f{V#9b{G&d?`k53ngWZm2Xpq^z%@q-Y zSWEAZ$6GB{Ur3Y&C)$87FW0vNbd@vrSiO-J`vk=Sv~XMVSNl9yb@a6$m!cpXC0oe; ztBKAsOUdXHgELK=t~+*^_CWf@i++Q_j#w)i7%u&U5;XM1{f+x!#%WuUP}8G_XlHU#ByzK4>#XpAcHfEVw7esZWt$H*91Fo7Xx! zR5C^z5I)_X;e7HLJ40xa%k>C!f5=;E*xMlAVzQRxT$Y->;fi;3jmBl8_R=jai&_j` zd2r$}GiX*67cjvD%5_06>)e2m9JiB|K|*pRcfXG^o?s7~y-oIyU9pDdg@Hh!UKC1( z3-mK86d8J;TU8&Sq2PR^E@0QW(rR>pnIa6?@rehjME=pp{6|q8 za(_gUt#Kzq^GW&ik(fD+{MIt%l;LQBImb=2WY-xTJad@3#ik^xZ2c=0JM%$SnW9QW}wL~_BiT$EI-Ijcr#6yBe$hY1BtTO723;A#ogLlSuL7LCwTFtSvd<#{aRQ1 zuX2s__nR+1I-X;5yTR#RvIvZ~tek|#BiG)_Mh@lO!R)>iHB`8(>A2N&-H)|$yZfyV zwNrXyD?9701oWIosj_(k6K-cGyUvYL$Kg>7>NbZLH022{Y~)|}1Dl@G z)9`dwe)A+Rm1pJHo1869q*un%g<4nh-vUErQe%zN5d|=LScZ7C-hmd4pTcP;o48gR zZ0su9m><4j@j8ogZepYR_S=cm-I86$^SkFP0#K^%=<7R#pw#iYHp-*9n4Tt{61k4x zV={6k7y7ljw%(DZ*uqjJ%hh~3S<UZE@1Ey3(w1ka~(?P7mlZH{~Z;eZIq#6;v9APD{xFHP6;3!;J>Nxybyb!{n(w zxW(^Y4v?0?#l+&G2;HgX+5z*QA4v!^pf?CR_b6&8_9I0_%7(+h32`|s>vMi zk|8lKi1m;!vd=FifkS8))E1Vsd`oiCa+rGYq$W#y{wn~hYqB?HI*Vo4BsgZ=3lQAQ zwgE%!nCippNy5=sHbNDoTu<^0)fLTqQQb4;ke@+39idyj^NIfiVfc{Kw>CagH<}S=?fwFf zVnsG)seri}FH=!!=T(l3GaVl){h5~xWdyM$T<^II=#yKgDQzkAFVh8f;>tXukTx~N zw0%yFtsQYmid^jDxWi09D0x$)kqK=@A7P6Fo!MfBjwGf2Cl;0b+xV@yEy@yTy&9ba zEvmXXYo)dP$4@%I=`mb5y#C)lf1L^-pJT1%Ve!R^zmYd#H#%A8jZ(UM`WgQeszr4f z6jSZguP}T@n~+tK8;ckPv)Hx%^sA+W3UB8nZ7mxYZwA$_Gf65xeq9FaFr!{aiq`mu z$oDAs^P~h7Rm9?gv|&Y+m`sl+$*GGwu^JR%$}doDF%+MO9X}xbbZvP6u;7CoxjQJ_ z-tf%G2hufMSc;Fu;4k=fqwW>r>=SI~Y>eavaVLoea=&%`(oEqiit2hnjXaT1LY0G# ze~=KidHZtW3zS#8*1)8ark@y~ae`ePuzS&YT(`(S9p!TJSIMJSkC``)a$kttbQOvW z!Pn^06sP%&<>&~{V~~3q@ooqoJztf!4YPExkhVji 
zK8d^=N6nCQ{o^;e+J3xR|EIGQ(|+Z+Wc9elQB9* zr>?)CVbGBT=dQ#b-vUn4niRC)u=-{xC~4Dy@ckU+71qLlLRk24mZXLH@e!z0e4=b~({g<~b=xGD%%B77@9t1wzHP4K}9G#HI7~&UkWYCpH&xt>vpZ2VnW=(&5nDSXe z9py{FC+t_RC{S^+(@RlNJ!#F9_1=hzsm##?mrs0KG@gjxbNR~2H5U5%(fT7E`}Wp2 zNBO;sBb(}@6X7va8fQK~WkK?f+}AqUG6urS2B=lH#*5|=8(W|{g4s0ZBTDUXCN5(U zmKZ_s{=4h&_oI=VS&O;(eI9md6?PLx<0XYliJ5lMrlymH<)4xa4UYS$?=xM!cy87E zQjU=c<&~50?SXLP3za$WUbmtAB+azr8PboF7#Qx)T1*lWD+lCrn+(FVh#Clb`Uwx^ z`J^ASz)Vh>bko*Q?aFWz&rqMsXJ}*^Cd<#uybKunR73FaP<^eM6ExKmk#lTH=Io;B z^y?pXF)dmz=V+bc3uB5K^_`n%U+eYL+!?c@`;x>5fSWRNt^rp!HOQ(thcy*6eQ z-%4VYc}2&hbYdY7@|l z;Dp9plfh`pL97|$Nn>JTgbs-3pAfalVT2ysAfcCUp^oVAgw^De=b^T0f0$&QT;hVE z=ib29GWZKNKzq~Kla`iuutFwSYd4PGBqp>E3%^%ETLphRbk2q^#*~6qgat!&*Eltx@&%K0xc^<)@Rdq*JQn?On>j7nq0jFwZTzwN3SZl}GWLD7k#LT!}O}cw~JdUU9n? z5$y&d-=gN~$<~rZB!o;zd~ri}7=|^Ex{OymZ1|MAyeXkzAaDtmbf$!^D;Gsdi{EW`S+h0vN zcrwSF@3nrZ@12^XZZh7{y-dlfZ|?VAd}TQ3HFHl*zB3X3Ql#Jn&l`JD(5>PqKAjHU z&|9;hNob7v^=$*U_6=vp`?W#&QeE6J8Op<7>WAfh9wGFKNKfhU@mp4SrX4jQp1P^4)`zZTYipuNyV4lP5T-tMv63VQfnU@|zcD zS&t+*6UXJ;ceHWGX=Dp#?~QPaPkZ5LQ_6~;bSe(5+k0Nc>M0H>hwkJ3^(Zo;3uc`Z<7HLU!l8?Y(1x-)J} z7?i(!CquN<$+u0}Fr;l=jx(nnUF*!ArLVAiXK%dl;&!b~vh>i(*x7-+O{Ag}qiLXB zT~fPTh5)5uOrb!ZR;*$w z8<(Rh+ag3G%QqMXpMS%^vi;yt@~Brn*mh<6C=N$4;$)$yq5JbZcfrx5kK4WvQEE&O zVd21dg1!Qcto!cd3pN*>A*JZ5-NFcilfA;q+js5PI^32bT$dQB4%rbAuFBP4JA;-) zmy$=fv$o!{MHTzv_FHiW|NcT7bmKkfkYynbNWwxf0;y{fIsZvKfx*=~J^)a;A(!HgweW$p=+auxlF zCoB^bA%SIso$*H1PK2gucIgX|egr20Re1|!A*4?EJ=XrYdGAWgtCEx+%?KhKJ2*YU<^zLPFO4RI?+HhSTZwG(?0iA(AaWk8BTQA1m0Bx>#kqgu z9$D0mhASRN7!?oHatp0YnxD>_;~EZNVV}pVnOJw-6N=QQBMHu2i_;e-sS#>xWbv@{ zSTIstt?f8&?Fc)>d}`wSS1`UTUhKfSVlXqm$eOu|_6->7xoSXGupwZgDCm*lmV~q_ z3Ne>pUX!o9&hY6)_iKqng3Uw(U|$kQC`mK1(#33wc9oL5C3IJclZmQ+I?L`i8h# zCmNw>h90T^K0??F2CFzm#b|a^EkD*CmZ*4F@ySJ(A9nU2$muT++-7ghBg!KD-2<}| zXn~yxCVns)*(u{=qpMlMrdZbToK-xu#%eMvw5%h@ZH!QItx4SlD!RjMl+RYla1_ck z5@y0w@+JnaCo(;&6R6BPJoOmELq@#TMndj)O~dO9(uS(%^yy%VM``9LboK$*H#y@9$C?e>f zRaLZEQ_AZ$B0(dm#=d9owr4Lxk%Q=4D19&GmVEYsHVTnAVlKE{RJ3kE7kMgIo;Ear 
z6+=6;H;NCsD(^_RrcoDm+RHMTUp&{LkJ02RL-kN` zZmy~im$z|>E|^%stc+H89$&*sWacYa3zat3f(yoB8t4h6l^0HSiQ4f;VQP&XvHRz8 zU~JygWG*UP=eSBmXCfQm+;of86rmit-I$W4mKBfLqDNjp^8DnYRZmlKIzP9DiOW@XJcRcW?^C5>P|~3BZ#R; z`2gt#brV_kL}sPo9zI#Z2k&)lhx&Bsg}!A83`U}IeA&yjnW!?|@^#tpICc$+H(~54 zH(86quxQ0f71L1Z&NayJAYO{}3%C)i7OuBy1a`RLc++rEkyPnswPe%52rJyH5pxVI z>qcd=rvdO#t>bCZoNEfoyMGnVCLNmIFqT0KC)?wmi$J)t;Fua2w>ZhU>!i@+&c0{3 ze?74@n$}=*WU#R0o!+}OKjGSizKesbGG0y{40HKQn6dsU1Xm z(&?G&v;cA2-ZMe(eeVr^tz&!{d({s^`R{kXgvqyf%N1nbtJY)Fah*qZY_`hS?F^HY zaX6%|9=1x`^()@XpOavDS$=z;?QXQ=h8r_+woCQbl`6BH=huakn0cPPS87oj zF(@d;xsb_tC{q9r7<$CId!y(bBTK5b%23@SkvBfuJ)SHrL2?DPiWf2kp3xaTE&Hn_ zT-B_%J9*3a?i$X;u-&JqFg^0WWb}JpitL_ylf9JcmA{_8(r3@&_3W;VyiZD%Qk)UyiHo(5aYK zQ}$e>y+-js(G&7#LBmxAIb;2D&#-nL3|mNlf{PB9L?h03BUe%Gzj&A}jkA(cK z!y>Bc`Z0%fP67rF{dZEbOjX3(T6|?@Y$X1=VW3AxaER|CEIV1sT(ZW;LaG^29Ij>b zxHx=+_v@qj3xk^XgR7Y3v#WW-c`lK3Y&jpA8TBW8wOV9)WBxV7B|UnxXvJp0_^F1q zYoHWZDCy1E@Is#9UF!Q%pUn@Rx*Q6ug~*aRS{w2e2UrJmN{pnM4iB)snJ4C$1e;a( zat<4C+N+GLkcC8mZxQqQQ|+w4b%Vfeqa!hq$Tn`Lj!>VH=L>iw<>mLG1o+6ONarr0yVKV}Qq}#hp@JWYB#I_)@tIqQP^roy#uuZYVf&)#Qm`UiXzuM+7CH9F#h`pVB zRi)fXzKB>Y%$d57u>biyK3XV7dRml1hEdZjD!KSmu`srW=I)XD9$BeFjZ8JKJFeB> z;9_UWx42|ee~WAgs|gXD=Ua^tZbN5vUN&=n-NEgb^{#935+f{?AaW>H|51)VLn?!q zOs-S()MM1i9;-|YU)~ zpHC9(62!76tyqA*>Jh$IxsFJAi$X-rCwp`WZF@KLVrhatGwGyI+hrycKAKdf4I!2i z4VW)&ECuT7^?T-e(o$z7ZmIbpq8`r9bUn4ysjg|`CZDJ)v|k>(*CxJnLLr2~wuIYm zu~rkXGjgqa)YsoM!I8%4CNF?>?*?898(LVzNOQF&@&Td zI@4F^G@dr@z6UV%@IkJNYRcQub17O6H4o)$l#kZf6+0*Hk6g-3jcS^!=Rz>JXLl%E z-|y^sICm!k*Kw&$%fgx?J(Fr%U~Y9jgTnk{p8M7oo!aSC0^`IXg_`f&dQ zw$|wcsdb)?e>`p3o8a<0ujBcx0QgTU=;GKky5reRSYVnt>78T>vMc3>lqwXdYilp* zjAe$TgI$!-Af!cjP1=m5g>`dvH_L(dYW6rqHQcqkISTj^dt$U#z~$D}GBg`jNAeIt!UESMgZd@brE@iryY1>xWlu3lu%}9fCR-g= zo&9rbb?gi@gL+KemsFRUTGUM}BHze&jJ>jMk}($?eg0_=rc+!g|$Ial@;8ds$!*{BL*weon8>7f2%os;a-JDoX zg$3^+`vxsf1f_hm| zeqwfS{)v|ed{S4=ajEaGG0U}bGpE2|!m&`^**g`bau_wOuut?b zS+qR=Av#qWLkwQ*JFFP}R@Yimm)hn4?uyUih3{+L7s?;;(6oKpp)ezhF2-Oqoi2on 
z8|kK1PWT}lyFg>mwOH=U9gzV9k#Zw+neMiH1Eo`vn%mafIs+!FD>>CwE^eyRjy$Fl z3d8ma;g}Evac zB07xL(^9fpTDN>ewv#Y4TM7yfE7ud;tw8E05uFN;XfZJ<2psx0Zs1aH5cY;Wm4)=) z;o>ykPQj`no3KC6Hcj*RJ!-S7j}lrjC*~~1Ru*=2j;NcJCd#v_El1Z|zhIh1T$K9C zLMlRiU!4a0q+dd)80_AdEQK8KB?Z1hXX{rP)l2uyZ<^jPHIFolk1d7`-kT?Oda4e; z2fvx!{tQrXL%YN5a$NI^cctq4WH=~Kx_o-LW|nyvS@lL`*!6M1?DH?jM zmxf6gE}}JonAiQ0hmq0n9$82rG4J;4WhHs43%w8coIH2Br&Naqt~?~P&b?xBgNnAj zN_j|B}7aSPbcsMca;{ z;3sd6*xf>d8;VPv)!X~5)_GSHYt9x?R?d_hiEkHjdXRtW5vkF;gSqzl4NYdGww-w# z4-<{qbY3hM4Mi(DLDuwSYPSJ=MP*JO7vV!fo!K~frhgJ0GEcdI{)f)r=pF@3A_*rGhc%Y(3Fun;2wP<#MdGNK|xke|iua zpC)>;l#+#MXNrd6BziQ|!K(0>D(ywomuc0>%$W3I0soaW_&YrGk|HsUTO*}9!v^Js zSiR8)%rvfAhpqc|e+8TD=4RmzCKlPKX?zaH7>4n2RBpY=Q9l`9KhZ=vXhxC7>;Y+s zjl>%eV_E`Z=1d+FtEN*-a?@tYjtM8a?i287MA>Gqp-qh`&)ORu9S&4i8MY8cR<|VK z8+|?-+*1&FLU?p|=xWc(Te$S(1zdZqXr(K;fsw*yjog0mI-E1x)xoxvBb$(3ZK>gk z-0b{oQ^JAF-SRY7U%HG8Yu}6(>yAjP+oRkwjd}B%XrC#ZrW@&+@3`f#t-YtAGucRa zH&sU>6`0bc8Q>cen0qBVJK-b4Ok-K0Z3gK=tWhUF3~oPf#c6vU{sJ%om$^x*8+52c zNCQ8#S6qj2eDcP;KeWr=3W&B(-cDZ^yKL^tSaeFH4P7AP6eAh@YI#mz%w!us9q?o_@`^ z^&^fE>s*~xbV<;A#wUf~qM>MS5P~TBnW&S?2{8!1ZX>S=#idTx)c2%ao47MbJF{8@ zR?yl$70FhmgtUFdK=W_Q3fE51c8V;^h-G2Q>Oj*h8q;4^wVL3+Ovba)A3Bgtyg{1V zSA|xoU*`FySO#XEHJA&r!uUMFtM+ZQcD_FD6WZ?)P9BN}dGfg``SPw?uUpK?Ta*!A zK_=nbd-4U=G@HIsVVeZ$W|y*}Du-QEJHb(dciLzb*;?CE14IbkRhp?*>cmJOau;&R zCtMTY3bG@o^?X%%izX`=yYA+fz4mM!mHCXMlgXR3H!j3)JkNgtsssZt+BOtL67ynQ z;i4NJM9ebkLVHxahEG%TB?OsuO(pYFvcwUcXAF;~`3?yi@n< zW?tOl4uz`i3oUQ6^lAbwnA)P;UbpIebg-MbA*Q>NLwc#T)|;ceICr6*(o!-=Q<)u$ zqF#r4=_OrIr-WK^JctcfT{GMs$`@50QPXwjnMtT+XxJ-{)qEfsp*8Mj8kk`rVBUuRHnv*#`$Q>EJ^c8uz{*ZV@35Fq9zHlUDob%8x#Az0!y?Llt$cY);S7T$!$Lm$3RD8PZJSg3p3 zr$lBUj<(FmZCEAeLhUX#1)vRX0A}htD|L=pE_jrioFH_qz&JeA{j-V>1y1!QMRgwf zmjzTg@ke<|4XhEn3nV zND|pgGt4!>{KWZOjLJ>^b#K~@i@o`5xwJfk=?MErzIGjv1FPyD#_Z?C23VaECUkf+ z&d^>A6LV>?R1&2}$5pf?5uiQP`W8J9Rv1OFc5WnZFL$Wd>}hAvqCE2^zWe2M7;E>9 zN;AgdAg)f7=fg78vPIRx!I_mH&b8EV7@A_{3d;G>oNdyu<&rZ`*h%KwygFGVT52CK 
zAS&E-7@eV3n#}De^|+gG&}tyU<^Gg!8+AnAYee5Su(`{sdkcf!xlzbe+~`OMtx5lZ zaY5LMpfskdn1izflX>SLzTVvQJflw*=O}(nDtXUdSqDOKw+pi#Be@Th-CmP&N`It( zbKT091h9Uu-h=A2AER`hUJAZV0RHzHa?rwm2nur*GQt%IruYEgjJ9imsZ(lgzr--Wd_Tt5nu`A+%+E;7D5_k0CFL*_24b*jR#P zw^o-S;@21X;6od)wX*XnqCg2I`x<4HWcBP&j_b|XV#XL+6w_ZkD@rkQ*7`oErLJ|Q zGaoXtZ}N4=)Z#10+{cqMvnG=t9vL@B?GIP9+oy8PQK{F*jb2){PocmLS3H2JvD{fl z?DWYRE}{6aFl1%%trS1o1bb@*>|p|$UL_kX!N=xrb=14Lao5oI?@wUa6{YB7_e^+= zYx9jU_YibOZ#sk1Cc}lh<)WG@_ExQBi4J!v0ZZ-{K#=O#g!)cqPd_G7~T>U04qg40u46gG=OFW^pfjHXQ5X$4ZZSCyJCH7==O zK4?WN#WdT4!*t!V3|6^FDYUC5^)S918FEH^OkbIl>L6nlx5`BMIX$3%f%1A?I#5sO zY9RP%&p><3G2|Cc2+F=r@WsQJf%4r zY-q=%mpRMO-1$%ts(H$`mawCeV7m=EQcFGjeh<)n{Cok*6D$v)4@&`ElLr*&?|dgj z7P|ZMWIttqexZTj^txutLiGOg;q-TdDe>%|qk&%F&jjDrjbPBPP_npTHjka1?y<>y z^ziyc(*py3PmfCb?>s$px4&Ma+tR_Kg%+U3;QL5XO4~g;SwA~nk2ndKN$Wsh(8siy z1WlB+N?(d~J^hy8X7(h=u_3-|;xKE%X*=QHe;Vjhf!jr15gLCE2J5e)7u zczd8ueoSfrL=K=7ETn1%h^fW_lFjWJ*?b6e^-SG=gqEKZ>HELGO=ReiP-2DF{&jSL z0?Wi&9avQ}oX5Y#7oap13p=bkPgLCRZW;fy%aaX4%ZJZX)}FKJ;H!UmHH1ztA7WPd zkeuy8Gj>k$bD#6f5SSrR&y{dOy&&=#y!f`2ANoYl3S9b%y${A6h%>~zKnq#ECLL%C zW?~PBSUsOq{`K;y;4M>M6#RVFGh$Z9pNDeh^RI9C0>&zEE)EQlj*DsiDT$Ig@mXFS z(?DzkLxg&MUCFf6!V_>+<_H1LSt6H7G^vq`>h@!{89|5XL9bqlOuZ>qh;af{A_*HW z3nrkM14efTNtp)&c4FOEdE2+t$y`_ngwh`CcJRsWS_RFfH-u8cysk8unmdm~Hc@G^ zya$InP~y3={;waw?<<2$3wRr%k*ZJ(opj)h(bLCZL>`0oo?uE#X+m^9xVOF#;=Dx= zr#mcoBZQhjwdRv1@}>uYu_50l1u>x*_yeX!KgEIH@6NOr$sel~B;5~! zh%I-<9qhoTzcvL%kyff*I?xK-4FNp7cPapiT@ARLU?PIkr~ZvWwhk( z?{WI~?|q*qU{!cSLt_8k7{C_~60J8&0?_74FF+@-45Y&v?ZyzUP@)!oV0)dok?&vlqZh6O;!gFGHVO#tIs!e5xrJ7`pT&FsHt623E`? 
z#E^U$4K}&&$>Q%f04Cc%L*uV!`g78K|2OX)L;(JMXV5$ToJ&9B9nhJQDZxhRO)ay$ zA;}MIJs6O&$GwKmWZUjz7?}HcuHa6fyZL{(6XYGrs7b+*`4{l^Ww}UwehZ{D5X%{m!72ZJ%~oVvyFynKa5rwDnBghXY0 zJdQ9()pMaA15Bv*I(!6}prN01FtE{e*NXpDTv9Q%X4jwOs&iXVa`2q6#dhx_!fD{Y zD=7!%Y{t8GuHl5d?chlAxN5a(c2WxcjTWhnW?bv07GE2S$f5?wdCl@6F2Vk;&+hWr zx~sj10+B-vnj)xn(KtBKFo_MTA?D?8Eh%d=Ef0S^Ao^Z6wg57*OtNQCa1ZT8%4jcoItq6fZa`Rci|m_FSr{pMO$!|DJv2ibh5D36g6aO}7D`)O12W*Twv{ z;Jmg(tk{tyD5qG>^07>;vYMzGChGX&y-9yI!)jk9v&?a$W?&)1Sbh&a|FD>@J1L%G zWc;Areq1T=e9WT`r{gkW%TIvpId~#|9E1SE2)I4d!-oLsbK54P+}~e;&=tp3IsQ+Q z^?kZ5jipY#x~!Hv#2i(`yqww8D}cyKrl$E?>U6wQr?hZCZj@NMf68&&{V7#|Tuy^{ ztRDdt^>sj5f#H#SC*hN(vx>1&xg&9z65eV3oPa9+%?m#MUfddW)sGn6rZi*G3QmT5IWC<_I4@+{dNVM)DmR|D65y{u{dPE41_k@Etz_ zy6dg*B z#@4facSR)M=@Rn#Pf5BzJ&_4oN8M_x^;SkXs|z4KqDEeX3~#H?x%QBH{XvIe-WH>_Ee`cG3*R84XP7{PqJg9;1%OMcgF*jKRwG->kbXeH{>0gmp6mlT#S6|^N6iGHOU## z?!mQXHXc@YS|`^+M2RG;>DaOo_zy_wphy62C>>~cR69@zq6Qc5sT;g|Qo1{AJQCv^ zi=(|g`jmyLJ`9f1OUw%+TWbKEe1Q+Hj}|$U+sxh>eJk|{Vp2eXMP&(f_Z6{m>(@K# zCcchjGU~Q{kDXVp$Z3yuegEDRS&N3_+?x_7c(3i|>v)bxWsl-qgBL!?nYN=|F<$V= z%eXxmNix_4UxQoD5<8We^@Sj1#U1(VvUrLI;GvGyS0A@jXO4qvi-_G-5dcA#lZX(D zil!sw0a;ZRtv=xwrKpMh`op(gAnAobq{awEJi_v_vg_IL&h!nEPl1b#4--Iu@VgFg zXayQWoavt)O4h52^M4`c8h|Zh=+?`n`c28FD2)=z(%&DET3?04a~&c}>s}-3n}I z4*&{sk_ck3b)chk2gt$Ge-SkDkh6ReUZODVc_FMnhKyam&M?+z{G>bm`{1&^iYht2X;=Bm5s3 znMOB2*Wu$!9dxuCNPbM)VP5MbGUEw!JS=zq)^q1pZG#e4$-`hVXA z^nMWH!RL=C{{J^^!1MdBrVU}-jOgi%Xf~`F>}{?Y&VaF-O&b|X9694w^eMqgzcFf0cANOT64#w!MJ>wtp#2?ENbtTJhVFO!&R9h-u##s~zu z9k9^3L&W%F zU;G`F{|h1qneX&6hz^-Fpi*Lm0#r(DQ4#z7UMT^h9pl~KqR9oYN6vRo0pAZcMI{xR zE!a;5!~fiie|yjWII8JBo3`LFfouO@X#UB>A*nDVMk+s9Gp7OMOa{`f6p0}n|CvAi zFk_oKLc>l4VZ31A#TnDVw1^;@FjxpLPI_w*4N^9tuQ6}lzCfZVi;NPKc*dA`yurPQ zw^&#YG|lWQM#kRisqO79zSpHQu{hTO`YiT=_S@wk)2j7G+}ni%9HMC^nD*OL`Bcp-JElEZtQbr{d zj&ZAyj1sadD~^%Pu?{K?D;XgxGkedhloPVY;TXqW#|r1*aQMBiqwet;-^cg!{r-Nx zKXl)X2iLi-_w|0ip5yg=iA~6RK?2%OcmyQnYkhgql-O}qmx`{r53hwV%0bsAtFuYUoy@Pk}+|ACOuqI~C)7iK@= ztk#OAbMlXf4JJFHD~Fwq00%_}(1_c;kB~qk7_8 
zeTW?gzbf>NfOhY;ec5*|wQ+7_NZm}XR|(oqkK37 zHzBbrkI(Bv07&?xCiRCnZZtg*t4#;SF9LK14AJFD3SDOf>*SbX&D7NSz|P8mdvW&a zq2FtL<0!t+v%mLN&!Dpc;%~kes0md7)m#B;C2-;+pb~M9dHxB0QvLJa674`_L5cYK zRp#26DAThL+!5yu&ZAVckjW{ZiiX|0y4?hE&!>dPj=`c25N2(2!5q3Od@YPUxq|OicYEhza>vkDMtRK9G|V zD!)M4B}Fv?cyH*$BD1Z=5aRNR`4e$*Uu(J(;Q}T%sf2-Y9q2se66};ufYJh7Fk2)- zX^kgMg~xZ4$M?B2GAn1A4~-dEU1z~$f|Df$Fpg`E|FhlvAIfYp(Z5&A|2rBJDoTxk zwe(-0t$G1pFr~->ALR1M7$24R>q+Y9?eM@Jqqx!;FMpPK@~Re)Xur2&iSbqFAJUxCme zRJ8>e+6}@G1pDLvTGoPqKSXu1(u4YV&bu@7h`I+EF+Tmz5#uk6@a~MFS3Uc19%5G6 zNcX7a=ej`g;p5DnkUVwg9ryx>?Qh_1!}|X(c#F7livK5u1P~j~03L1TnLV`BzlRFI z-@aW2>9iqW+5WW_`$@|95!#1n| zn&X^??8%|zjSGm9wDLiNXEyT!KvU_Q`fh7V{lHnEk-}$VcLs6S^Q56-Tub#ohfK$l zra6CAh|nti2R0pg2cQ>ue1-)zbY@*qI?UvY^r16WK=8=;FlnO1C`%@|Pa=UPA`0{m zaK+Ib6n_GMpO^Ai`T;CfvRX(D=zmE01V3@&-_YmKRRHcL zYmdA6Ujj*JSKE6CY;lCfU+K! zW`{j+R6ueJnAOduwLg@}DcyUIme|)=yr$~S3CToYTWxs@2~_Eb3c3ZKVk zzAR2+yMK-l7?Ri{B=(3AlX+xhO`eOWuG9t!Va%Z=;E>*Pb+CB-=Qs4P`|&>;1AZ>E zlx#@k1$RE=45%O=l8Fo)_d4NNOn|-U;68$mD%~`J(WR_@4umS{8GsBT@aXdo=|ohZ zkGx3+&jvau_ZkiU%1->3JltP*)i(FxLPWFP@ZF=t1UlGbClVfwx*or5`DI@=cnX7% zmQL>UI~7@C^Y2VG;83`Bl*e`^Kba8_a{_~aFocYLH*HbML*Om?pM&&2R%QQ5xdKPxx-NEE*X>2R0+K}jv7XzU8e_KVIwqI`=zY{+A^CG(*0r@ZK)&HnZN@)Ya# z`h+I?^exVnMFRO-efC*S&1FfTF2NiJt=69j~WdgBFS$6IxGACN0=LJKY@U0QnCuPV3HE z_)UdI4m-C04iilu(heTJlCB0kt;g$^W@cG8gh6JTR^OAVxg-cU9S*o79uX4ReWo}0 z2!eiMc+t7E@+*!%UOi^mp(?LqT;r~G$TFy1NP=-eN8H#R`f(1>DjP3YSyJw zhb^3A$7tYrK{9*83)SEK8lqLqz9GBp;hG|aJ$NGEKczcMi33Zv>82XjN1MvmB&hDvHkif| zHteC+@DjH2uGmE$C3i0J%}nOZK5@L>LdMFo)1i#uo;zU3zVaa+r{v%FdPi$3A8Pi4 z7DTm0TOg}L<7>y`B5MKHz8D2e0@)yef477W{PiiW1|ql#@I)^Q6vE~@n`X(JWt;0= z3550IV2e<@iFzVfQMbWk=wA!Vwg*mE^ee};FK;Gq3{=}@$u^#2MhRN`d0!BQA-+S3 z=%qBkJqko~RspR@U#Kfd#^J>o;NVgCYPa~Z&hO>|5C@yfcL!HEVpDHcr`-WUou4ve z2SmRJI(RRlZcf}lWhTw;w-3xa^)A!Oo$4O+0{_LS>yKs*X@rH2<7FQ0XH;cz?PNwC zf9NM-e7H*_q!DanB(TZGkX4?~I)4>?x}mKNI^I;6jQQ~67%*S2UYK#9F z427s=$?Ca6lBq3>yq3BhtVe;nCBKw2=UOml-&N42d@&l_Nrj28PMJY#d&W}x76Y%% 
zNq00^5e0rx&;|OTIK>0Vr(58gT>ryFTJ<(FRWnR^uozVu7IY1K6Y!S+eE@U9W>^m3 z%zyKD_?D;yuuwBFx^Tn+&h6oz(~HYeXE$hO`KIG1vhIFGDMTOM0GWdRJM85HK)$EG zd6U!d~2_P~3q)Vq?3jdLZ$KZ|o2Nf*X`)|o6ClcgW_L}(pbfUjrd zdh+jU=)Y0A1D+X-zh!|+I%a7*!oXhPH3|FE zkx4GRYFhT{`(Mt+H_CeC(q)fn)(Qd3Q0VZ6AJ}Z>Qe%Ksew<_JK5UX-O$WB~BUqSm zcduIfzr+@xPRgyY4=+FOQ8prWhN|0r0R4ZLpicr`;fv5~3|vQKW*VSsErpxEJT-vS zh>CLn*2!DI2L?b@#oOEd(_jl`YIcSA@=f(-q6=hCN5ycLpZ6v@0DC&|e=5KI)qV62 zFVtQQ1~n3bcl@Wr5_*;Y>ZSwuP41>6pkb0ys~9QZWS8#$7P&z;W_ML)w{R@U5O|7y za7lhTL0P2m1*y=tXF(JR)w?l#Hg2ezr42Gv8rOoN`JWAyWGDZg*Do?i9MBsiHAD{p zyJXv+*6;s0*!_D2{)@26iGy@PAm(ng5G;R^RpsHRJue$5aet9OktNasT$y zIUIaB0%A4CWSm`{K$2i?fB~dQEG%blbdYm`nt@_Pz6|J6pfBif182f7*awL)WT$K? zSc1Qt7JzuY240h2P7CnA|8G_vwCeXT(YoT_F%j@m{Ax7#7nq1_qCj1in$-la<0AQW z-2XpP!tjXXSkfG-;avTFR?nTHBIV9xa1oWqN6Q3xBY)11_H;{_PVGo{U-+w}D#-c%kAH zm6z-_ug~-ADI0tA3saO}|Om;SN4G8XNy05qGr=XKO%c}b((i~wAH9`qkpCJ}2V zAJ)$?LS2gX5W!u)q6&OoH4pC@7R|!^9oH}y-`QtdbqT)@>m{3mJh79JM#D}+mfi@= zBMD%ni)mT}R~2_}O|MFZEOo?`AP9NyHTns25B28}QSzjB@>>sYXV&z1t(|oE_IhnS zwSMEJEn->{>t=hc`_j&RH5t%RwDM3Ot!4;v2`0;L5{TiCxPVsIUu_@Or*A896T(mn z`S%|-ykW}pE2}`A9Q%eL#8oVARf&D+DQI+c{Ntm;9Kw zx8qz@IN5)d1nH`mKz2y>u;Y7i(K$*UcsJtYX@=+nXrO-pEpC1-+3(o<4*+^WH&32A z_cQ+L^_35AXY9<#zYPMvU2|NU0g$NnV`~OzlL}{@ow0k7oP>VPsCEKz5(znQ{l;_a zaUJ*k(_o8s6wXTiBW>>BkY@SzB-eci?gSAIjwO|SS3k0IK9naSCphV3CBl@1Wurhh zpECM@3IVo;#&mMRrXEnA3uCNn_I%iSawYL(O1)DqY-s%n!>S}?V}v3d)Rm$LGn5|7 zUGCe@$&WMP@WxE};$%wN!kyCTM?k=8K+L^E8mVEg1LtPm=lY>VXK^LW7TKLxe}Df+pin)MDC4}~}l?brFm%<*zrV^Xt2#!KxU)VQN0e&(dR2$x$d{$GF zD|7A#ctz^h0|BDDLT;b!b{Ze&ROxZZ4~}DJAmffA8PJ6(e&X9}Tn-8w|C+(`y@Hy4 z?6H-uBtSd7$b`!y#=o7xyLQD47a{ta4Ye@qYQ$tUUC$0y&kikXm`brcZ2ThnfF$Hr zgFdX@+)RhOerzhRIFcXE$^Nnuaj3Z=6?D@s!@6_h$|?x{sjwgO02C=q)p(SR&7njC zfcMufV|&T{z5GvOpwHr-F|gbfadPU2+i3dSW5mBT20BK=d=A&qacBnDf1fo=GEb0X z1^9by-tMqVqp~%8r}UceiB90k#8<|&z4!L3i1Xn%o@_K@cbNR%LN!Y>_ytsY5RU{3 zt-sp>V>laB=TOCH!-3(!@!J$Y-Ga|vOr8G`tk8Ejq}NN|p+(d^{mDi+`STlo5ky>J 
zx!8?01ienc{>MS}8CC~%b*?^1s?*dqC@$q3+tg58r3e@Ym_GBpI3@p0$-7PgKy+1UvUh8H5;q~hEa6$mr zlie+>cV>z8P=l7y)hT&UUIxFo0axDq9VTuF1mYLRfx`*ZYH&KrLGUb;j*aEOs#6?I z^s7W~R$UqaEN1gwBgD_+bY>awSbr(l*6SHUaZ5JR!B0+u)(#8cp-=|!<6qkcP@S@u z)zca7%Y4%IyU@VfleW$#exsKy)z@(s&E0Q{w8fziCqBtgy${O!F-qI$6}+U) z-j&9hRS4`P=iSS8orK$I?Z7djQuh{I8`FXns4n&_2~dIltPn^;?>;XW0j^sYbXP(3 z8&q~dU;hj%Puv4TJ>(65*`4^41ROZB>~!59bjFeKZudWXb~zlN_ECtJ^4Gr=2+Xrn_BBED zZZ{tsIf#^y+KTs=oWPCq2!0Z7Pt_MdZN#mAO7lR^*DA;mI}{4XikQ6D6uj@5{hsd$ z=*?V+_&rwWwYqzhr*H|e4W*17y`p8y2}1L87wv-GpX%^R(_Q_hob9r0gIAAkk+`_p zxIQ#8SJje)xoSb1^TV<@TNQ3R7x2Pz;Idm5S1PQE68b2Hj` za|`9Yjq1UI^!-Tq?H$hbGE{a>rB5(JyUqnUtw zv;ugJ`^g_ohiVllt)YBHeM)@fi(Ve6fPwJ2NY>oSl z3|reDrWCw`eaHUTDc!v&&m_yaw%-0DZUKoMc>h%on?zZOxmC@#5Qv^(7|!UT_(b^X z4)rtzKdb3n1)OSCU&fRB{XHwI?!$B^i}hkm>BG=Vjgd>USfecrI%jyJK|I24!rW`h z{8(Tzj_2HBr8$RHS03v5_7ottX2ILD*YpX!8yL`L_m}1o=#QX!@fV09Bhde%AB};L zMv`V75W;^_==GrPDc@rylJ(hxR#2h4ZyoHV=OBSx|BJEf_V ztSCi;{U^c&kb%cpf~ckDR3Fy!zpTm9PX9#L|2E!xK>DDUuWTnEDya}bdHI5hv}-K^ z+XFSE0hYe=FOqCEXz}Tjl$KO($N`k+ASxTmX<*^jmT?&bPVxZq6OIJIcbU)+7d%|( z`(3v!(`@d2Mdol9Tah*vqUmxMJ(t{Bv6)`Z96U7}UxkH6@ES6B4ed7929Z|E#-{M> zX{qUJT*pGbmm<2ZXb=I?K2M=tQN4Vy<_T0HtkS18RZ4w={QsoQ_+7V;F23?}952d; z8OxLnB`Js*wsS*%oNEb5Q7u;4ekKbq*iRP;gRZiHrt;m@X-}W?c+JkIP-eqV=01sFyX)-p6~7gSa}#98W}DLARW7{z30<|Qxww%V|F#uc8kyMb z7OZ~N4j$wm41&}X)-2A=ZwhhLgE*HGH{iw9{GhOuH3hhfuAlq`lP6Fr|1@(k?JV5o z#BSx})!U+^q zbC7@Yz*9)I81Aa zX7sV(@*zdvG@At1G3NN=X3aLfCb=@WLTNZ2G<~lgqiIYS0<&BedvbT?awFK8r9p}e z0zYZAGij={fM=Hq7Zjay-nTsWT8+W#n_cbd2rw2;Mm88yKkR%kRVDUZCSA2FI*4m4pNqY=sx8KV zm;-0K>(pg%+l8g!uWFuLjXgj+~stT8Zd zIUWTDNOBZ&8Vya``PDzX<_*AW{);&d10A?H=z!ikq@gR+V^^fr*#Xh9MFL;9c0eL2 z{h-G-%G-l7QugS#Gs|t_)NW7-lRZrAEsY>ur#UwN0|5pCiePTF2B!&0#V5>0d?sR+1QnKu>o|?ZMDfqPDkVE4uHnitvLGXjc?qlG^v-u z2dYjyG#27!qacN;ZH9gDN4`>c7DCmE@kG;Q54qXCgg5`5xh|Rq6!Og|iWzNhyf*d` zw+?6WxT&yv|6F*khiR1EG6-Px#Fxi!m77L*3tFH{bvkQSF4Wv5xHxlAmyt8Dx${%9 zs!(wV1?py_Oqbndrd{=62j+T^j~dC@l9$6B<%P`Esdl__W&?_@%xB`QJawarrD(FI 
z{1VkhZ^H(Ili_CG!ijYMOK_UT9QFisTtky;6+xd;Xds8hbVg!ywW6ff7LkQII&lZH zU}{Y(UUP5giHZf&I`E5w0#+VBlV}L_V9Jy3wRPh?nb3ZS1>(=-oQ<;>Hu1Ct_Zhz8 z#x27_FBfuoPP!E`X!$2$0~qn*l84&YRZKaZr5C^$ccO|*_wzCHXiqZ6w^23b@QDc2d0sqJdzTZfQYeTE zSaL!hWF-jtfnX&HW#4&B2*xq_;GU@S3K0KMW`Ei0t7iT&WlCNPux9 z41bS3qOV918s=j!UNrgX!-J8=m?x+_5Q)C?M*QE)*|>2wq%Pbx}X;B zbl@~kN6~M9TFr8f2qk-5s{UK^ZJlkqP^HBatntsbl>B3Vu>MDwi-0=%cM9@XPbMgt zKXXtT0e2dtxRE7uaNY;~!+GyxBC->=yFHx?8VOx~+c0J`1iL%89O^BBmrw&$n8G1n zSIW3nFL7ybw?Q*$WDoYu>)w$O2&cQbi6!2x(7dKzcejGu2`0UAMy3GCpG%6+dF3m? zsJw719=lv^UUv+MTIv(-Gw(gsW7p1nsH@jc%Y&!-SR#BJs-?GN!MP}^K zX4&`IYH=ckJu}vtDpMTb)nNH7xwW}#0W+Py;k_V?Nd&M3x;er#&0)Hc6BBWzdMVYI)t~! zvNf)#7vEPqkbuy7P&ilj{K`910hJun5VVy^bZ`B1cQ85hMEv48W%1`_Y!5YX#8_h1 zPR)-%TYXZmElF!r;G(kf2{iFx>T2-LOz^S8-b0S^U!e6F$<~PKsj}3LF5;>IrHwj4 zKnyowk|Lkm^sNa2eQWPu2STcfTZh*{ef#q?NE#EGSeJae&f<2^VRFHmDjinm_p7a> z#=*3LM`(m30d4TJR{VI6H-*!6ekqZVC!y3bigDy-l`_O z33p1~FG$p8H0|$2b|RT&dJWU#*WKb>x_54&+5nXMC3SkUc!q{HGuUL77@jRvA`k8RO-!((jdrLXx1(VyRtQZje=j~9u3bz zKK!Gx?C|_;^s_df@>&Cqgm{`zPcB6<_ikG%;|Nwm|1vPeVAc)Lf;z~8f_4|mGSMnz zHe2%?pV85I5lN>ZA$CTeCGAgRET$16FoAGu+F3+D473~ynldfw4(n7tp2|$|+thB# zC*B9Z5f&6NJ-k+aUU4c(xAnf;yZueiFXOjZc=US9B2dJYY?k-Y*D|D%u?53ssOm{U zTYe^Wt@7v;hQ;$S26C6J!Hj zyz(}UWeCQm78W(6SnOu3LSY%&+Bz;%{z|%YKP^MCc#x-T!CLewK^Rn;-_j+=AL%E> z)Z`l2Ck3)lh54~FSA!#=T42aCLMWzWJBMIgxMShAsZ#ONrAT1w5yJT!G1Ryl3vrFcO&sSpttVoX7QfMP9*|Jj+sB7*S*HY*aZ|O30 z)6*6`4T<^``d+5sTfGM7uL~3*4x$*2!Jy>~+Uub71rgA_Z0Bi|T*rvy)A+~lp!CPTQwGYK#6E8z+lhxEhr#qpBi|j`R1ke~zDT<&&R0VCR~7(= zurkyj1ir&xgyKPrY=KU(EVDXQQPj%~0B$zFmQI;dhw}MMU<_ldD>vD!XQ1ORlC9ZS zhT_g^6_7sh|VD>_;*8Q!IFvysG!Prtm%Y!${?L1CY;QNqpk!nCV%w8*Ekjf!MTvNy@8=MHvSJorNJ zfddN`dgcy(W3jD>)yER}zX>C?fjz1rE-PG(0^Ja{ek!sP~M& zS2~4(h_;W08^uzO8h#)BJY}OzfGY@E(VmN|pnQd?L0_Wc>8cN4D)jL2M!aKKG~0() zCrRSc5$zoUcq3wp(fKDs49Cs7ZHBqi0>Z*M)4JOAWIg2@#tKp2> zG=(jS&VCf`Iasx=A~ng?sr!=)l5_7Q?pdR^o3TXB#<{OLBBA|`fl;0dCp4a8QYA7! 
zVF>!G%bk(cMcrrcM5tD2j%%tM?u+4eGw9bWm%CnNx3J{?I1N00ygrj}Bu>1)XxGMN zirAi83BrXyw+!=4r1(IFL1!5EwTu+~;zsl0*&lj8!r`sJIg%(u<|LP3(FLOwdh6Mb z8e>V8LEJiAq$%;5a-N+ekkI*h*VS2-(e^T~$1lF+W~-v~Jt-jMx&4)O00~6EqV}ts z_Pu-sFQ{Wp_l&h@NJq20xLZcyxjoUir|8KqPeWk5uylU+vGYCO35RC$eZO&$GqGa+ zU^+_0KnVVFGvIkLAxwR)&|~4{feqO9D3h=p|3zEokuN8?c5lV6h{&vpWTIJ&`ZRG?n4f!7xq3pmX(x;2^0F=EX}Pm<>K4@R1iVQ|I;n(>D%rq_#}TLQpU)!z zLt=XEMUnT#e)F5}Ctv!&MIPWKafSN$q=d7z+r6dL#e?$`&oxEt@fs1~*@z7eH9&T8R;~ zplPSPFVbdvm4-(zS|{qNfJ@HpUe80^Y!@T7EvVlJnjb>=#&$01CLH0AyB1Gv-;;k} zJUq!COSR#_Nq!Sdc}qEkfA^=9@e~dj+T+xD$>d{_MQp zFJ~xU)m~r%b|Ito2GP+I6U|XQyU&a^o@GMx=47;kU#*bBv%H*Z?#h)qH)M}niPaCa zoS$nZ2C~Hv3^J=P2%YepvviTTzeq*eGrD`Vth~z&H$KT z?g&B@Q}p9ZeHR!78lRxb2UZF&mSh^WcISde&%D5-g5c!|d|_T`1DeL|6BVA-5-r8; zuWj_%TpXIZUz*=C9lZ5DbnF`>$}azG&jn>OozeHQ8dMrC5DPr+zXE0K7(xH23fTf= zGdXAVl$x8H=i7z8lBI_fpBoTQoV4{E@-Kmisk6d3?oE(CaI2sCvfN5}aT@4AwzII<2cLW_W#Z&B{T6}2>m@5_{aClQ>>>Ty{Wp@ ztfjJzUlS7F6~D7~8F&1U12Wl+iQ0Q@%#S+3jF;L4$>B{ypIH%GTvUuaeSiOqPwEP7 zYg6tOitdr~35(u6HcIXM-uh{>1u1c*l#Q}65|q_ZauxeSTTajgrzk|d3*X|8U*S)& z+PGeA{+_nC`E?A%jP3Ih>6+G=_l2)7RRrQSIyqGtIqNqCesDDHqX`*>p&6 zleVx$EVmGekSoA$*2`?yfA|W1k@<~bu=YX8yil`>R*2QCv+1G^2Qg=fn3F_GoU|o{ zs5QSDc-wb$cJ3VIgw(w_`DomWf~(~f+H6iNv)efLO@=U{ z<+J@OJxaZ_21X=WK}$JH$1qqsx{|t*QNZ28yzsH4-KlzMG#VGn2D+p$qreEwOXf$r zWIp05H%@eXeiv1XI}D*jj}wW7GzA0d)2UNkHH_;2$yq zA?g0sh%Ci}0m%*VYD2>yXbM*Q=rzRq7r!W(lv^Am5Tk=fPDsSSS=kj{Vz16FxJESt zr+GR!@qtS6v$fpDnpqzqjI$(M_2|W8F>7zo1KU`doaj)_*)9oCRAZj3Dc6Gdk##1%cYGT~$Pvm?9!iJMSAR>_Fn@y@Ru~)9Zv0t10OICciiv{iJh0PY>^aYNq z42Z4*P8fIp5=FUiR6=DHs*bxbLy&R-GXO1$RQBuh-aQUrtLZQchw&E}-0wZZxyJuT0d}q>gg7WJ096vdd2~-!UY)1}0n4{ch z?aMRLVqaqR(pMfRU9>C|HwsUMCy`uR^s_@K=`)n|GR-51q)uT{XK2LugI410hlS_k zN%Q8BI_YK{x|jRz4qIU^lrB=czvrmPGhMp>VBxt1|6}zEp|^*4V%}^+d4FfiP7e_a zyQq8zkR|}0_SX|0(vBYLq&Z1>uU${4d*z~Jzra_>LXaYbeA?n15y-VO$|ck+@7JCt zXf5b!CRU)+*oy1y#i7e|hRrOyy02|`-`%BHCR$tb)oyuW*F8 z>WZtCy;F7`4`gum5QN&UbVG-%d`~wA-ic%uWU#iTvk*h{#5mbSw1B)I7}z9JQdHWH 
zLH|evc%utS9I^GH(}^5|AgT(Qvak=YCTj#L$!=i5+H(?<)eERIKHE61uz_}gefnpt zYFo^qlikS{gV%1*g1k`L%bYlBtbujQS77jOf%z4bPu!3=^u&cN%qAk{mTia*wQbBU zb_x$QygpQ)n(3(j$|qRmS3m3+@U5Ai{iK;ftdfS|{DwKipZe8jbMH zT7G$l9^di%65}>Bg-`!9!>aGH#o5_mTtDKobgIv@)9QO+XKxHGfyP+B5T+ZDH>x|e zmlXwU0!S{IDwKD5%E|`;ve%G;>+t>%4JEC&1tOhxF5}S)#TvQ4?gUPvSuT|FY;Jx!U9cokx$*F*Xe@i+|h(X zwfs!tYmZ;U-Ilj3l5+T&C?7qTC*pF23yaaKX9LYNo9NmPou_(-!lOOR)tR55wP424 zfYV+<^!pSM6vb$bn^&4q{ddKXBDJ_xUm~R98AMP-GO71vIhC|9iaQW^B(2!qZ!yLY znzV!m`$o3R^tR#=2cHn*+ul2teXR9DVH}X|Bz0-bI!1fBXKv$W>|MHU{=%_L?3bW| zQmJA@>LRkv=m(3@ojJNKiZY#fo8J?@mI7aJq=aRojkx_rbyVxzdNuO4L^Ag15+`Gx z(KVYc^B>;b;^;SYt{<({4KWnfawGb2Xb=4D^5^pI5g>YMrYn_=rUnZ^KOn#xR+9Lm{NqDTevEDmU#i2}v`bKk%m7$^e# za))=u0MtkWrjZz-m#X~{IR>*)a%6Ku8OUi}Yv+VFUk>uPpY){%7I9QzF&p%ep>N|W z*UJ}^>kC8ALa=Xd0|7JLDd|0EvYrY0pi^3R;X+b{|$k&Jfy+zA@8ErMThBy1( z$)DW(^wCd}s_}#?Xtwc7M@l(!Y-h240WZ#X`&WpQ*I90ulg&quM^aKya%-Nz5Hte9 z#jY$jm7wHKzF}330?qG7!0%5?bNVYv^v$q+10S9?&KNy@xCLr5IdwcKh;a2f#yECe z3|~K@f@C(iSozt|5BWt%tamE~zj{P$AChJij&vQ`DVgp}48kAc2EMW7yF(zu|IEsV z;dCQdGyy|#dE&;hJx8~dp0J1N34L3b>Xtqjb3yFql(7_Y8#QlMP%D3T>Z=+yl|$-d zD{$6Si*0N)obWhj+8nfVH)cju(VSo(d*@pkh46yNfvg?`g{%$+GcS*R6_vydC+#AJ zH!UN2X`im$aN2J{DJboKg~G-u-oQPKDhSp8h;@VNx0b9h=^BQd3LCkHb^O*ZbW#!m zuvZuIP+6oqKIdoi_7yGxN`03!x7p_H74_!OEu~13i5fh9hEBp(4;4nYYv|^B?%;BZ zJLa@o_T;E{1I2^5{wmSf6*GR@Cqo5|6lYax%f~x2q-1+pf%ttkZzEv8Y_w1L@Nm?T zR$dbFVJ#t9it@S4GPR7l+0co?s8cyo`WGlf(wkV{##uL9bfinEGb;J6-=j_4$StU{ z!A#g!@(Hgrc`%}8`I)c){xbtj#5L!QPyXBOzgLaQcMUBe%5L_FIUJYOWUZR+S2@D{ z2~K->rzv;4iE1-27ipp#5Ho8C3NF3px;pJNYuO^n%D&DUR+w5QJi)+nsAjLRZ5!J1FB~&yB{n;h`cfByIgh2d| zDi|W1xD+#FUG9QU z%rVJpcD+@NciDV3;29wHXVIk4J_r0H?_Hn)1WBAy61L%h^;a>0?LR) z-)Re^B|Ers#L*sYhm_Ip5xp8E4*7{3T~qi$*Hglre4B;?-CH2{4%IOc5tNx}loJI) z-ul42j#C305BS1YLF7ncd}tgbnj2PusZ4q>D9heB00*2o$?=R!+U&4=ZRwH250g-vz(rWcKUtytz^d@l;8|e|y^mJ&Nr`<^ zgVwj24OJMVuYQCDWjsjrN<^I^M?oCEBIY_Xdx4>;JD+pfhEZ(ahhk&y+P{r~(?Aba z`zO?Rm82N~A7gnRg$$UbgW=kFUy|k<;@M;t#^4kn{57(pp^NQMVe}VJz8f9m9v3mY 
z6Fmwq$#kR^eIPW$7vhfT>?Kmm&d>`e7x^@a~ zxElQ-5b>C|I3+gaC6noXlf#kmI*Ca;pA)l^J5(5l9fm$Z9ulNREoGS4_)di&bt-C7 zMM9rlW~)#}&xZa9Ltp&~#o?)hT!dz>4EEy*Fg8ST0g3w_I+(+USB~BHX7&^*Ml5K{ zy5@%7xHg(EFa_&0MoV2X^yER-(sTLXxyVfq0}N(;!C}EHDB8yK{^|#PkDJ)cLx4fv zb;M)P)g?u6n{yq*@1i^gPqo=(Dad^#9=2`XI3{@JX)U7s6#r ziGrp{=2X}hj*ros&baY62CG*-H-KDhjNEuBr%Ir5pd_%_h6D{CU)HxGgCpRxkMF!Z zvwUSA7+U?;QElO74EH z$$+^?l3(3m^;8~z9?v)L6A#AIkuW|Qy7P*?ZLU@h^jcYd8{;P_g390>2|u#@c;A}_ z^()mr)nKwZaX8+w%ZAogk%@8k+&BnqO`)^BozHU3e^cG`s@s5|8M8>gg zRyN+juvTW5K*z8lGnbxu#qd1xUu8=Z-jX^bP#eagf)Pq)02!bx<{X6hmR9M9Fi z#ZDhkq0^7U5{RAx$Z2U4~N_u#U)#`dB2Q*K@at zA`RhnOtGX$_zC}zn&+HG^P#2#2FXdjCIHF{HhbJK-|;8mX(*|w zOXd;H6`Vzv>(?L#mfy%tr1CdKv4b0uHLH@Z8DkLH12Z&@Rvq6SRFzC(Xib-aO{n7g z`|dW1frwEtd0oW!4BF7$+4!B=+aX!4$SgC2B^b*l5`rGsJrG*_lG~0Hi~rQ?;92Vs zzcqMS(i1QMpS>EXo;S@tw_uv`{OljS`Q~vD-Y4vq5!!8I-W=kQ6n8MtLy37$6!}%*c+xGVc*VyadJEu$LycrKtQ7JAqAG*lBi>uB= zV-oO0BX2mn`$`?F22&f=T@1W>)DBgVI=A0@HA`M+ImXgG(W@O^6<3g$)g5EaNJ?6Y zrwwf@;1iysgh^Q%NY1W@CtYqA>NN(cTU#N{oPA~qp=oO=(QBO*f5CVhgZDy^c2{&g zCA_x8=txDgcy*LiZ?0$cx)7neO@+Ou+I&}JDy;V{Bfj3x)=hV_cPSIy(1i1^n9XNw zn23Ci!c*h0*#t`oeI^vg{4#&{!du443ff4FMg zWTU8y{#K&(h;|klH$H#9QsTM9;x2BfT(t;`^T#Q;kMiDV<9b(OV%O3WZqzH|j@eA* zO!Fip?<#Srs9mUhTdjvp<6BV8;KDnk!9a`1N@OG+H(BJ!wO(_?&923AK+SSGXba%Y zn|)7OZlQFHpSNHQy%oWHb1i$6Mo9D z9g%CtIhOaG=2g*qs#N)r!r*8dbDCiJqstDj2J;LOuj51CTi8eBLIFZv_fMSxVct0# z*~%BZRaM^UcfFVZ?jr~Pr{mYIHMgBT6?s8dm4@Qzpo!tz`}sGLC;C&kKPM}n?peAK zcj3t~ZVaNgBqZ|5sc}g8jDKrxg(O{Wv^^VvW!2|H^db~%F$aof?PhmMjPV0+r(Zq& z8d<%TQr{#Zhtql!V#K%Jd%a-idTg$e?CG+#`keCXJZH}7cH+Z=7!LB#$M+vNsu$|T zpV@eS%t8{qv>CV5|Gw|BOG&u+C6lg{fJxX%JyQg~8?TRv{mU<%HYEpsw~TscU+t<& zWfJvia|i(B>7z1fCTA3Q^ovW2112|1&W1Zy-FwiK`$k>xl*6Kj zmP`|)dE<@rCeKAAp^mF5PWelS^3d!DPvHU2w_m5e z9oM^Y=HA(3-eY`TW244kfCpt|+Tq4Ic+4nZe6iOye%E_5$UCzp+gTCJ^E|x9dTTJX zZAJ6*;X3%U^Q0pgR_ABe*@3i9_vk?N+6`6f;u_wV_*>frXK&B=<@MH_3Z3ook|{SR z^y+C-td0Nd=qBVo;}Y1cY-GJ|V?AHdhQ=y~Dq|I2Pl5013 zwiw{aMPAFhQhJ(J0#~*9#yJYu3`6a;jugiKIQ-_e?T&FcU#ORX5nfWe`>o*SE5Q%h 
zgtKTr+7}bbFNu0DOc1qPO1RRJhnc}e2ei4?emfn-r9|J=b9HiBdLwipVif*tK}*K* z0IRi;WUpB91?jbq;55lQ?RHSlE1BzJgXrDqZWzYAm9?f{rMAg({EXE0bAs;8)mixT zW#Zi$pLy}4Z}f<6*Dqo36a|AHClSM}BO3JBKfKS6L2Wdf^GP@7pa%BkJM`{r4k4Bu zWwPwJj5>0Vh{qti?cHcgJ zF7ZeX+#>csqq)*lJ+}jfiQOfIZ*lmcZ8w88Kj-kWdo$cNEVg2>)>vWQZiU!6wTO3f zTd+d~n%_xM$vuzc@$Eb4)q~?{WoDUWt!k1la)pls5^phm_W1GINQ&RyB2qjKc6iKO zT!;n1&f0QAPp9c|yj#z7Op+A~Q^u@!#E#wE2_`?+1SP8{8OyAsfNHs#Qq3acA9-}z zx*b#7f-g3D>|;3>pq@4olgmc|+m%k9i&FGTk;; zU?Xj<6i$jR+?O)7vP8Ux3u9hh>rbI`T6^}0yV68oW23mFK)@tjjlCwocI=dW=_{}1 z)^iPf)~dT@LQgu;R(SMsNM=49^|AUPf8-~iE!60GIo5U!R=?R9O9D0b5-ebIdb@&t zV@Vt?r5am^k8-B!%FZW*o^<^@fq6F_mR0qwT{}bh#Y9s0B7bg9o2WN4^N{ZEJlYdx z6Fq8^Z*T@eH3#2#P;ybnH5$B_IHTw*@6)ek|AQ*|RH*jIyy^1Pbdw^ty&5&A)V}mH z)P^r6=H^9crF>pAc7No#S*7QViYU1r79y%6tsSDK6)f60KIrCU++|sq{fOO~IZcC^ z7cF*rbKPrco$+nHpj&n1eSO|>@_q&lyUID8mTiAoNh}d2{!z62tpGtXS?}wC69b+$ zDvLbdtmP^+dG6f~LBEW{J84lEibOfZ^5iScly|-`8OSj_Z8?C~x+L_LTUzD@&l#hQ zQ7yUOez1ADzf{~QHrguAwEAv-_~P{Zz?xi;>+kNF%W$QT?3Z!-{IVs~jwIsTw;}3% z{6HSwWmx<)+nXm0Y{8DYZ@0#-?GG`%+~8TN^|)N7fc{6p4W8XG;hix}Y2%cXFu(^6C%I4PIHG^-^dgmKCjpr@dW0Qqg11rFl`lJVnRnM*{F>-iG1o% z=vQ`kOIqRZE$&Vos@uMlZYEp$==9){r9>J#{>$rw2Ocaa`EoqG9{rVT~_ET-9VLN5+3%W51Uluyj zXhrLg`{=iVKDQzBTs*gS{@vy!Zg%Rx`fu|4qIPcV)-grayxD$ZegPSi9egxsxA<&w zgg-Lcpx^nW%b8HHqg)+`%Ph?|sEqus$u*ImugI}^&NFa41UokV~nhrUY;fhnaT0F{&g@O6)uGX!~znx}t8^bJZRWC_yaNq6hGYcs2lo*>n zDwoTcc0g*)XzbL)6)D)ES}dAiW<<($pQGM5CV;1&R&2w=4B$?=1m$Y~-VvXB9j(!7 zZv*?TvASeWR&X|+VU(9x5w)&4+i~`_?Wxcqg;-dbZ&;^vmg|RA{@>qJjh=J%Lo)$) zGuMl|`zrEx?p`|f`s2t*-1NnsV_7A`=O-+b^%|{7)vlJA>A9Df9y)%UVLp6`(;0LA z(HW*b{qt6QHk0Cd&xGSX>$Dmx@xa%KAF>|_JWt%QQbOe98XNuisx{g+bx!Yk!8^^9 z_A2?-HPt`vUeZF`e`RFD!gO`kMVO`9`u#;`8Vx={bB_b3808b6B6(!(xL)Vl^R6F3Mv8$p(q3dL=vh3QUnYMN)ZJM ziVC3$NDW9!0BH)5-h%WN2$0a;+IY^n=iK|Hx<^|zk-M(ZGMAEywy6C4{`t1gw;m98oFm6DQJ15clF8}SLD?-<_u2jU8rAQqPY z!dU=;E!FQms)G>EbX>FJ#^&)9ZMl&TbsT6 zU}mchKU%f{D`RgtI@1U9Tf7lPI$@M`E(!XxZ!&#+uZ8ulj`>r&M*_fz^j%{#KN<uX{n1h~#^ 
z(_H;*vnyp#?hKebrTKEFrS-0q#MWK%G*T#<&cBkoJ&~~3*xibZrm>5!m$-eH-U4$* z^~?FJd_TtZ_#*S|5lQ#9HPrHR$={+~Sz)VT&P|)%4e=|6Op-WNPP6$w$ogLCuFXtR zHYahNa8iYx{Gy5R=Sc@nW{mfC#@iJ1ZvHFecH+~LZ+MCOsfX>eV%S%F_Rrj|j91CU zV7itA#V~rvwh$OK-wjP}(F@_!T&>E!tm}KREoGLJtG#`MGd28V?xLAYjm;mp^^A5mwVfl<(Y@WUOxLBt@*`oXgEAR4xB9 zdT(`Rw`3bOe33mlwK}QeUtcar{6_pvmyBhVzBN}p+p<$^x^Ikv{&D-Fw@=q293VB4uU?($QoWdSc`k>YZG`3%Vo$ii3bIJ-qlME^vaUTW z_UJrsQo$$1+oWvY_8-0}{whkJvmY&c^2? zhDBA-%=Ea&A8cSked)pOa)Zfc?J)0^s6Wd2`O^2U#fh>QlW1hSodwgb>I3OZ^%~T^ z=kabsW9fvFWVQlrc-J-Vq88`lJy)a?t~$M`8(O|KK(hC++RyAhB{mj>FhP-G})IF!-p}FzWn&&MZJs9(0-?7Sa7bDl6 zW9Y`WZ3!gSv3xVVo(Adh_g`PYiPYOZUysM#eVNFr1i%R=inHpraoJhuX=8>9B9++V zeAFxknl$}tQdY0|B%HF6pW0x>Qpw8~2rsl}A>njk;1}Xa(s#4e!G=)VQO)?keAwE%~%ZGU~G_amR~F3PGwp616z6I=HQL5 z9i&%>UdEk|5elf_7FyWNG3{o9;LT6o;H=NlNGu;Bst&bN7_(-f-!y7s6??yymcKl> z&F^;O9ILgWVzX|u@fTMIhKr#EoOq^Yb3N@zPSM2;94iTuuy-;3Tv?Q&OV7lnUk;cY zfBQx^h&qqNov1IUPev=_#F! z5PVTJEnP#(m>YfCo@q~0*XuqmbS9c6iz6Bk-OVOQRCOu5SbIA$MD_BJwDrm<>C-f1 z^pH6`eWc7FkNwPzfr#aeGSbqzJoMJ-aSWrvt-YNeKggm{c5Nyv&mY=u&Uc15IO+U0 zu_Z7>hW$;aP)kaOY-M!D%x`rmyGLakWcfnfgZC%H8Rt{f9ckoAoljaaS5vul z?>Cnp+Dk`}FAP6Odb+%O;FO4DHGl*Hg zq_LLlG#8s+SR{pfI#b^RupFN>CfB6h)vrlp9G^eoI;~7K_Q?p=>&+^!T`ToE}sDZzd3fvoGUncwQ?GB)}X)3$dFqi9luakUbM zEd=tCP0r|6p3W2jr8_0QrQ4;gI}8#?zquOkm0QZZ_mv?q z-ySliQMv)P+W&xmzP0{Gw@1cF`!T!T%=xEpuX`Gvp3zy|WR?`9ip8Q_rjq%o^wGlJ zsiAGW-`QJNc5d`;tM4plv5_f{`nH*_i1f#4SzM&tl~b4ddOX^?^k6=hEUasIq17`% zb?u3|=QJntN^~+s?ZCi6#_&5!%dyNSnd+Kv6X&xK#J16E@sVewH9p-c$(N(mI3U}x zInm#`Er~SrVWtYOrrNP<37w&u24vTdj#Re@C3hZtPMsSg+WqR-=jcq|9&`enm?#Mo z^G;i`2ne zZmZEQogY+{eOlA$GkD-ya5Rd8=kqY*7=}n9@q>6E)zi_OhLd$Th(GhgBea^mEQ>?G zZ$Oq@tG!a9C57ky{D}4RBGbI)ikLF4>VWUPN2n4LF^$F6=tPTVzCDw)(X8*x_JLz! 
zG>bfShKN5`z5#C zk$rXPm`X{!g!SFtw}xLE@_l)$kGYjF)I#P0t;hfoZ;wBHV*T66A#{g3*Aw6gg-R!Li=1%Y z5!{CGB!R50eb$4aPtg=pHKl3UT#Tl4KK$n~i%%`tn783A7b%7^T#He&yn(rI2d8|G z>|}*3IxsqWdd1kg$Kp&YN<*$KIHu^4Ec9<5 zWGm<<$YL-l2F7M(9j)^bc7X%U!BU(fKkWolU8B6+wGgprvX|sF5<~AOJb&0eXYlR( z-B6Ut0+a<(a~yeZ-bU(uBfc_xQ%b0__onwl1Ne;__C?y?ueLb|6lj~$i!Zz0l@fNT zk|tV@-Ym3Uu&25VU8%I{@EY1rFSd{*PTYT4Vp4zY*k?q|B1f}e{tC-3MOMnJE6PW> z)nvG>S*WVS&Xk-T@tc$HYfVR^=!smue3O(6;n&Z*Gp_PeP4e12na#Jmj}gL?0_D|j zv@SoAe;=m1j}hy9)yCieOk%N(i=H>dsv_xT%|HAxXBTh8e5)ny)L~g}mC_jCXCJ@( zZBH@9H-+7!axyg~F_`m5sJz3hiQ~1nx{~(2UdOavB^Sh-s#uEmdzRQs@xK<>h7o&L z+UD%qoPsem)YN{GKt$|xFtm4h8M`TNQdK7B6He2Cwh43oX4El)j9UWd~TMUMF;xJWojki`CB0JQ;gYdzSr>)KVTpZ`SeaWaEv7 z5|*6gk&C@VlxFhO+XkldGRCmiLUrxChZo_M#@=}5NM3aD_lM~Y>2NoN0snd5P*nGq zTR~!8gtCT8o?3w_Gp40q`d3z=?(?+ttK^+>yMV@6cUBl??#o{JXdTEE#k&1ShxEHw{F5M@-4 zFh<`T@8zYoNDsc)U>vE!&oOS3@$6M_@3|AZBw54KzXw~6&&eBLV#KrS%7wMsEr>E~ zA5r4qKo(@nE9ya(}k8(CWaf)c!6B+ zBECH7I3gd)2>pyCyrPuxc~|Id_c7pr(N7kSX&R|q8F8#N8#0r6$gR`9UlCC}CzJcs z!!of$we?51y`v0~dX1lo+KqUvWZ&VQ@`!L`(oRGt!Dl`@Q6tq>cTysU(i&5c9w^$p zw`3NEgQ?M*+`y=5Hh-yO}R=Wsc z*oXMzJV?Ao`?n!kvPaNiQo@|TRbGjQn?sESb&Wh)<{8axPYWrVz z35uD-7girg#0-R$npM5@B@6nFel@PM=B0W+fMswVM_JA**y((iVE;(v3r1;oNM#w! 
z?(!@*>#E#^_PQU8X$#QG*mR+NrD7=r-Ls@|R`ct82pXpJT*gQ;t|-?WVJTRPy$oAD z;m4f0vw?M2n{iEX^5*8fr#=A@Ep$}>hcdD}XUGImD9A$3Y(RV{__DJ*_j2)Eq&Ao> z`)OOrcdm0Kv(PLE&dzq+M2ipW^HZ`(7X!v@gk?N6&5cY73N27e^j^8&bNR9<7EIDw zE*&V}bN2wJVj3n})U3?kygj;xE#i9$Ht~eS_Z?lxDSrP!OC8rt&<2ftgz?Wz&e4|f z>V`S9IVYt{WLq8g*%KSpUPzke>*})=5YXL^^*4-iYCh*=^ za>RLH_q@qR<{GyGb1_!9_YgIS4Ma^|QQ1zoKBJ%s-o|!b6g#f#nwJ8%4)~Gr6+fCs zD=thf@(iKho)uz^3*neeB6q+W8l(PxGMtOx9atIIm2^m#!H4(S&as%Eb-G{F0k^W| zwQt&XGlv8*WvXuPv@Lhh5p+0DQ6%*deN>!(n3xJTR*V0%qg{C0#i=`8g&GtvT5hNC z0)(2bBzbkC@P=E*G6XzBgjW=hJ0#wBg=z3A68WsoP_GUsFn9HYH?_3x_>n8wwospqIch~aq%RY{t}U1h5%ge{8j-K+ z{RY}VLAb4B!01AGVfRKm{S^bb5de=ZV7AwEC^KZQ4S= zf;+?0@0)T|y)Fuqxf1NR7z~yWYas<6J7(boRGs>YTaPjVc|4*Cd9<{P2Z`xIMfav;^g)b3sVu=+|~3jXUe$UV#L3GTU8 zNB(+Q-%8apHnFH}2hny|5uUF3tbvE%T^7F+i|NPqeG0P_Dl*{gE~Y5V?5R_CcP-p( zuWxS4ZSrR#jiWbVt2!2C8^MhQPa?b47s(NnjP4`ZRF$gfR1KG=tb*Yn2!`7~hTP(9 zIPsP|PjsZg$JJRdrSC!AYzg6b#B;^6BWL$d@cN*PDh^)Qc@r&JaF~AWVF-tQ!Kf(} z+4kgURHN>8NjF~85cAB#U*@A4Ov(!Inca3^EetJ>{o6^JOVA|4O%H6;(1Gz&mFYy0 zxs0829zHY=(N)B@5h>}TWm3k9)a*xsSHnIFz(ZeA?RQus;W2020k9? zPMKheY$1fJT-Hi?g%Mw#Y7ldLDJ(SF$S;z1*#IVn#ZY~G+e6T1+U3;zkc+wt`Br+! 
z*L&m?JjAh!7u$4MWAo8ItIBRF#Tf3%K9)SYOR9FF*)kXN%u>CqAc zMk^KFGpeJtkXh4S(8_UMZ7-b6mj{N?wyWQj5vT0$RJKIfb=`{~p|bmVj^T3LAtagRzIIr0}{S%&VcOCB9(OnXAV#vU1 zQ)rCMtKcb$jdCT^WFjyBkZ^lVp#ZjA-=#qx;nkq&yHp^+-e1+ReR!V5&gnZTrnU-+ z1pz&;^`EaomBbGkEiJRX8w`Fl%a8{+@6IHgow&J$L*bV?{eG40!5@Q#w1W@Puz z#vRYh&9Jv$?eeqepSdJ7*EBS!44kJ>%qJX7xbxa#r0@Ktp1Y*FVoL8&`%_IuYgSCb zv^&NR&11{mEEt5+^xULHI}nW0?7!Lhz)(2e`PPvmD3^(7GCpz^ZC_Z`UnncXCt-|aNSyfW3Q)~uFW0|@T0Ak?HuU~qn!>)CJQ;z`tD+j>QQv&TR0 z>mV#ME{D*!Kkiik!GWHzks~0GUy^0tBNXtiBI=|S;=>W^mtbHBZR)ldMXF9SvUsW0 zxvi_gC?^ycZYU8v$RX;MJPW0p?4oX*L2RVze&j<|olbhz5LqVOWVYqjL4pibDs=x% z+b``VI3>yIq1yiE`eLEZ!i4ZAx(G9=u~mAm?)CUZoOx7-13wlg?=Dp?^Tp1*;ohW_ z(8O~R4P^4q?0M1q?e3FC#p4fOc|J=`8|z5ZVAdp5snDW!+9^p5?emIcQSD@^_})3I zo2`VS^ydjJUeg zZ4Z_AVzZC=wdIl`JGJDt`r;VftSR{yJ!zKY+=9jH>T}TKGZ>VrBg9t_x~>>`#uh7&*&wr83*fZoVhO+eQAuwo zphzm<5Cb$Q5S5m@6X4PxwfB{E@Ipv^u;F(QDot~bWc~mFT>JOmet>d1#&4Q%4cAse z{+R=u|KY-)&zQ&O>~)bB_7T>X+c1_iwu3xXr#2#eCiYeJ`*3|khrS5GaSQ99whL*A zy>d_z=TqOhB;``zKAAtYuwRGhbOw?5ZJ~LnwjlCW*c@0wp~rU-S}Z2ug(mZZv1qP? 
zmYqD!-falgYqa3h^EL8;cb6owV`X38RT>9}_-jMmlb7m?yFYUJx8FvSBOGO%$lu2E z2;@Ptl0j)L7~#6?W|(e0s<|nE>nfqsWhc;9?jTHG%GW)j!{vU`*vwQghWI2|Av#Oq zlKB!Y-U4v1(xFac%kSC74{(4=uISkUF z;|t+ZCqr_o@=_W{0AiD=+&Bc6>w^&UlYo zi+e3LiLW%3M-Qw_t_+(na*R}*zG674XC!%nC3Qi}%xFZo4LJ}0X)xRI*5u1q{c7N7%S zB+D2RglYT}4+-K9k>Ef4!hK$9SK7My*+!-x0VPhJb&-my)7r{1#1#DBh!1F(nbIId2fgx_evd~Z^HY|^#%J52lK#;o=@GB^V=nv-@gf; z3L-yv!p9Ha@KR@Iz#S+;rViZ+ct%%1usVcrbntvYrTg`a-t#*eHZPlAG&)c^W8nMU ztas8G9hP5Ax&IL3t)JW7Ze(sdEOBBt7Vo$#XL%*%r-!jmR~UB$EDdhFF!@&Q$Q-YN z40s9Bmq6jK^OeV-?H*orCwRZDCFvD>oXN_yk0-LL-Cy#)aTw7cWD=WTAI)c5^$=zN{(=Y zPO%-niIkEP4b8TLbUx{exk180iQg*^(aCE|WIrsk#Z+8#WyDwur-u}RYu4oDESYt9 zI|z$ixDZUp)zZPA7s_Sj3{KI&3K*KpU*wm+e3c<$&W!clCm^p0E_8=+!!Qw!8lGu# z{g>w%UgTWI^*~vE$c%i<_oMmv23S(sJcB$UKDq*B*kNF0PQT^pN_pRzzW9=ncrjnP zfiHdfWd+JRRn7{3@rvcx($7y-F*FOWiFTs|HRp9XD7o?-TrG0~YQJ8~pRRDT%=HEg zXSs@l*B^Ii35O2Z(icI%%A*QMOiNsf3#f$x64H1?!Uj-eY zAPG#+2{K|pDf;G^R9r`~JAl8wVhzL51rv7O!RsRXFCEb0-|xEH4;~r`buGsY#EIQ^ zg&p;$W2MSp9!HjaFZ*VLIB|F7J!^=>6%g}TM51-%WECaw zwmkCg^wuu$XLS}x={uD3OdcMrUR z9bT|^mPHHk3$Hm5{vnJ$NUBg3{glTCyfqTRt`vn^cY=tj1FMKt@ktWjJr!nTupGd> zdR+TTedL3^I?o?}y6U;i28)*5?6CBSY&8t3&$(HGGu4!Ut&o1lJOxB*?!k6987_)D#8~&8XG*0P{%6j;EO4-1cPoP z^F={RQL5eSP9GG5Qr)d>@xsSI?C1@_fkDbLNYR1XB~}d5zxuGy8`e0z1wWCrBIrf& zDr&>I50vB&qj+qCxN+!}8G8#K#DN47Dc&Oi+wyALBmIrGkF6s2#vKN2!*gaxm4AQz z`><_=C*9B&lys^zFos_mp7NcZy0wD`%f+EDZ9{~q4iPw-5P{)@QXg`ct z7YsJGIAY8>yBslHGZX;QcNl<(-4DX4j97MV?kWw`GwUFu=ajp#15|WsY-7Mfc;FyN z0cPdo^rR?FMIK!Q8*g}HvHg1-XN9T7#j$ED#>@JyW8GhmD2>4~{O5BWW3Gr->X_4I^%b6;Q|iv5ksgZi)8VkI z;3=O6bMAzeVd~yqEy0PNL^2|ek=p{ekBhkF6FAS$Ul1&j6Ff?M5-k(iDTd<(7!q(K z1Ew_`_S)WURi8Kj#s^55pu}+c=AX z{z~o|R9Z$63t=`r5R{f6#@mTl>L5HoNcK1^@(`5`=_{*8yg~B zRb36O%#>P@0iO6(OH1s*eL#zjl&pR4uH~F?t^XZZ2Jztghkrdin*}>y+UcNhk#)I zG_R)pw+;3uAcmmYUyr(UFqlN6{%q-mCk6?bYM>;0R2*vbtG_oN*89Mng#|{fyq53$ z_W-DUO}|4O_Y2%%K(F^J26(>!33*VY1W5y0wvc^U@B802)L$FppI?K6|T@-=0*ogK)hUcoKofj+o*v15hpbhEk|LT(by7K3IAYqh|Yuv7Y+< 
z8Vi!vRrm8^3V>dqzF&supvFb$1Xcln?{0A8;6DF)4E(#!{cA`6dMyLdg(s%UN&rte z_53bUd*1^UTHl5s{D%VAvfssc!+mNbkwn5P0a35w)k@DlSjFcclvB66H^AWkgWD-z zRy8y8fYL9&MmnUQ1B8jOqeG2$8nn^+W6dl&Rpov^f(}Ia9;No}2L?Nkun|DimfRy7 z@27%ht^Zw3fmqLr2cQog-vS2&4}am@fBgS%N95l}D6k_nx)cW5=onLA_uvswheZSNw~v`p>>bfm{z_ z-QUJnJ!pnMFgA2;=y%jh;xlELkV6zd2;5ZS$Ez4 z&xh6t;1l-$aT{+o$)kaBL$3ZYbo?J!3*=@XBmm-~yR}Yk?&SOnLRiA@0dP{uXA=0? zhlur|_NNmZ_`mH0|K_V9w*z^Yd-_m9s`4mwZRjrXQ?`yq8h`Lx$RdAte`;_JkVbkm z628HC>o#@h1kl0j5-ZS>e(}e@+r_^h?tk?}P?G>A1_mmF0}}~<9lVrN~NHsqqqjkT^@P* z{V>1!W9dN7juNm5w^!DgjxEbZLNfqEDwJKCzh4b@Fun6bd*Ef`&b1q=hjh-_2;=ig zm?UNUJ4I*+yzr@i6y`!kM8U1!Dn7!q3UdI2SO4^sP;$=vdiq57wfX%zNc}<|uvw2& zl{JicBS6TrRHsvMbm&dV%{rQe!LmaT_XDzu&t^Oz#Y9D2f6TWzn&8_h%#-nIJR2i#BhHfYF=2-{td`|W6 zddrH<2WoL4XCmbDqM=`wNE3;3D=oi%*3r}9e9U=`c}X3Oxv{c=;hocsc!(wGtPMEB zu=YpbvtvT}_9X;$*Qw)E+h7<=p_xq3{RU&m9WYi9VQ-KafXl@clo+eALTs%}$4fYL zL?5&CCHwL_yl&Rk(;qyF$IS7kG@19cSM{Ykx0-MI8ZWzPk1C6>xniBOL>!a4A}lozobB}-jX zB?6BF{6d@!erk>)4+UUa>~vwO!%>yuP!9sL(xP!j{3IAmfR1q0A4*aXD%-SY*+&;L z)t}#_a7=6GS5r4Snu-cC*Tfi<>2ooF)U0ME1IQ`Ye&zTfo$;FqTKs0C_=U(_iLa_t z)frr3tO)uJg7_w(Il#1YcG#J(K6=BY3wq6q0BA@GY3h~{zi(Rj-TH-3TR7FZvG84W zfU$JchRdpKj`I&8Y-`~b@>e^#gz8P`L8YxiN+9I901^b@Og$^dy%LIRf4uNeM~p01 zL$qcA=B%Qv^f=ORP9a%D;)X3HvZBS$yNl-2wevR-`8Y@p89>+11}k|yCnDiO-#+(He6f@h z3_EC?mNhWPpp6T`cj9N2Pf1Ul4D#ibh6Cn-wa@cLZ~g|1ejrLonX$iv8w`#Ckii*0 z^jL@r2lXR+ad#a4k-#z63ds#v4HRy4?vu znZF=sCm`O5gW|IP8TbNZ0}!r1#NdMv*!eF*#=xJFC}bl4AAm0q&qS^gihhU8|8L^D zFFgOhVmN;RUv!Y}`42+=4?+g);XerZpA?V(Amrc4>_2h!e~*v>vHD#*15WoLh|BAe z&abGXO8FhAgVLBCap25VD9rpj+W!wt=bzzvBX)ocg*9r6Xw`^j(WhBD84MX`N=EaPZ1$AD_|r;_wf zSNXSeO8_=I2DDYhZGY$*5PJX*m-~VM(Q!qeah7J|KV=9{7{ z&(tgzgLT|-jGPI7Nc(@udTMHM>mwix`sO8!r|Igk%lI1rxdK&p)*Ni48RcRxSO);@ zAjB)I5|>qVwjx7)zz$I!nyl3ce+az=-M@;fR_kv1xIm{B0~uayqph)J@iL>5&hwd(fv(AIvFe7Pn#Fhbbw)4nF>TbpTpwd>w0LqeCk7s|?-52T1d-e{zkAcj+bU zQPLd|hVp`c@Vu>@op*jtT?Xl1t*L>x517VRa6NKypj8{@2cPM*KvUqg0$cHKlqv9X z1da(8^VShZ5>087ObPgejXaWQkXCF9QaB*xh%QMBufy9>csE7-%(eqWg&BKZ5UD@= 
z)QbWqPdncCLJ)UiRawv;1uapD(~q-R!f`()^tK+}%mLrflv=>WZ?Msp$Dvo0aI+L{ zhu{D?X$$ml?@d+h)Ty^}bLaFo9uY07#|2&yyg^iaz5chIoghQA-WAlnD-B$kXJMPB zP$$s+Q*323kiWV0>#IlRB8vhmDQR3s=JxBX9nmg*B)rHbArZMF5;&a0;bR{IW^TwM zd9C>s;WM=squMZ~WA$Cvcu)4v5nL`5rLPG!fi&N^?MqVl+=$l)P>L~be}8Xb&RSp0 z+}@EeszUHSp&nam#+x!u&FKdcmOT)Kfs#1luypdOD%Kg~$)NsnSUI@%IVYBLlxENl z_!Z@9h?c(C(I4xC$LX+=s88bwb!A2*LhINXF59ND?79>EWWc(Gjvrc3WV|fn)}ZvJ zC3xcoTWsDk-?bild+T4%g)ys3=_nqbd{|xoc K>ZWPm@c&=;xvp3M literal 0 HcmV?d00001 diff --git a/src/lib/device/README.md b/src/lib/device/README.md new file mode 100644 index 000000000..99f83ecc0 --- /dev/null +++ b/src/lib/device/README.md @@ -0,0 +1,7 @@ +# Device Manager + +## HomeStore 4.x Disk Layout +1. max_num_chunks is decided by device size and min_chunk_size which is configurable by HomeStore consumer +2. Super SuperBlk (SSB) is the first meta blk to load Meta Service. All other System Meta Blks are chained together by loading the SSB + +![HomeStore_Disk_Layout](../../../docs/imgs/HomeStore_Disk_Layout2.png) From d8f287b0fcf604c9cbdc5206414b0ac84add9696 Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Tue, 25 Mar 2025 15:37:27 -0700 Subject: [PATCH 077/130] issue: 669 update physical superblk's mgaic and product_name to distinguish homestore 4.x with 1.3 which is massive written in prod (#673) --- conanfile.py | 2 +- src/lib/device/hs_super_blk.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6a6b60aa4..baa9b523b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.4" + version = "6.7.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h index a539c1e56..9d0a3140d 100644 --- a/src/lib/device/hs_super_blk.h +++ b/src/lib/device/hs_super_blk.h @@ -75,7 +75,7 @@ struct disk_attr { }; 
struct first_block_header { - static constexpr const char* PRODUCT_NAME{"OmStore"}; + static constexpr const char* PRODUCT_NAME{"HomeStore4x"}; static constexpr size_t s_product_name_size{64}; static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{4}; @@ -128,7 +128,7 @@ struct first_block { static constexpr uint32_t s_atomic_fb_size{512}; // increase 512 to actual size if in the future first_block // can be larger; static constexpr uint32_t s_io_fb_size{4096}; // This is the size we do IO on, with padding - static constexpr uint32_t HOMESTORE_MAGIC{0xCEEDDEEB}; // Magic written as first bytes on each device + static constexpr uint32_t HOMESTORE_MAGIC{0xABBECDCD}; // Magic written as first bytes on each device public: uint64_t magic{0}; // Header magic expected to be at the top of block From 15f2ecaaf8f8e06c7a485647b3d8fdeb78a409ea Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Mon, 24 Mar 2025 16:04:08 -0700 Subject: [PATCH 078/130] Use an atomic in crash simulator to wait for the crash only if the flip is set and triggered --- src/lib/common/crash_simulator.hpp | 4 ++++ src/lib/index/inplace_btree/wb_cache.cpp | 20 +++++++++++++++++-- .../test_common/homestore_test_common.hpp | 13 ++++++++++-- src/tests/test_index_crash_recovery.cpp | 2 ++ 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp index 788de1eac..e8826b61d 100644 --- a/src/lib/common/crash_simulator.hpp +++ b/src/lib/common/crash_simulator.hpp @@ -42,8 +42,12 @@ class CrashSimulator { } } + bool will_crash() const { return m_will_crash.load(); } + void set_will_crash(bool crash) { m_will_crash.store(crash); } + private: std::function< void(void) > m_restart_cb{nullptr}; + std::atomic m_will_crash{false}; sisl::urcu_scoped_ptr< bool > m_crashed; }; } // namespace homestore diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 3a8e7b00c..1f9563060 100644 --- 
a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -216,39 +216,55 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) { // TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a // bunch of times. + // TODO: Need an API to check if a flip is triggered easilly to avoid the use of several atomics. if (parent_buf && parent_buf->is_meta_buf()) { // Split or merge happening on root if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) { // Its a split node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_right_child")) { new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() != freed_node_bufs.size())) { // Its a merge nodes sitation if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_right_child")) { - 
if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) { // Its a rebalance node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_right_child")) { - if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } } diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 1a690948e..5f4568ca8 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -32,6 +32,10 @@ #include #include +#ifdef _PRERELEASE +#include "common/crash_simulator.hpp" +#endif + const std::string SPDK_ENV_VAR_STRING{"USER_WANT_SPDK"}; const std::string HTTP_SVC_ENV_VAR_STRING{"USER_WANT_HTTP_OFF"}; const std::string CP_WATCHDOG_TIMER_SEC{"USER_SET_CP_WD_TMR_SEC"}; // used in nightly test; @@ -209,8 +213,13 @@ class HSTestHelper { #ifdef _PRERELEASE void wait_for_crash_recovery() { - m_crash_recovered.getFuture().get(); - m_crash_recovered = folly::Promise< folly::Unit >(); + if(homestore::HomeStore::instance()->crash_simulator().will_crash()) { + LOGDEBUG("Waiting for m_crash_recovered future"); + m_crash_recovered.getFuture().get(); + m_crash_recovered = folly::Promise< folly::Unit >(); + homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); + } + } 
#endif diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 2599b5306..5cff2a663 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -707,6 +707,8 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { this->get_all(); } } else { + // remove the flips so that they do not get triggered erroneously + this->remove_flip(flip); this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); } if (elapsed_time - last_progress_time > 30) { From a6db05bbd889e6a5e45d2c645aa775c7d194f20b Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Tue, 25 Mar 2025 13:58:31 -0700 Subject: [PATCH 079/130] retain the default behaviour of the method wait_for_crash_recovery to avoid wrong usage of the will_crash in crash simulator --- src/tests/test_common/homestore_test_common.hpp | 14 +++++++------- src/tests/test_index_crash_recovery.cpp | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 5f4568ca8..56013c9ec 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -212,14 +212,14 @@ class HSTestHelper { test_params& params(ServiceType svc) { return m_token.svc_params_[svc]; } #ifdef _PRERELEASE - void wait_for_crash_recovery() { - if(homestore::HomeStore::instance()->crash_simulator().will_crash()) { - LOGDEBUG("Waiting for m_crash_recovered future"); - m_crash_recovered.getFuture().get(); - m_crash_recovered = folly::Promise< folly::Unit >(); - homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); + void wait_for_crash_recovery(bool check_will_crash = false) { + if(check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { + return; } - + LOGDEBUG("Waiting for m_crash_recovered future"); + m_crash_recovered.getFuture().get(); + 
m_crash_recovered = folly::Promise< folly::Unit >(); + homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); } #endif diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 5cff2a663..3e75854e6 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -403,7 +403,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void crash_and_recover(uint32_t s_key, uint32_t e_key) { // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); trigger_cp(false); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); // this->print_keys("Post crash and recovery, btree structure: "); @@ -453,7 +453,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT trigger_cp(false); LOGINFO("waiting for crash to recover"); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); if (!filename.empty()) { std::string rec_filename = filename + "_after_recovery.dot"; @@ -502,7 +502,7 @@ TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { // Trigger a cp, which should induce the crash and wait for hs to recover test_common::HSTestHelper::trigger_cp(false); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); // Post crash, load the shadow_map into a new instance and compute the diff. Redo the operation this->reapply_after_crash(); From e5272ea64fe07fb02935eb2c028f0c1a3b85bf14 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Fri, 21 Mar 2025 12:56:01 +0800 Subject: [PATCH 080/130] Async IO metrics for physical dev. 
Signed-off-by: Xiaoxi Chen --- src/lib/device/physical_dev.cpp | 68 ++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 1b6914cf5..ba52ba2f2 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -35,6 +35,8 @@ namespace homestore { static std::mutex s_cached_dev_mtx; static std::unordered_map< std::string, iomgr::io_device_ptr > s_cached_opened_devs; +__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } + iomgr::io_device_ptr open_and_cache_dev(const std::string& devname, int oflags) { std::unique_lock lg(s_cached_dev_mtx); @@ -136,26 +138,50 @@ void PhysicalDev::close_device() { close_and_uncache_dev(m_devname, m_iodev); } folly::Future< std::error_code > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) 
+ .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { @@ -174,46 +200,50 @@ folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, ui folly::Future< std::error_code > 
PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); } -__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } - std::error_code PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); auto const start_time = get_current_time(); auto const ret = m_drive_iface->sync_write(m_iodev.get(), data, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); return ret; } std::error_code PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + return ret; } std::error_code PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_read(m_iodev.get(), data, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); return ret; } std::error_code PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, 
uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); return ret; } std::error_code PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) { - return m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, wirte_io_size, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + return ret; } void PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); } From 1457b11706e493f1279a7932466a40fa562c46b7 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 26 Mar 2025 15:46:04 +0800 Subject: [PATCH 081/130] update conan. The connan change was lost during merging. Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index baa9b523b..e22abbfc2 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.5" + version = "6.7.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From 2a78255ad1eb486895b522ed9bc940b174b0f165 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Wed, 26 Mar 2025 18:08:35 +0800 Subject: [PATCH 082/130] Fix baseline resync corner cases. 1. Leader side: Deny snapshot read if there are uncommitted logs in the snapshot. 
This prevents the following scenario: If a crash occurs during snapshot creation, the snapshot might be persisted while the rd sb is not. This means the durable_commit_lsn is less than the snapshot's log_idx. Upon restart, the changes in uncommitted logs may or may not be included in the snapshot data sent by the leader, depending on the race condition between commit and snapshot read, leading to data inconsistency. 2. Follower side: Skip replay and commit when BR is in progress and purge logs is no longer supported. Purging logs can cause issues such as the commit thread being unable to access logs if they are purged. This change removes the purge logic and adds last_snapshot_lsn in sb to help determine if processing should be skipped. Replay/commit will be skipped for logs included in BR to avoid log ops accessing unavailable resources after the PG is destroyed by BR. --- conanfile.py | 2 +- .../log_store/home_raft_log_store.h | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 16 ++++++++++++- src/lib/replication/repl_dev/raft_repl_dev.h | 19 ++++++++++++--- .../repl_dev/raft_state_machine.cpp | 23 ++++++++++++++++++- 5 files changed, 55 insertions(+), 7 deletions(-) diff --git a/conanfile.py b/conanfile.py index e22abbfc2..7540f4c81 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.6" + version = "6.7.7" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index 7fb96a5d4..846b1de3c 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -217,7 +217,7 @@ class HomeRaftLogStore : public nuraft::log_store { /** * Purge all logs in the log store - * It is a dangerous operation and is only used in baseline resync now (purge all logs and restore by snapshot). 
+ * It is a dangerous operation and not be used currently. */ void purge_all_logs(); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 71389cbae..7db163dff 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -69,6 +69,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->logstore_id = m_data_journal->logstore_id(); m_rd_sb->last_applied_dsn = 0; m_rd_sb->destroy_pending = 0x0; + m_rd_sb->last_snapshot_lsn = 0; m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1); m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); @@ -1503,6 +1504,11 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); + if (need_skip_processing(repl_lsn)) { + RD_LOGI("Raft Channel: Log {} is outdated and will be handled by baseline resync. 
Ignoring replay.", lsn); + return; + } + // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn if (repl_lsn <= m_rd_sb->checkpoint_lsn) { return; } @@ -1597,7 +1603,7 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { std::memcpy(data_out->data_begin(), &msg, msg_size); } -bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data) { +bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s) { auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { @@ -1614,6 +1620,14 @@ bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data) { RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc); return false; } + { + // Save last_snapshot_lsn, so that we can skip the replay/commit operation for logs included in baseline resync. + // The reason is baseline resync will clear existing resources on the upper layer, skipping replay/commit + // operations can avoid accessing unavailable resources + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->last_snapshot_lsn = s_cast< repl_lsn_t >(s.get_last_log_idx()); + m_rd_sb.write(); + } if (msg->dsn > m_next_dsn) { m_next_dsn = msg->dsn; RD_LOGD("Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 01f5b1926..f58c5190e 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -25,6 +25,7 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { uint8_t is_timeline_consistent; // Flag to indicate whether the recovery of followers need to be timeline consistent uint64_t last_applied_dsn; // Last applied data sequence number uint8_t destroy_pending; // Flag to indicate whether the group is in destroy pending state + 
repl_lsn_t last_snapshot_lsn; // Last snapshot LSN follower received from leader uint32_t get_raft_sb_version() const { return raft_sb_version; } }; @@ -230,9 +231,9 @@ class RaftReplDev : public ReplDev, if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); } return ready; } + // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. void purge() override { - // clean up existing logs in log store - m_data_journal->purge_all_logs(); + RD_REL_ASSERT(false, "NOT SUPPORTED YET"); } std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { @@ -324,6 +325,18 @@ class RaftReplDev : public ReplDev, */ void force_leave() { leave(); } + /** + * \brief This method is called to check if the given LSN is within the last snapshot LSN received from the leader. + * All logs with LSN less than or equal to the last snapshot LSN are considered as part of the baseline resync, which + * doesn't need any more operations (e.g., replay, commit). + * + * \param lsn The LSN to be checked. + * \return true if the LSN is within the last snapshot LSN, false otherwise. 
+ */ + bool need_skip_processing(const repl_lsn_t lsn) { + return lsn <= m_rd_sb->last_snapshot_lsn; + } + protected: //////////////// All nuraft::state_mgr overrides /////////////////////// nuraft::ptr< nuraft::cluster_config > load_config() override; @@ -365,7 +378,7 @@ class RaftReplDev : public ReplDev, void replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum); void create_snp_resync_data(raft_buf_ptr_t& data_out); - bool save_snp_resync_data(nuraft::buffer& data); + bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 710a56316..cb11a2955 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -186,6 +186,10 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) { int64_t lsn = s_cast< int64_t >(params.log_idx); + if (m_rd.need_skip_processing(lsn)) { + RD_LOGI("Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn); + return m_success_ptr; + } RD_LOGD("Raft channel: Received Commit message lsn {} store {} logdev {} size {}", lsn, m_rd.m_data_journal->logstore_id(), m_rd.m_data_journal->logdev_id(), params.data->size()); repl_req_ptr_t rreq = lsn_to_req(lsn); @@ -204,6 +208,10 @@ raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) { // when reaching here, the config change log has already been committed, and the new config has been applied to the // cluster + if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) { + RD_LOGI("Raft Channel: Config {} is expected to be handled by snapshot. 
Skipping commit.", log_idx); + return; + } RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx); @@ -320,6 +328,19 @@ void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, bool& is_last_obj) { + + // Ensure all logs snapshot included are committed to prevent the following scenario: + // If a crash occurs during snapshot creation, the snapshot might be persisted while the rd's sb is not. + // This means the durable_commit_lsn is less than the snapshot's log_idx. Upon restart, the changes in + // uncommitted logs may or may not included in the snapshot data sent by leader, + // depending on the racing of commit vs snapshot read, leading to data inconsistency. + if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) { + RD_LOG(WARN, "not ready to read because there are some uncommitted logs in snapshot, " + "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", + s.get_last_log_idx(), m_rd.get_last_commit_lsn()); + return -1; + } + // For Nuraft baseline resync, we separate the process into two layers: HomeStore layer and Application layer. // We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application. if (is_hs_snp_obj(obj_id)) { @@ -352,7 +373,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, bool is_last_obj) { if (is_hs_snp_obj(obj_id)) { // Homestore preserved msg - if (m_rd.save_snp_resync_data(data)) { + if (m_rd.save_snp_resync_data(data, s)) { obj_id = snp_obj_id_type_app; LOGDEBUG("save_snp_resync_data success, next obj_id={}", obj_id); } From bed045aef7084c0484036e3586ddbe832d097345 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Wed, 2 Apr 2025 14:27:13 +0800 Subject: [PATCH 083/130] Support blk reservation Application has an option to reserve some blks in a chunk when allocation. 
--- conanfile.py | 2 +- src/include/homestore/blk.h | 1 + src/lib/blkalloc/append_blk_allocator.cpp | 8 ++++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 7540f4c81..7fab131b8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.7" + version = "6.7.8" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index dc8a1f3de..96cec5272 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -251,6 +251,7 @@ VENUM(BlkAllocStatus, uint32_t, struct blk_alloc_hints { blk_temp_t desired_temp{0}; // Temperature hint for the device + std::optional< uint32_t > reserved_blks; // Reserved blks in a chunk std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation std::optional committed_blk_id; // blk id indicates the blk was already allocated and committed, don't allocate and commit again diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index eca445381..141d09279 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -67,9 +67,13 @@ BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { return alloc(1 // If we want to change above design, we can open this api for vector allocation; // BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) { - if (available_blks() < nblks) { + auto avail_blks = available_blks(); + if (hint.reserved_blks) { + avail_blks = avail_blks > hint.reserved_blks.value() ? 
avail_blks - hint.reserved_blks.value() : 0; + } + if (avail_blks < nblks) { // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks()); + LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved blks): {}", nblks, available_blks(), avail_blks); return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. From ff6cb3c1aba35e045773dc05c526fe6cbfe9e731 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Tue, 1 Apr 2025 14:35:30 +0800 Subject: [PATCH 084/130] Adding group_id to RD_LOG rdev name (e.g rdev1) is human friendly but not visible outside of the Homestore, only group_id. Signed-off-by: Xiaoxi Chen --- src/include/homestore/replication/repl_dev.h | 10 +++-- .../replication/log_store/repl_log_store.cpp | 1 + .../replication/log_store/repl_log_store.h | 1 + src/lib/replication/repl_dev/common.cpp | 4 +- .../replication/repl_dev/raft_repl_dev.cpp | 17 ++++++-- src/lib/replication/repl_dev/raft_repl_dev.h | 10 +++-- .../repl_dev/raft_state_machine.cpp | 2 +- .../replication/repl_dev/raft_state_machine.h | 40 +++++++------------ .../replication/repl_dev/solo_repl_dev.cpp | 11 ++--- src/lib/replication/repl_dev/solo_repl_dev.h | 6 +-- 10 files changed, 53 insertions(+), 49 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 93eca48c4..7cd15bc0d 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -29,6 +29,7 @@ struct repl_req_ctx; using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; using repl_req_ptr_t = boost::intrusive_ptr< repl_req_ctx >; +using trace_id_t = u_int64_t; VENUM(repl_req_state_t, uint32_t, INIT = 0, // Initial state @@ -385,7 +386,7 @@ class 
ReplDevListener { } /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id){}; + virtual void on_log_replay_done(const group_id_t& group_id) {}; private: std::weak_ptr< ReplDev > m_repl_dev; @@ -416,7 +417,7 @@ class ReplDev { /// @param ctx - User supplied context which will be passed to listener /// callbacks virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) = 0; + repl_req_ptr_t ctx, trace_id_t tid = 0) = 0; /// @brief Reads the data and returns a future to continue on /// @param bid Block id to read @@ -427,13 +428,14 @@ class ReplDev { /// @return A Future with std::error_code to notify if it has successfully read the data or any error code in case /// of failure virtual folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) = 0; + bool part_of_batch = false, trace_id_t tid = 0) = 0; /// @brief After data is replicated and on_commit to the listener is called. the blkids can be freed. /// /// @param lsn - LSN of the old blkids that is being freed /// @param blkids - blkids to be freed. - virtual void async_free_blks(int64_t lsn, MultiBlkId const& blkid) = 0; + virtual folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, + trace_id_t tid = 0) = 0; /// @brief Try to switch the current replica where this method called to become a leader. /// @return True if it is successful, false otherwise. 
diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 072d06b99..8fa5c0f18 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -107,6 +107,7 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { } std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } +std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); } bool ReplLogStore::compact(ulong compact_upto_lsn) { RD_LOG(DEBUG, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); diff --git a/src/lib/replication/log_store/repl_log_store.h b/src/lib/replication/log_store/repl_log_store.h index a386d397b..bb19df119 100644 --- a/src/lib/replication/log_store/repl_log_store.h +++ b/src/lib/replication/log_store/repl_log_store.h @@ -30,6 +30,7 @@ class ReplLogStore : public HomeRaftLogStore { private: std::string rdev_name() const; + std::string identify_str() const; }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index b2ba6bce4..b733c19c0 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -31,9 +31,7 @@ ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool std::unique_lock< std::mutex > lg(m_state_mtx); if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { auto alloc_status = alloc_local_blks(listener, data_size); - if (alloc_status != ReplServiceError::OK) { - LOGERROR("Allocate blk for rreq failed error={}", alloc_status); - } + if (alloc_status != ReplServiceError::OK) { LOGERROR("Allocate blk for rreq failed error={}", alloc_status); } return alloc_status; } return ReplServiceError::OK; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 7db163dff..2be085f3a 100644 --- 
a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -80,6 +80,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb.write(); bind_data_service(); } + m_identify_str = m_rdev_name + ":" + group_id_str(); RD_LOG(INFO, "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " @@ -282,7 +283,7 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< } void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } { @@ -301,12 +302,16 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, key, data.size, m_listener); + RD_LOGD("traceID [{}], repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size " + "[{}] bytes", + tid, rreq->rkey(), header.size(), key.size(), data.size); + // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); if (status != ReplServiceError::OK) { - RD_LOGD("Initializing rreq failed error={}, failing this req", status); + RD_LOGD("traceID [{}], Initializing rreq failed error={}, failing this req", tid, status); handle_error(rreq, status); return; } @@ -1050,11 +1055,15 @@ repl_req_ptr_t RaftReplDev::repl_key_to_req(repl_key const& rkey) const { } folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { + bool part_of_batch, trace_id_t tid) { + if (is_stopping()) { + LOGINFO("repl 
dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } return data_service().async_read(bid, sgs, size, part_of_batch); } -void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { +folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { // TODO: For timeline consistency required, we should retain the blkid that is changed and write that to another // journal. data_service().async_free_blk(bid); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index f58c5190e..bccdde53b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -156,6 +156,7 @@ class RaftReplDev : public ReplDev, nuraft_mesg::Manager& m_msg_mgr; group_id_t m_group_id; // Replication Group id std::string m_rdev_name; // Short name for the group for easy debugging + std::string m_identify_str; // combination of rdev_name:group_id replica_id_t m_my_repl_id; // This replica's uuid int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) shared< ReplLogStore > m_data_journal; @@ -205,10 +206,10 @@ class RaftReplDev : public ReplDev, //////////////// All ReplDev overrides/implementation /////////////////////// void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) override; + repl_req_ptr_t ctx, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + bool part_of_batch = false, trace_id_t tid = 0) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; AsyncReplResult<> become_leader() override; bool is_leader() 
const override; replica_id_t get_leader_id() const override; @@ -216,7 +217,8 @@ class RaftReplDev : public ReplDev, std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } - std::string rdev_name() const { return m_rdev_name; } + std::string rdev_name() const { return m_rdev_name; }; + std::string identify_str() const { return m_identify_str; }; std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index cb11a2955..8e20455ad 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -427,6 +427,6 @@ nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { void RaftStateMachine::free_user_snp_ctx(void*& user_snp_ctx) { m_rd.m_listener->free_user_snp_ctx(user_snp_ctx); } -std::string RaftStateMachine::rdev_name() const { return m_rd.rdev_name(); } +std::string RaftStateMachine::identify_str() const { return m_rd.identify_str(); } } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 2b50fea7b..97de4ec3b 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -24,43 +24,33 @@ namespace homestore { class ReplicaSetImpl; class StateMachineStore; -#define RD_LOG(level, msg, ...) \ - LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... 
args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - msg, ##__VA_ARGS__); +#define RD_LOG(level, msg, ...) LOG##level##MOD(replication, "[{}] " msg, identify_str(), ##__VA_ARGS__) #define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ { \ assert_type##_ASSERT_CMP( \ val1, cmp, val2, \ [&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ sisl::logging::default_cmp_assert_formatter(buf, msgcb, std::forward< decltype(args) >(args)...); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ return true; \ }, \ ##__VA_ARGS__); \ } #define RD_ASSERT(assert_type, cond, ...) \ { \ - assert_type##_ASSERT_FMT(cond, \ - ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - ##__VA_ARGS__); \ + assert_type##_ASSERT_FMT( \ + cond, ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... 
args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + ##__VA_ARGS__); \ } #define RD_DBG_ASSERT(cond, ...) RD_ASSERT(DEBUG, cond, ##__VA_ARGS__) @@ -139,7 +129,7 @@ class RaftStateMachine : public nuraft::state_machine { void iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb); - std::string rdev_name() const; + std::string identify_str() const; int64_t reset_next_batch_size_hint(int64_t new_hint); int64_t inc_next_batch_size_hint(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 4a6a92144..ac3943e8c 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -28,7 +28,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, value.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, @@ -92,11 +92,13 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { + bool part_of_batch, trace_id_t tid) { return data_service().async_read(bid, sgs, size, part_of_batch); } -void SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { data_service().async_free_blk(bid); } +folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { + return data_service().async_free_blk(bid); +} uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size(); } @@ -107,7 +109,6 @@ void SoloReplDev::cp_flush(CP*) { m_rd_sb.write(); } -void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ -} +void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ } } // namespace homestore diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index abe966ffa..b1708d5d4 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -40,12 +40,12 @@ class SoloReplDev : public ReplDev { virtual ~SoloReplDev() = default; void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) override; + repl_req_ptr_t ctx, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) override; + bool part_of_batch = false, trace_id_t tid = 0) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; AsyncReplResult<> 
become_leader() override { return make_async_error(ReplServiceError::OK); } bool is_leader() const override { return true; } From d9614a5d43b9b339855184867c54a597a1cda331 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 2 Apr 2025 11:29:21 +0800 Subject: [PATCH 085/130] Add traceID into repl_key the traceID replicate to follower through log and data channel(push_data). The target is we can use single traceID to get logs across replicas for the request. Signed-off-by: Xiaoxi Chen --- src/include/homestore/replication/repl_dev.h | 5 ++- src/lib/replication/push_data_rpc.fbs | 1 + src/lib/replication/repl_dev/common.cpp | 1 + src/lib/replication/repl_dev/common.h | 1 + .../replication/repl_dev/raft_repl_dev.cpp | 35 ++++++++++++------- .../repl_dev/raft_state_machine.cpp | 6 ++-- .../replication/repl_dev/solo_repl_dev.cpp | 2 +- src/tests/test_common/raft_repl_test_base.hpp | 2 +- 8 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 7cd15bc0d..447f235cf 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -57,6 +57,7 @@ struct repl_key { int32_t server_id{0}; // Server Id which this req is originated from uint64_t term; // RAFT term number uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + uint64_t traceID{0}; // tracing ID provided by application that connects logs. 
struct Hasher { size_t operator()(repl_key const& rk) const { @@ -67,7 +68,8 @@ struct repl_key { bool operator==(repl_key const& other) const = default; std::string to_string() const { - return fmt::format("server={}, term={}, dsn={}, hash={}", server_id, term, dsn, Hasher()(*this)); + return fmt::format("server={}, term={}, dsn={}, hash={}, traceID={}", server_id, term, dsn, Hasher()(*this), + traceID); } }; @@ -121,6 +123,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: repl_key const& rkey() const { return m_rkey; } uint64_t dsn() const { return m_rkey.dsn; } uint64_t term() const { return m_rkey.term; } + uint64_t traceID() const { return m_rkey.traceID; } int64_t lsn() const { return m_lsn; } bool is_proposer() const { return m_is_proposer; } journal_type_t op_code() const { return m_op_code; } diff --git a/src/lib/replication/push_data_rpc.fbs b/src/lib/replication/push_data_rpc.fbs index 1f6d20546..279fefcb5 100644 --- a/src/lib/replication/push_data_rpc.fbs +++ b/src/lib/replication/push_data_rpc.fbs @@ -2,6 +2,7 @@ native_include "boost/uuid/uuid.hpp"; namespace homestore; table PushDataRequest { + traceID: uint64; // traceID for the REQ issuer_replica_id : int32; // Replica id of the issuer raft_term : uint64; // Raft term number dsn : uint64; // Data Sequence number diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index b733c19c0..388d95015 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -54,6 +54,7 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { } m_journal_entry->code = m_op_code; + m_journal_entry->traceID = m_rkey.traceID; m_journal_entry->server_id = server_id; m_journal_entry->dsn = m_rkey.dsn; m_journal_entry->user_header_size = m_header.size(); diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index cb8a57931..880a8d30f 100644 --- 
a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -35,6 +35,7 @@ struct repl_journal_entry { uint16_t minor_version{JOURNAL_ENTRY_MINOR}; journal_type_t code; + uint64_t traceID; // traceID provided by application, mostly for consolidate logs. int32_t server_id; // Server id from where journal entry is originated uint64_t dsn; // Data seq number uint32_t user_header_size; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2be085f3a..6b42b52ee 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -187,9 +187,11 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ members.replica_in = member_in; sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); - rreq->init( - repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = 0}, + journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { @@ -255,7 +257,10 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // here, we set the dsn to a new one , which is definitely unique in the follower, so that the new rreq will not // have a conflict with the old rreq. 
- rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = std::numeric_limits< uint64_t >::max()}, journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); @@ -297,14 +302,16 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - auto status = rreq->init( - repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, - key, data.size, m_listener); + auto status = rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = tid}, + data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, + true /* is_proposer */, header, key, data.size, m_listener); RD_LOGD("traceID [{}], repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size " "[{}] bytes", - tid, rreq->rkey(), header.size(), key.size(), data.size); + tid, rreq->rkey(), header.size(), key.size(), data.size); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); @@ -373,7 +380,7 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list // Prepare the rpc request packet with all repl_reqs details builder.FinishSizePrefixed(CreatePushDataRequest( - builder, server_id(), rreq->term(), rreq->dsn(), + builder, rreq->traceID(), server_id(), rreq->term(), rreq->dsn(), builder.CreateVector(rreq->header().cbytes(), rreq->header().size()), builder.CreateVector(rreq->key().cbytes(), rreq->key().size()), 
data.size, get_time_since_epoch_ms())); @@ -430,7 +437,10 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d } sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()}; sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()}; - repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()}; + repl_key rkey{.server_id = push_req->issuer_replica_id(), + .term = push_req->raft_term(), + .dsn = push_req->dsn(), + .traceID = push_req->traceID()}; auto const req_orig_time_ms = push_req->time_ms(); RD_LOGD("Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); @@ -1549,7 +1559,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx jentry->value_size}; }; - repl_key const rkey{.server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn}; + repl_key const rkey{ + .server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 8e20455ad..12d987c9b 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -72,7 +72,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr jentry->value_size}; }; - repl_key const rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn}; + repl_key const rkey{ + .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; // Create a new rreq (or) Pull rreq from the map given the 
repl_key, header and key. Any new rreq will // allocate the blks (in case of large data). We will use the new blkid and transform the current journal entry's @@ -148,7 +149,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - repl_key rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn}; + repl_key rkey{ + .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; auto rreq = m_rd.repl_key_to_req(rkey); if ((rreq == nullptr) || (rreq->is_localize_pending())) { diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index ac3943e8c..93eef117c 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -30,7 +30,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t rreq, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } - auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, value.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, value.size, m_listener); HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 11c6d6bc2..8fe72ac1d 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -357,7 +357,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); } - repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, s_uniq_num); } void validate_db_data() { From 235ebaab9bc93a452105640e92994a67592f398a Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 2 Apr 2025 15:35:48 +0800 Subject: [PATCH 086/130] Adopt traceID for all RD_LOG Also adjust some of the logging level. 
Signed-off-by: Xiaoxi Chen --- src/include/homestore/replication/repl_dev.h | 13 +- .../replication/log_store/repl_log_store.cpp | 18 +- src/lib/replication/repl_dev/common.cpp | 12 +- src/lib/replication/repl_dev/common.h | 6 +- .../replication/repl_dev/raft_repl_dev.cpp | 399 ++++++++++-------- src/lib/replication/repl_dev/raft_repl_dev.h | 10 +- .../repl_dev/raft_state_machine.cpp | 52 ++- .../replication/repl_dev/raft_state_machine.h | 16 +- 8 files changed, 284 insertions(+), 242 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 447f235cf..832a446b5 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -54,10 +54,10 @@ static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327; static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01; struct repl_key { - int32_t server_id{0}; // Server Id which this req is originated from - uint64_t term; // RAFT term number - uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry - uint64_t traceID{0}; // tracing ID provided by application that connects logs. + int32_t server_id{0}; // Server Id which this req is originated from + uint64_t term; // RAFT term number + uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + trace_id_t traceID{0}; // tracing ID provided by application that connects logs. 
struct Hasher { size_t operator()(repl_key const& rk) const { @@ -68,8 +68,7 @@ struct repl_key { bool operator==(repl_key const& other) const = default; std::string to_string() const { - return fmt::format("server={}, term={}, dsn={}, hash={}, traceID={}", server_id, term, dsn, Hasher()(*this), - traceID); + return fmt::format("server={}, term={}, dsn={}, hash={}", server_id, term, dsn, Hasher()(*this)); } }; @@ -123,7 +122,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: repl_key const& rkey() const { return m_rkey; } uint64_t dsn() const { return m_rkey.dsn; } uint64_t term() const { return m_rkey.term; } - uint64_t traceID() const { return m_rkey.traceID; } + trace_id_t traceID() const { return m_rkey.traceID; } int64_t lsn() const { return m_lsn; } bool is_proposer() const { return m_is_proposer; } journal_type_t op_code() const { return m_op_code; } diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 8fa5c0f18..ca62c3197 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -10,7 +10,7 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { // We don't want to transform anything that is not an app log if (entry->get_val_type() != nuraft::log_val_type::app_log || entry->get_buf_ptr()->size() == 0) { ulong lsn = HomeRaftLogStore::append(entry); - RD_LOGD("append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(), + RD_LOGD(NO_TRACE_ID, "None-APP log: append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(), static_cast< uint32_t >(entry->get_val_type()), lsn, entry->get_buf().size()); return lsn; } @@ -19,7 +19,7 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { ulong lsn = HomeRaftLogStore::append(entry); m_sm.link_lsn_to_req(rreq, int64_cast(lsn)); - RD_LOGD("Raft Channel: Received append log entry rreq=[{}]", 
rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string()); return lsn; } @@ -33,7 +33,7 @@ void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); HomeRaftLogStore::write_at(index, entry); m_sm.link_lsn_to_req(rreq, int64_cast(index)); - RD_LOGD("Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); } void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { @@ -54,8 +54,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { } } - RD_LOGT("Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", start_lsn, count, - reqs->size(), proposer_reqs->size()); + RD_LOGT(NO_TRACE_ID, "Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", + start_lsn, count, reqs->size(), proposer_reqs->size()); if (!reqs->empty()) { // Check the map if data corresponding to all of these requsts have been received and written. If not, schedule @@ -85,7 +85,9 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { // so skip waiting data written and mark reqs as flushed here. 
for (auto const& rreq : *proposer_reqs) { if (rreq) { - RD_LOGT("Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it", rreq->lsn()); + RD_LOGT(rreq->traceID(), + "Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it", + rreq->lsn()); rreq->add_state(repl_req_state_t::LOG_FLUSHED); } } @@ -95,7 +97,7 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { auto rreq = m_sm.lsn_to_req(lsn); if (rreq != nullptr) { if (rreq->has_state(repl_req_state_t::ERRORED)) { - RD_LOGE("Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string()); + RD_LOGE(rreq->traceID(), "Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string()); continue; } rreq->set_is_volatile(false); @@ -110,7 +112,7 @@ std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); } bool ReplLogStore::compact(ulong compact_upto_lsn) { - RD_LOG(DEBUG, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); + RD_LOGD(NO_TRACE_ID, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); m_rd.on_compact(compact_upto_lsn); return HomeRaftLogStore::compact(compact_upto_lsn); } diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 388d95015..5d0f262f0 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -31,7 +31,10 @@ ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool std::unique_lock< std::mutex > lg(m_state_mtx); if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { auto alloc_status = alloc_local_blks(listener, data_size); - if (alloc_status != ReplServiceError::OK) { LOGERROR("Allocate blk for rreq failed error={}", alloc_status); } + if (alloc_status != ReplServiceError::OK) { + LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, + 
alloc_status); + } return alloc_status; } return ReplServiceError::OK; @@ -105,7 +108,7 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list if (hints_result.value().committed_blk_id.has_value()) { //if the committed_blk_id is already present, use it and skip allocation and commitment - LOGINFO("For Repl_key=[{}] data already exists, skip", rkey().to_string()); + LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, rkey().to_string()); m_local_blkid = hints_result.value().committed_blk_id.value(); add_state(repl_req_state_t::BLK_ALLOCATED); add_state(repl_req_state_t::DATA_RECEIVED); @@ -119,6 +122,7 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints_result.value(), m_local_blkid); if (status != BlkAllocStatus::SUCCESS) { + LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status); DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } @@ -134,7 +138,7 @@ void repl_req_ctx::set_lsn(int64_t lsn) { "Changing lsn for request={} on the fly can cause race condition, not expected. 
lsn {}, m_lsn {}", to_string(), lsn, m_lsn); m_lsn = lsn; - LOGTRACEMOD(replication, "Setting lsn={} for request={}", lsn, to_string()); + LOGTRACEMOD(replication, "[traceID={}] Setting lsn={} for request={}", rkey().traceID, lsn, to_string()); } bool repl_req_ctx::save_pushed_data(intrusive< sisl::GenericRpcData > const& pushed_data, uint8_t const* data, @@ -198,7 +202,7 @@ void repl_req_ctx::release_data() { // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { - LOGTRACEMOD(replication, "m_pushed_data addr={}, m_rkey={}, m_lsn={}", + LOGTRACEMOD(replication, "[traceID={}] m_pushed_data addr={}, m_rkey={}, m_lsn={}", rkey().traceID, static_cast< void* >(m_pushed_data.get()), m_rkey.to_string(), m_lsn); m_pushed_data->send_response(); m_pushed_data = nullptr; diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index 880a8d30f..cf8f53759 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -35,9 +35,9 @@ struct repl_journal_entry { uint16_t minor_version{JOURNAL_ENTRY_MINOR}; journal_type_t code; - uint64_t traceID; // traceID provided by application, mostly for consolidate logs. - int32_t server_id; // Server id from where journal entry is originated - uint64_t dsn; // Data seq number + trace_id_t traceID; // traceID provided by application, mostly for consolidate logs. 
+ int32_t server_id; // Server id from where journal entry is originated + uint64_t dsn; // Data seq number uint32_t user_header_size; uint32_t key_size; uint32_t value_size; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 6b42b52ee..f3e907963 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -82,26 +82,26 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk } m_identify_str = m_rdev_name + ":" + group_id_str(); - RD_LOG(INFO, - "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " - "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} " - "log_dev={} log_store={}", - (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, - m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), - m_rd_sb->logdev_id, m_rd_sb->logstore_id); + RD_LOGI(NO_TRACE_ID, + "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " + "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} " + "log_dev={} log_store={}", + (load_existing ? 
"Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, + m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), + m_rd_sb->logdev_id, m_rd_sb->logstore_id); } bool RaftReplDev::bind_data_service() { - RD_LOG(INFO, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); + RD_LOGI(NO_TRACE_ID, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); bool success = false; #ifdef _PRERELEASE success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { - RD_LOGI("Resuming after slow down data channel flip"); + RD_LOGI(NO_TRACE_ID, "Resuming after slow down data channel flip"); on_push_data_received(rpc_data); })) { - RD_LOGI("Slow down data channel flip is enabled, scheduling to call later"); + RD_LOGI(NO_TRACE_ID, "Slow down data channel flip is enabled, scheduling to call later"); } else { on_push_data_received(rpc_data); } @@ -111,13 +111,13 @@ bool RaftReplDev::bind_data_service() { m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); #endif if (!success) { - RD_LOGE("Failed to bind data service request for PUSH_DATA"); + RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for PUSH_DATA"); return false; } success = m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); if (!success) { - RD_LOGE("Failed to bind data service request for FETCH_DATA"); + RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for FETCH_DATA"); return false; } return true; @@ -137,12 +137,21 @@ bool RaftReplDev::join_group() { AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum) { - 
LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(), - boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + // Fixme: traceID for replace member + uint64_t trace_id = 0; + + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + RD_LOGI(trace_id, "Replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { // Two members are down and leader cant form the quorum. Reduce the quorum size. - reset_quorum_size(commit_quorum); + reset_quorum_size(commit_quorum, trace_id); } // Step 1: Check if leader itself is requested to move out. @@ -150,15 +159,16 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ // If leader is the member requested to move out, then give up leadership and return error. // Client will retry replace_member request to the new leader. raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); - RD_LOGI("Replace member leader is the member_out so yield leadership"); - reset_quorum_size(0); + RD_LOGI(trace_id, "Replace member leader is the member_out so yield leadership"); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } // Step 2. Add the new member. return m_msg_mgr.add_member(m_group_id, member_in.id) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_in, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_in, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> { // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout // when adding member. Member is added to cluster config until member syncs fully // with atleast stop gap. 
This will take a lot of time for block or @@ -169,15 +179,16 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ // can be resend and one of the add or remove can failed and has to retried. if (e.error() == nuraft::cmd_result_code::CANCELLED || e.error() == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { - RD_LOGW("Ignoring error returned from nuraft add_member {}", e.error()); + RD_LOGI(trace_id, "Ignoring error returned from nuraft add_member {}", e.error()); } else { - RD_LOGE("Replace member error in add member : {}", e.error()); - reset_quorum_size(0); + RD_LOGE(trace_id, "Replace member error in add member : {}", e.error()); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } } - RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), + RD_LOGI(trace_id, "Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), group_id_str()); // Step 3. Append log entry to mark the old member is out and new member is added. @@ -190,50 +201,53 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), - .traceID = 0}, + .traceID = trace_id}, journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { - LOGERROR("Replace member propose to raft failed {}", err); - reset_quorum_size(0); + RD_LOGE(trace_id, "Replace member propose to raft failed {}", err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); return make_async_error<>(std::move(err)); } - RD_LOGI("Replace member proposed to raft group_id={}", group_id_str()); + RD_LOGI(trace_id, "Replace member proposed to raft group_id={}", group_id_str()); // Step 4. 
Remove the old member. Even if the old member is temporarily // down and recovers, nuraft mesg see member remove from cluster log // entry and call exit_group() and leave(). return m_msg_mgr.rem_member(m_group_id, member_out.id) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> { if (e.hasError()) { // Ignore the server not found as server removed from the cluster // as requests are idempotent and can be resend. if (e.error() == nuraft::cmd_result_code::SERVER_NOT_FOUND) { - RD_LOGW("Remove member not found in group error, ignoring"); + RD_LOGW(trace_id, "Remove member not found in group error, ignoring"); } else { // Its ok to retry this request as the request // of replace member is idempotent. - RD_LOGE("Replace member failed to remove member : {}", e.error()); - reset_quorum_size(0); + RD_LOGE(trace_id, "Replace member failed to remove member : {}", e.error()); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); return make_async_error<>(ReplServiceError::RETRY_REQUEST); } } else { - RD_LOGI("Replace member removed member={} from group_id={}", + RD_LOGI(trace_id, "Replace member removed member={} from group_id={}", boost::uuids::to_string(member_out.id), group_id_str()); } // Revert the quorum size back to 0. 
- reset_quorum_size(0); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); return make_async_success<>(); }); }); } -void RaftReplDev::reset_quorum_size(uint32_t commit_quorum) { - RD_LOGI("Reset raft quorum size={}", commit_quorum); +void RaftReplDev::reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id) { + RD_LOGI(trace_id, "Reset raft quorum size={}", commit_quorum); nuraft::raft_params params = raft_server()->get_current_params(); params.with_custom_commit_quorum_size(commit_quorum); params.with_custom_election_quorum_size(commit_quorum); @@ -277,7 +291,7 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { void RaftReplDev::use_config(json_superblk raft_config_sb) { m_raft_config_sb = std::move(raft_config_sb); } void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) { - RD_LOG(DEBUG, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term()); + RD_LOGD(NO_TRACE_ID, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term()); auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); auto result = m_listener->create_snapshot(snp_ctx).get(); auto null_except = std::shared_ptr< std::exception >(); @@ -294,7 +308,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& { auto const guard = m_stage.access(); if (auto const stage = *guard.get(); stage != repl_dev_stage_t::ACTIVE) { - RD_LOGW("Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); + RD_LOGW(tid, "Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); handle_error(rreq, (stage == repl_dev_stage_t::INIT) ? ReplServiceError::SERVER_IS_JOINING : ReplServiceError::SERVER_IS_LEAVING); @@ -309,16 +323,15 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, key, data.size, m_listener); - RD_LOGD("traceID [{}], repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size " - "[{}] bytes", - tid, rreq->rkey(), header.size(), key.size(), data.size); + RD_LOGD(tid, "repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size [{}] bytes", rreq->rkey(), + header.size(), key.size(), data.size); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); if (status != ReplServiceError::OK) { - RD_LOGD("traceID [{}], Initializing rreq failed error={}, failing this req", tid, status); + RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); handle_error(rreq, status); return; } @@ -326,14 +339,14 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { - RD_LOGD("data blks has already been allocated and committed, failing this req"); + RD_LOGE(tid, "data blks has already been allocated and committed, failing this req"); handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("disable_leader_push_data")) { - RD_LOGD("Simulating push data failure, so that all the follower will have to fetch data"); + RD_LOGD(tid, "Simulating push data failure, so that all the follower will have to fetch data"); } else push_data_to_all_followers(rreq, data); #else @@ -368,7 +381,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } }); } else { - RD_LOGD("Skipping data channel send since value size 
is 0"); + RD_LOGT(tid, "Skipping data channel send since value size is 0"); rreq->add_state(repl_req_state_t::DATA_WRITTEN); auto raft_status = m_state_machine->propose_to_raft(rreq); if (raft_status != ReplServiceError::OK) { handle_error(rreq, raft_status); } @@ -394,7 +407,7 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list auto peers = get_active_peers(); auto calls = std::vector< nuraft_mesg::NullAsyncResult >(); for (auto peer : peers) { - RD_LOGD("Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); calls.push_back(group_msg_service() ->data_service_request_unidirectional(peer, PUSH_DATA, rreq->m_pkts) .via(&folly::InlineExecutor::instance())); @@ -405,12 +418,12 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list auto r = res.value(); if (r.hasError()) { // Just logging PushData error, no action is needed as follower can try by fetchData. 
- RD_LOGW("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", + RD_LOGI(rreq->traceID(), "Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", rreq->to_string(), r.error()); } } } - RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data Channel: Data push completed for rreq=[{}]", rreq->to_compact_string()); // Release the buffer which holds the packets rreq->release_fb_builder(); rreq->m_pkts.clear(); @@ -421,7 +434,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto const push_data_rcv_time = Clock::now(); auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: PushData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } @@ -430,7 +443,8 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t); auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes()); if (fb_size + push_req->data_size() != incoming_buf.size()) { - RD_LOGW("Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}", + RD_LOGW(NO_TRACE_ID, + "Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}", fb_size, push_req->data_size(), incoming_buf.size()); rpc_data->send_response(); return; @@ -443,11 +457,12 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d .traceID = push_req->traceID()}; auto const req_orig_time_ms = push_req->time_ms(); - RD_LOGD("Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); + RD_LOGD(rkey.traceID, "Data Channel: PushData received: time diff={} ms.", 
get_elapsed_time_ms(req_orig_time_ms)); #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("drop_push_data_request")) { - LOGINFO("Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " + RD_LOGI(rkey.traceID, + "Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " "server_id={}, term={}, dsn={}", push_req->issuer_replica_id(), push_req->raft_term(), push_req->dsn()); rpc_data->send_response(); @@ -458,16 +473,17 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto rreq = applier_create_req(rkey, journal_type_t::HS_DATA_LINKED, header, key, push_req->data_size(), true /* is_data_channel */); if (rreq == nullptr) { - RD_LOG(ERROR, - "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " - "trigger a fetch explicitly if needed. rkey={}", - rkey.to_string()); + RD_LOGE(rkey.traceID, + "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " + "trigger a fetch explicitly if needed. rkey={}", + rkey.to_string()); rpc_data->send_response(); return; } if (!rreq->save_pushed_data(rpc_data, incoming_buf.cbytes() + fb_size, push_req->data_size())) { - RD_LOGD("Data Channel: Data already received for rreq=[{}], ignoring this data", rreq->to_string()); + RD_LOGT(rkey.traceID, "Data Channel: Data already received for rreq=[{}], ignoring this data", + rreq->to_string()); rpc_data->send_response(); return; } @@ -490,10 +506,12 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + // if rreq create time is earlier than push_data receive time, that means the rreq was created by raft + // channel log. Otherwise set to zero as rreq is created by data channel. 
const auto data_log_diff_us = push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count() ? get_elapsed_time_us(rreq->created_time(), push_data_rcv_time) - : get_elapsed_time_us(push_data_rcv_time, rreq->created_time()); + : 0; auto const data_write_latency = get_elapsed_time_us(push_data_rcv_time); auto const total_data_write_latency = get_elapsed_time_us(rreq->created_time()); @@ -503,10 +521,11 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d HISTOGRAM_OBSERVE(m_metrics, rreq_push_data_latency_us, data_write_latency); HISTOGRAM_OBSERVE(m_metrics, rreq_total_data_write_latency_us, total_data_write_latency); - RD_LOGD("Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " "data_write_latency_us={}, total_data_write_latency_us(rreq creation to write complete)={}, " "local_blkid.num_pieces={}", - rreq->to_string(), data_log_diff_us, data_write_latency, total_data_write_latency, + rreq->to_compact_string(), data_log_diff_us, data_write_latency, total_data_write_latency, write_num_pieces); } }); @@ -528,7 +547,7 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ // RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}", // rkey.to_string()); // RD_REL_ASSERT(blob_equals(user_key, rreq->key), "User key mismatch for repl_key={}", rkey.to_string()); - RD_LOGD("Repl_key=[{}] already received ", rkey.to_string()); + RD_LOGT(rkey.traceID, "Repl_key=[{}] already received ", rkey.to_string()); return rreq; } } @@ -550,13 +569,15 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } #endif if (status != ReplServiceError::OK) { - RD_LOGD("For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), status); + RD_LOGD(rkey.traceID, "For Repl_key=[{}] 
alloc hints returned error={}, failing this req", rkey.to_string(), + status); // Do not call handle_error here, because handle_error is for rreq which needs to be terminated. This one can be // retried. return nullptr; } - RD_LOGD("in follower_create_req: rreq={}, addr={}", rreq->to_string(), reinterpret_cast< uintptr_t >(rreq.get())); + RD_LOGD(rreq->traceID(), "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(), + reinterpret_cast< uintptr_t >(rreq.get())); return rreq; } @@ -570,7 +591,7 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< if (!rreq->has_linked_data()) { continue; } auto const status = uint32_cast(rreq->state()); if (status & uint32_cast(repl_req_state_t::DATA_WRITTEN)) { - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data written and blkid mapped: rkey=[{}]", rreq->to_compact_string()); continue; } @@ -613,10 +634,10 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< HS_DBG_ASSERT(rreq->has_state(repl_req_state_t::DATA_WRITTEN), "Data written promise raised without updating DATA_WRITTEN state for rkey={}", rreq->rkey().to_string()); - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data write completed and blkid mapped: rreq=[{}]", rreq->to_compact_string()); } #endif - RD_LOGT("Data Channel: {} pending reqs's data are written", rreqs->size()); + RD_LOGT(NO_TRACE_ID, "{} pending reqs's data are written", rreqs->size()); return folly::makeFuture< folly::Unit >(folly::Unit{}); }); } @@ -643,9 +664,9 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre // sometime before do an explicit fetch. This is so that, it is possible raft channel has come ahead of data // channel and waiting for sometime avoid expensive fetch. 
On steady state, after a little bit of wait data // would be reached automatically. - RD_LOG(DEBUG, - "We haven't received data for {} out {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", - only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); + RD_LOGD(NO_TRACE_ID, + "We haven't received data for {} out {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", + only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); // We are yet to support reactive fetch from remote. if (is_resync_mode()) { @@ -675,12 +696,12 @@ void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreq for (auto const& rreq : rreqs) { auto const cur_state = uint32_cast(rreq->state()); if (cur_state == uint32_cast(repl_req_state_t::ERRORED)) { - // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: rreq=[{}] already errored out, ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "rreq=[{}] already errored out, ignoring the fetch", rreq->to_compact_string()); continue; } else if (cur_state == uint32_cast(repl_req_state_t::DATA_RECEIVED)) { // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: Data already received for rreq=[{}], ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data already received for rreq=[{}], ignoring the fetch", + rreq->to_compact_string()); continue; } @@ -708,7 +729,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { entries.reserve(rreqs.size()); shared< flatbuffers::FlatBufferBuilder > builder = std::make_shared< flatbuffers::FlatBufferBuilder >(); - RD_LOGD("Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), server_id()); + RD_LOGD(NO_TRACE_ID, "Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), + server_id()); auto const& originator = rreqs.front()->remote_blkid().server_id; for (auto 
const& rreq : rreqs) { @@ -724,7 +746,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { RD_DBG_ASSERT_EQ(rreq->remote_blkid().server_id, originator, "Unexpected originator for rreq={}", rreq->to_string()); - RD_LOGT("Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, + RD_LOGT(rreq->traceID(), + "Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, rreq->to_string(), rreq->remote_blkid().blkid.to_string(), server_id()); } @@ -749,15 +772,15 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { auto const fetch_latency_us = get_elapsed_time_us(fetch_start_time); HISTOGRAM_OBSERVE(m_metrics, rreq_data_fetch_latency_us, fetch_latency_us); - RD_LOGD("Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); if (!response) { // if we are here, it means the original who sent the log entries are down. // we need to handle error and when the other member becomes leader, it will resend the log entries; - RD_LOG(ERROR, - "Not able to fetching data from originator={}, error={}, probably originator is down. Will " - "retry when new leader start appending log entries", - rreqs.front()->remote_blkid().server_id, response.error()); + RD_LOGE(NO_TRACE_ID, + "Not able to fetching data from originator={}, error={}, probably originator is down. Will " + "retry when new leader start appending log entries", + rreqs.front()->remote_blkid().server_id, response.error()); for (auto const& rreq : rreqs) { // TODO: Set the data_received promise with error, so that waiting threads can be unblocked and // reject the request. Without that, it will timeout and then reject it. 
@@ -785,13 +808,14 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: PushData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } auto fetch_req = GetSizePrefixedFetchData(incoming_buf.cbytes()); - RD_LOGD("Data Channel: FetchData received: fetch_req.size={}", fetch_req->request()->entries()->size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: fetch_req.size={}", + fetch_req->request()->entries()->size()); std::vector< sisl::sg_list > sgs_vec; std::vector< folly::Future< bool > > futs; @@ -815,15 +839,15 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ sgs_vec.push_back(sgs); if (originator != server_id()) { - RD_LOGD("non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", req->dsn(), lsn, - originator, server_id()); + RD_LOGD(NO_TRACE_ID, "non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", + req->dsn(), lsn, originator, server_id()); } else { - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); } auto const& header = req->user_header(); sisl::blob user_header = sisl::blob{header->Data(), header->size()}; - RD_LOGD("Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); futs.emplace_back(std::move(m_listener->on_fetch_data(lsn, user_header, local_blkid, sgs))); } @@ -839,7 +863,7 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ } } - 
RD_LOGD("Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); // now prepare the io_blob_list to response back to requester; nuraft_mesg::io_blob_list_t pkts = sisl::io_blob_list_t{}; @@ -871,7 +895,7 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_GT(total_size, 0, "Empty response from remote"); RD_DBG_ASSERT(raw_data, "Empty response from remote"); - RD_LOGD("Data Channel: FetchData completed for {} requests", rreqs.size()); + RD_LOGD(NO_TRACE_ID, "Data Channel: FetchData completed for {} requests", rreqs.size()); for (auto const& rreq : rreqs) { auto const data_size = rreq->remote_blkid().blkid.blk_count() * get_blk_size(); @@ -882,8 +906,9 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_EQ(data_size, local_size, "Data size mismatch for rreq={} remote size: {}, local size: {}", rreq->to_string(), data_size, local_size); - RD_LOGD("Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", - rreq->to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", + rreq->to_compact_string()); } else { auto const data_write_start_time = Clock::now(); COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); @@ -907,13 +932,15 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); - RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " "total_write_latency_us={}, write_num_pieces={}", - rreq->to_string(), data_write_latency, total_data_write_latency, write_num_pieces); + rreq->to_compact_string(), data_write_latency, 
total_data_write_latency, write_num_pieces); }); - RD_LOGD("Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", - rreq->to_string(), data_size, total_size, rreq->local_blkid().to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", + rreq->to_compact_string(), data_size, total_size, rreq->local_blkid().to_string()); } raw_data += data_size; total_size -= data_size; @@ -935,8 +962,8 @@ void RaftReplDev::commit_blk(repl_req_ptr_t rreq) { void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { // 1. call the listener to rollback + RD_LOGD(rreq->traceID(), "Rolling back rreq: {}", rreq->to_compact_string()); m_listener->on_rollback(rreq->lsn(), rreq->header(), rreq->key(), rreq); - // 2. remove the request from maps m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); m_repl_key_req_map.erase(rreq->rkey()); @@ -944,9 +971,9 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { // 3. 
free the allocated blocks if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + data_service().async_free_blk(blkid).thenValue([this, blkid, rreq](auto&& err) { HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string()); - RD_LOGD("Rollback rreq: Releasing blkid={} freed successfully", blkid.to_string()); + RD_LOGD(rreq->traceID(), "Releasing blkid={} freed successfully", blkid.to_string()); }); } } @@ -964,7 +991,7 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { m_next_dsn.compare_exchange_strong(cur_dsn, rreq->dsn() + 1); } - RD_LOGD("Raft channel: Commit rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Raft channel: Commit rreq=[{}]", rreq->to_compact_string()); if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { leave(); } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { @@ -985,21 +1012,21 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) { // when reaching here, the new config has already been applied to the cluster. // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. 
- + RD_LOGD(NO_TRACE_ID, "config commit on lsn {}", lsn); // keep this variable in case it is needed later (void) new_conf; auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { - RD_LOGE("Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + RD_LOGE(NO_TRACE_ID, "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); } } void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (!rreq->add_state_if_not_already(repl_req_state_t::ERRORED)) { - RD_LOGE("Raft Channel: Error has been added for rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error has been added for rreq=[{}] error={}", rreq->to_string(), err); return; } @@ -1013,7 +1040,7 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) exist_rreq->to_string()); } if (err == ReplServiceError::DATA_DUPLICATED) { - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); m_listener->on_error(err, rreq->header(), rreq->key(), rreq); rreq->clear(); return; @@ -1047,7 +1074,7 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) void RaftReplDev::replace_member(repl_req_ptr_t rreq) { auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes()); - RD_LOGI("Raft repl replace_member commit member_out={} member_in={}", + RD_LOGI(rreq->traceID(), "Raft repl replace_member commit member_out={} member_in={}", 
boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); m_listener->on_replace_member(members->replica_out, members->replica_in); @@ -1082,7 +1109,8 @@ folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkI AsyncReplResult<> RaftReplDev::become_leader() { return m_msg_mgr.become_leader(m_group_id).via(&folly::InlineExecutor::instance()).thenValue([this](auto&& e) { if (e.hasError()) { - RD_LOGE("Error in becoming leader: {}", e.error()); + RD_LOGE(NO_TRACE_ID, "Error in becoming leader: {}", e.error()); + decr_pending_request_num(); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } return make_async_success<>(); @@ -1123,9 +1151,10 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); } else { - RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", - p.id_, - my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx); + RD_LOGW(NO_TRACE_ID, + "Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", + p.id_, my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, + least_active_repl_idx); } } return res; @@ -1215,7 +1244,7 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["config"] = serialize_cluster_config(config); m_raft_config_sb.write(); - RD_LOGI("Saved config {}", (*m_raft_config_sb)["config"].dump()); + RD_LOGI(NO_TRACE_ID, "Saved config {}", (*m_raft_config_sb)["config"].dump()); } void RaftReplDev::save_state(const nuraft::srv_state& state) { @@ -1225,7 +1254,7 @@ void RaftReplDev::save_state(const nuraft::srv_state& state) { {"election_timer_allowed", state.is_election_timer_allowed()}, {"catching_up", state.is_catching_up()}}; m_raft_config_sb.write(); - 
RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump()); + RD_LOGI(NO_TRACE_ID, "Saved state {}", (*m_raft_config_sb)["state"].dump()); } nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { @@ -1267,7 +1296,7 @@ uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; } void RaftReplDev::permanent_destroy() { - RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str()); + RD_LOGI(NO_TRACE_ID, "Permanent destroy for raft repl dev group_id={}", group_id_str()); // let the listener know at first, so that they can cleanup persistent structures before raft repl dev is destroyed m_listener->on_destroy(group_id()); m_raft_config_sb.destroy(); @@ -1302,7 +1331,7 @@ void RaftReplDev::leave() { m_rd_sb->destroy_pending = 0x1; m_rd_sb.write(); - RD_LOGI("RaftReplDev leave group_id={}", group_id_str()); + RD_LOGI(NO_TRACE_ID, "RaftReplDev leave group_id={}", group_id_str()); m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete } @@ -1315,71 +1344,72 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, auto const& entries = raft_req->log_entries(); auto start_lsn = raft_req->get_last_log_idx() + 1; - RD_LOGD("Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my commited " - "lsn {} , leader commmited lsn {}", + if (entries.size() == 0) { + RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}", + raft_req->get_commit_idx()); + return ret; + } + RD_LOGT(NO_TRACE_ID, + "Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my " + "committed lsn {} , leader committed lsn {}", entries.size(), raft_req->get_last_log_term(), start_lsn, start_lsn + entries.size() - 1, m_commit_upto_lsn.load(), raft_req->get_commit_idx()); - if (!entries.empty()) { - 
RD_LOGT("Raft channel: Received {} append entries on follower from leader, localizing them", - entries.size()); - - auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); - auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); - for (unsigned long i = 0; i < entries.size(); i++) { - auto& entry = entries[i]; - auto lsn = start_lsn + i; - auto term = entry->get_term(); - if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } - if (entry->get_buf_ptr()->size() == 0) { continue; } - // skipping localize for already committed log(dup), they anyway will be discard - // by nuraft before append_log. - if (lsn <= last_commit_lsn) { - RD_LOGT("Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, - last_commit_lsn); - continue; - } - // Those LSNs already in logstore but not yet committed, will be dedup here, - // applier_create_req will return same req as previous one - auto req = m_state_machine->localize_journal_entry_prepare(*entry); - if (req == nullptr) { - sisl::VectorPool< repl_req_ptr_t >::free(reqs); - // The hint set here will be used by the next after next appendEntry, the next one - // always go with -1 from NuRraft code. - // - // We are rejecting this log entry, meaning we can accept previous log entries. - // If there is nothing we can accept(i==0), that maens we are waiting for commit - // of previous lsn, set it to 1 in this case. 
- m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); - return nuraft::cb_func::ReturnCode::ReturnNull; - } - reqs->emplace_back(std::move(req)); + auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); + auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); + for (unsigned long i = 0; i < entries.size(); i++) { + auto& entry = entries[i]; + auto lsn = start_lsn + i; + auto term = entry->get_term(); + if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } + if (entry->get_buf_ptr()->size() == 0) { continue; } + // skipping localize for already committed log(dup), they anyway will be discard + // by nuraft before append_log. + if (lsn <= last_commit_lsn) { + RD_LOGT(NO_TRACE_ID, "Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, + last_commit_lsn); + continue; } + // Those LSNs already in logstore but not yet committed, will be dedup here, + // applier_create_req will return same req as previous one + auto req = m_state_machine->localize_journal_entry_prepare(*entry); + if (req == nullptr) { + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + // The hint set here will be used by the next after next appendEntry, the next one + // always go with -1 from NuRraft code. + // + // We are rejecting this log entry, meaning we can accept previous log entries. + // If there is nothing we can accept(i==0), that maens we are waiting for commit + // of previous lsn, set it to 1 in this case. 
+ m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); + return nuraft::cb_func::ReturnCode::ReturnNull; + } + reqs->emplace_back(std::move(req)); + } - // Wait till we receive the data from its originator for all the requests - std::vector< repl_req_ptr_t > timeout_rreqs; - if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { - for (auto const& rreq : timeout_rreqs) { - handle_error(rreq, ReplServiceError::TIMEOUT); - } - ret = nuraft::cb_func::ReturnCode::ReturnNull; + // Wait till we receive the data from its originator for all the requests + std::vector< repl_req_ptr_t > timeout_rreqs; + if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { + for (auto const& rreq : timeout_rreqs) { + handle_error(rreq, ReplServiceError::TIMEOUT); } - sisl::VectorPool< repl_req_ptr_t >::free(reqs); + ret = nuraft::cb_func::ReturnCode::ReturnNull; } + sisl::VectorPool< repl_req_ptr_t >::free(reqs); if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } return ret; } case nuraft::cb_func::Type::JoinedCluster: - RD_LOGD("Raft channel: Received JoinedCluster, implies become_follower"); + RD_LOGD(NO_TRACE_ID, "Raft channel: Received JoinedCluster, implies become_follower"); become_follower_cb(); return nuraft::cb_func::ReturnCode::Ok; case nuraft::cb_func::Type::BecomeFollower: { - RD_LOGD("Raft channel: Received BecomeFollower"); + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeFollower"); become_follower_cb(); return nuraft::cb_func::ReturnCode::Ok; } case nuraft::cb_func::Type::BecomeLeader: { - RD_LOGD("Raft channel: Received BecomeLeader"); + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeLeader"); become_leader_cb(); return nuraft::cb_func::ReturnCode::Ok; } @@ -1395,11 +1425,12 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, void RaftReplDev::flush_durable_commit_lsn() { if 
(is_destroyed()) { - RD_LOGI("Raft repl dev is destroyed, ignore flush durable commmit lsn"); + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore flush durable commit lsn"); return; } auto const lsn = m_commit_upto_lsn.load(); + RD_LOGT(NO_TRACE_ID, "Flushing durable commit lsn to {}", lsn); std::unique_lock lg{m_sb_mtx}; m_rd_sb->durable_commit_lsn = lsn; m_rd_sb.write(); @@ -1408,7 +1439,7 @@ void RaftReplDev::flush_durable_commit_lsn() { /////////////////////////////////// Private metohds //////////////////////////////////// void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { if (is_destroyed()) { - RD_LOGI("Raft repl dev is destroyed, ignore cp flush"); + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore cp flush"); return; } @@ -1430,8 +1461,8 @@ void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { m_rd_sb->last_applied_dsn = dsn; m_rd_sb.write(); m_last_flushed_commit_lsn = lsn; - RD_LOGD("cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, m_next_dsn.load(), - cp->to_string()); + RD_LOGD(NO_TRACE_ID, "cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, + m_next_dsn.load(), cp->to_string()); } cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { @@ -1439,8 +1470,8 @@ cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { auto const clsn = m_compact_lsn.load(); auto const dsn = m_next_dsn.load(); - RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", (void*)this, cp_lsn, - clsn, dsn, cp->to_string()); + RD_LOGD(NO_TRACE_ID, "getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", + (void*)this, cp_lsn, clsn, dsn, cp->to_string()); auto dev_ctx = std::make_shared< ReplDevCPContext >(); dev_ctx->cp_lsn = cp_lsn; dev_ctx->compacted_to_lsn = clsn; @@ -1461,7 +1492,7 @@ void RaftReplDev::gc_repl_reqs() { std::vector< repl_req_ptr_t > expired_rreqs; auto req_map_size = 
m_repl_key_req_map.size(); - RD_LOGI("m_repl_key_req_map size is {};", req_map_size); + RD_LOGI(NO_TRACE_ID, "m_repl_key_req_map size is {};", req_map_size); for (auto [key, rreq] : m_repl_key_req_map) { // FIXME: Skipping proposer for now, the DSN in proposer increased in proposing stage, not when commit(). // Need other mechanism. @@ -1471,7 +1502,8 @@ void RaftReplDev::gc_repl_reqs() { } if (rreq->dsn() < cur_dsn && rreq->is_expired()) { // The DSN can be out of order, wait till rreq expired. - RD_LOGD("legacy req with commited DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", + RD_LOGD(rreq->traceID(), + "legacy req with commited DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", rreq->to_string(), rreq->dsn(), cur_dsn, cur_dsn - rreq->dsn(), get_elapsed_time_sec(rreq->created_time())); expired_rreqs.push_back(rreq); @@ -1489,27 +1521,28 @@ void RaftReplDev::gc_repl_reqs() { return; } if (rreq->is_expired()) { - RD_LOGD("StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), + RD_LOGD(rreq->traceID(), "StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), get_elapsed_time_sec(rreq->created_time())); } }); - RD_LOGI("state_machine req map size is {};", sm_req_cnt); + RD_LOGT(NO_TRACE_ID, "state_machine req map size is {};", sm_req_cnt); for (auto removing_rreq : expired_rreqs) { // once log flushed, the commit progress controlled by raft if (removing_rreq->has_state(repl_req_state_t::LOG_FLUSHED)) { - RD_LOGI("Skipping GC rreq [{}] because it is in state machine", removing_rreq->to_string()); + RD_LOGT(removing_rreq->traceID(), "Skipping GC rreq [{}] because it is in state machine", + removing_rreq->to_string()); continue; } // do garbage collection // 1. 
free the allocated blocks - RD_LOGI("Removing rreq [{}]", removing_rreq->to_string()); + RD_LOGD(removing_rreq->traceID(), "Removing rreq [{}]", removing_rreq->to_string()); if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = removing_rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + data_service().async_free_blk(blkid).thenValue([this, blkid, removing_rreq](auto&& err) { HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string()); - RD_LOGD("GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); }); } // 2. remove from the m_repl_key_req_map @@ -1524,7 +1557,7 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); if (need_skip_processing(repl_lsn)) { - RD_LOGI("Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn); + RD_LOGI(NO_TRACE_ID, "Raft Channel: Log {} is outdated and will be handled by baseline resync. 
Ignoring replay.", lsn); return; } @@ -1541,7 +1574,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", + RD_LOGT(jentry->traceID, + "Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", jentry->server_id, lentry->get_term(), repl_lsn, jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { @@ -1585,14 +1619,14 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx auto status = rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size, m_listener); if (status != ReplServiceError::OK) { - RD_LOGE("Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); + RD_LOGE(jentry->traceID, "Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); } // we load the log from log device, implies log flushed. We only flush log after data is written to data device. rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->add_state(repl_req_state_t::LOG_RECEIVED); rreq->add_state(repl_req_state_t::LOG_FLUSHED); - RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Replay log on restart, rreq=[{}]", rreq->to_string()); // 2. Pre-commit the log entry as in nuraft pre-commit was called once log appended to logstore. 
m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); @@ -1617,7 +1651,7 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { auto msg_size = sizeof(snp_repl_dev_data); msg.dsn = m_next_dsn; auto crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(&msg), msg_size); - RD_LOGD("create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); + RD_LOGD(NO_TRACE_ID, "create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); msg.crc = crc; data_out = nuraft::buffer::alloc(msg_size); std::memcpy(data_out->data_begin(), &msg, msg_size); @@ -1627,17 +1661,20 @@ bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { - RD_LOGE("Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, msg->protocol_version); + RD_LOGE(NO_TRACE_ID, "Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, + msg->protocol_version); return false; } auto received_crc = msg->crc; - RD_LOGD("received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, received_crc); + RD_LOGD(NO_TRACE_ID, "received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, + received_crc); // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. 
msg->crc = 0; auto computed_crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), sizeof(snp_repl_dev_data)); if (received_crc != computed_crc) { - RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc); + RD_LOGE(NO_TRACE_ID, "Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, + computed_crc); return false; } { @@ -1650,7 +1687,7 @@ bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s } if (msg->dsn > m_next_dsn) { m_next_dsn = msg->dsn; - RD_LOGD("Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); + RD_LOGD(NO_TRACE_ID, "Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); return true; } return true; @@ -1664,8 +1701,8 @@ bool RaftReplDev::is_resync_mode() { auto diff = leader_commited_lsn - my_log_idx; bool resync_mode = (diff > HS_DYNAMIC_CONFIG(consensus.resync_log_idx_threshold)); if (resync_mode) { - RD_LOGD("Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", leader_commited_lsn, - my_log_idx, diff); + RD_LOGD(NO_TRACE_ID, "Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", + leader_commited_lsn, my_log_idx, diff); } return resync_mode; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index bccdde53b..b571bc7ce 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -230,7 +230,9 @@ class RaftReplDev : public ReplDev, auto committed_lsn = m_commit_upto_lsn.load(); auto gate = m_traffic_ready_lsn.load(); bool ready = committed_lsn >= gate; - if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); } + if (!ready) { + RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); + } return ready; } // purge all resources (e.g., logs in logstore) is a very 
dangerous operation, it is not supported yet. @@ -269,12 +271,12 @@ class RaftReplDev : public ReplDev, // was a follower, m_traffic_ready_lsn should be zero on follower. RD_REL_ASSERT(existing_gate == 0, "existing gate should be zero"); } - RD_LOGD("become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); + RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); }; void become_follower_cb() { // m_traffic_ready_lsn should be zero on follower. m_traffic_ready_lsn.store(0); - RD_LOGD("become_follower_cb setting traffic_ready_lsn to 0"); + RD_LOGD(NO_TRACE_ID, "become_follower_cb setting traffic_ready_lsn to 0"); } /// @brief This method is called when the data journal is compacted @@ -378,7 +380,7 @@ class RaftReplDev : public ReplDev, void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); - void reset_quorum_size(uint32_t commit_quorum); + void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id); void create_snp_resync_data(raft_buf_ptr_t& data_out); bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); }; diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 12d987c9b..69db144e7 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -32,7 +32,7 @@ static std::pair< sisl::blob, sisl::blob > header_only_extract(nuraft::buffer& b ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { rreq->create_journal_entry(true /* raft_buf */, m_rd.server_id()); - RD_LOGT("Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); auto* vec = sisl::VectorPool< raft_buf_ptr_t >::alloc(); 
vec->push_back(rreq->raft_journal_buf()); @@ -41,7 +41,7 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { sisl::VectorPool< raft_buf_ptr_t >::free(vec); if (append_status && !append_status->get_accepted()) { - RD_LOGE("Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), + RD_LOGE(rreq->traceID(), "Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), append_status->get_result_code()); return RaftReplService::to_repl_error(append_status->get_result_code()); } @@ -54,8 +54,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", jentry->server_id, - lentry.get_term(), jentry->to_string()); + RD_LOGT(jentry->traceID, "Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", + jentry->server_id, lentry.get_term(), jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; @@ -119,9 +119,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr out: if (rreq == nullptr) { - RD_LOG(ERROR, - "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", - rkey.to_string(), jentry->to_string()); + RD_LOGE(rreq->traceID(), + "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", + rkey.to_string(), jentry->to_string()); } return rreq; } @@ -180,7 +180,7 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa int64_t lsn = s_cast< int64_t >(params.log_idx); repl_req_ptr_t rreq = lsn_to_req(lsn); - RD_LOGD("Raft channel: Precommit 
rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Precommit rreq=[{}]", rreq->to_compact_string()); m_rd.m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); return m_success_ptr; @@ -188,22 +188,18 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) { int64_t lsn = s_cast< int64_t >(params.log_idx); + repl_req_ptr_t rreq = lsn_to_req(lsn); if (m_rd.need_skip_processing(lsn)) { - RD_LOGI("Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn); + RD_LOGI(rreq->traceID(), "Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn); return m_success_ptr; } - RD_LOGD("Raft channel: Received Commit message lsn {} store {} logdev {} size {}", lsn, - m_rd.m_data_journal->logstore_id(), m_rd.m_data_journal->logdev_id(), params.data->size()); - repl_req_ptr_t rreq = lsn_to_req(lsn); RD_DBG_ASSERT(rreq != nullptr, "Raft channel got null rreq for lsn={}", lsn); - RD_LOGD("Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); + RD_LOGT(rreq->traceID(), "Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); if (rreq->is_proposer()) { // This is the time to ensure flushing of journal happens in the proposer rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - m_rd.handle_commit(rreq); - return m_success_ptr; } @@ -211,11 +207,11 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt // when reaching here, the config change log has already been committed, and the new config has been applied to the // cluster if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) { - RD_LOGI("Raft Channel: Config {} is expected to be handled by snapshot. Skipping commit.", log_idx); + RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. 
Skipping commit.", log_idx); return; } - RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx); + RD_LOGD(NO_TRACE_ID, "Raft channel: Commit new cluster conf , log_idx = {}", log_idx); #ifdef _PRERELEASE auto& servers_in_new_conf = new_conf->get_servers(); @@ -235,15 +231,15 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt oss << "," << *it; } - RD_LOG(INFO, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, - m_rd.group_id_str()); + RD_LOGI(NO_TRACE_ID, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, + m_rd.group_id_str()); #endif m_rd.handle_config_commit(s_cast< repl_lsn_t >(log_idx), new_conf); } void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { - RD_LOGD("Raft channel: Rollback cluster conf , log_idx = {}", log_idx); + RD_LOGD(NO_TRACE_ID, "Raft channel: Rollback cluster conf , log_idx = {}", log_idx); // TODO:add more logic here if necessary } @@ -251,11 +247,11 @@ void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& int64_t lsn = s_cast< int64_t >(params.log_idx); repl_req_ptr_t rreq = lsn_to_req(lsn); if (rreq == nullptr) { - RD_LOG(ERROR, "Raft channel: Rollback lsn {} rreq not found", lsn); + RD_LOGE(NO_TRACE_ID, "Raft channel: Rollback lsn {} rreq not found", lsn); return; } - RD_LOGD("Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); + RD_LOGD(rreq->traceID(), "Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); m_rd.handle_rollback(rreq); } @@ -285,7 +281,7 @@ void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_p } uint64_t RaftStateMachine::last_commit_index() { - RD_LOG(DEBUG, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); + RD_LOGD(NO_TRACE_ID, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); return 
uint64_cast(m_rd.get_last_commit_lsn()); } @@ -295,7 +291,7 @@ void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { // it is possible a LSN mapped to different rreq in history // due to log overwritten. Verify the rreq before removing auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); - if (deleted) { RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } + if (deleted) { RD_LOGT(rreq->traceID(), "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { @@ -305,8 +301,8 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { rreq->set_created_time(); auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); if (!r.second) { - RD_LOG(ERROR, "lsn={} already in precommit list, exist_term={}, is_volatile={}", - lsn, r.first->second->term(), r.first->second->is_volatile()); + RD_LOGE(rreq->traceID(), "lsn={} already in precommit list, exist_term={}, is_volatile={}", lsn, + r.first->second->term(), r.first->second->is_volatile()); // TODO: we need to think about the case where volatile is in the map already, is it safe to overwrite it? } } @@ -337,7 +333,7 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, // uncommitted logs may or may not included in the snapshot data sent by leader, // depending on the racing of commit vs snapshot read, leading to data inconsistency. if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) { - RD_LOG(WARN, "not ready to read because there are some uncommitted logs in snapshot, " + RD_LOGW(NO_TRACE_ID, "not ready to read because there are some uncommitted logs in snapshot, " "let nuraft retry later. 
snapshot log_idx={}, last_commit_lsn={}", s.get_last_log_idx(), m_rd.get_last_commit_lsn()); return -1; @@ -388,7 +384,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, snp_data->is_last_obj = is_last_obj; // We are doing a copy here. - sisl::io_blob_safe blob{static_cast(data.size())}; + sisl::io_blob_safe blob{static_cast< uint32_t >(data.size())}; std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 97de4ec3b..7da37d5c5 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -24,7 +24,9 @@ namespace homestore { class ReplicaSetImpl; class StateMachineStore; -#define RD_LOG(level, msg, ...) LOG##level##MOD(replication, "[{}] " msg, identify_str(), ##__VA_ARGS__) +#define NO_TRACE_ID "n/a" +#define RD_LOG(level, traceID, msg, ...) \ + LOG##level##MOD(replication, "[traceID={}] [{}] " msg, traceID, identify_str(), ##__VA_ARGS__) #define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ { \ @@ -69,12 +71,12 @@ class StateMachineStore; #define RD_REL_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >, val2, ##__VA_ARGS__) #define RD_REL_ASSERT_GE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >=, val2, ##__VA_ARGS__) -#define RD_LOGT(...) RD_LOG(TRACE, ##__VA_ARGS__) -#define RD_LOGD(...) RD_LOG(DEBUG, ##__VA_ARGS__) -#define RD_LOGI(...) RD_LOG(INFO, ##__VA_ARGS__) -#define RD_LOGW(...) RD_LOG(WARN, ##__VA_ARGS__) -#define RD_LOGE(...) RD_LOG(ERROR, ##__VA_ARGS__) -#define RD_LOGC(...) RD_LOG(CRITICAL, ##__VA_ARGS__) +#define RD_LOGT(traceID, ...) RD_LOG(TRACE, traceID, ##__VA_ARGS__) +#define RD_LOGD(traceID, ...) RD_LOG(DEBUG, traceID, ##__VA_ARGS__) +#define RD_LOGI(traceID, ...) RD_LOG(INFO, traceID, ##__VA_ARGS__) +#define RD_LOGW(traceID, ...) 
RD_LOG(WARN, traceID, ##__VA_ARGS__) +#define RD_LOGE(traceID, ...) RD_LOG(ERROR, traceID, ##__VA_ARGS__) +#define RD_LOGC(traceID, ...) RD_LOG(CRITICAL, traceID, ##__VA_ARGS__) // For the logic snapshot obj_id, we use the highest bit to indicate the type of the snapshot message. // 0 is for HS, 1 is for Application. From 36e00d9226b316c93f8b89b3eb6524f9322375aa Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 2 Apr 2025 02:15:32 -0700 Subject: [PATCH 087/130] bump version Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 7fab131b8..b9500eed9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.8" + version = "6.8.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From de3cd1acaded3ec32e2c59a74b06238cd013fcca Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 2 Apr 2025 23:50:38 +0800 Subject: [PATCH 088/130] Fix NPE Signed-off-by: Xiaoxi Chen --- src/lib/replication/repl_dev/raft_state_machine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 69db144e7..ebf262e8a 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -119,7 +119,7 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr out: if (rreq == nullptr) { - RD_LOGE(rreq->traceID(), + RD_LOGE(rkey.traceID, "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", rkey.to_string(), jentry->to_string()); } From 2fe4400cec2ed2467b2fd35d6d37b1cb277e222b Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 7 Apr 2025 16:05:45 +0800 Subject: [PATCH 089/130] Support custom rdev name - Add a set_custom_rdev_name function to support 
users to assign more meaningful name for debugging - Add repl_req_ctx into get_blk_alloc_hints --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 5 ++++- src/lib/replication/repl_dev/common.cpp | 2 +- src/lib/replication/repl_dev/common.h | 6 ++++++ src/lib/replication/repl_dev/raft_repl_dev.cpp | 5 +++-- src/lib/replication/repl_dev/raft_repl_dev.h | 8 +++++++- src/lib/replication/repl_dev/solo_repl_dev.h | 5 +++++ src/tests/test_common/raft_repl_test_base.hpp | 2 +- src/tests/test_solo_repl_dev.cpp | 2 +- 9 files changed, 29 insertions(+), 8 deletions(-) diff --git a/conanfile.py b/conanfile.py index b9500eed9..1751a307a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.8.0" + version = "6.9.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 832a446b5..872738afd 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -336,7 +336,7 @@ class ReplDevListener { /// @return Expected to return blk_alloc_hints for this write. If the hints are not available, then return the /// error. It is to be noted this method should return error only in very abnornal cases as in some code flow, an /// error would result in a crash or stall of the entire commit thread. - virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) = 0; + virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources. /// However, it is expected that this call be idempotent. 
It is possible in rare scenarios that this can be called @@ -458,6 +458,9 @@ class ReplDev { /// @return group_id virtual group_id_t group_id() const = 0; + /// @brief Sets a custom name for the repldev. Users can assign a meaningful name to the repldev for easy debugging. + virtual void set_custom_rdev_name(std::string const& name) = 0; + /// @brief Gets the block size with which IO will happen on this device /// @return Block size virtual uint32_t get_blk_size() const = 0; diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 5d0f262f0..8cea3cc5a 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -103,7 +103,7 @@ void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_h ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& listener, uint32_t data_size) { DEBUG_ASSERT(has_linked_data(), "Trying to allocate a block for non-inlined block"); - auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size); + auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size, repl_req_ptr_t(this)); if (hints_result.hasError()) { return hints_result.error(); } if (hints_result.value().committed_blk_id.has_value()) { diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index cf8f53759..43bbb7cbf 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -58,6 +58,7 @@ struct repl_journal_entry { struct repl_dev_superblk { static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; static constexpr uint32_t REPL_DEV_SB_VERSION = 1; + static constexpr size_t max_name_len = 64; uint64_t magic{REPL_DEV_SB_MAGIC}; uint32_t version{REPL_DEV_SB_VERSION}; @@ -68,9 +69,14 @@ struct repl_dev_superblk { repl_lsn_t checkpoint_lsn; // LSN upto which this replica have checkpointed the Data repl_lsn_t compact_lsn; // maximum LSN that can be 
compacted to uint64_t group_ordinal; // Ordinal number which will be used to indicate the rdevXYZ for debugging + char rdev_name[max_name_len]; // Short name for the group for easy debugging uint64_t get_magic() const { return magic; } uint32_t get_version() const { return version; } + void set_rdev_name(std::string const& name) { + std::strncpy(rdev_name, name.c_str(), max_name_len - 1); + rdev_name[max_name_len - 1] = '\0'; + } }; #pragma pack() diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f3e907963..ddc8851f2 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -48,8 +48,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_last_flushed_commit_lsn = m_commit_upto_lsn; m_compact_lsn = m_rd_sb->compact_lsn; - m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); - + m_rdev_name = m_rd_sb->rdev_name; // Its ok not to do compare exchange, because loading is always single threaded as of now if (m_rd_sb->group_ordinal >= s_next_group_ordinal.load()) { s_next_group_ordinal.store(m_rd_sb->group_ordinal + 1); @@ -72,6 +71,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->last_snapshot_lsn = 0; m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1); m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + m_rd_sb->set_rdev_name(m_rdev_name); if (m_rd_sb->is_timeline_consistent) { m_free_blks_journal = logstore_service().create_new_log_store(m_rd_sb->logdev_id, false /* append_mode */); @@ -80,6 +80,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb.write(); bind_data_service(); } + m_identify_str = m_rdev_name + ":" + group_id_str(); RD_LOGI(NO_TRACE_ID, diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index b571bc7ce..666105eac 100644 --- 
a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -216,8 +216,14 @@ class RaftReplDev : public ReplDev, std::vector< peer_info > get_replication_status() const override; std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + RD_LOGI(NO_TRACE_ID, "Resetting repl dev name from {} to {}", m_rdev_name, name); + m_rdev_name = name; + m_identify_str = name + ":" + group_id_str(); + m_rd_sb->set_rdev_name(m_rdev_name); + } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } - std::string rdev_name() const { return m_rdev_name; }; + std::string rdev_name() const { return m_rd_sb->rdev_name; }; std::string identify_str() const { return m_identify_str; }; std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index b1708d5d4..9bd3040f1 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -62,6 +62,11 @@ class SoloReplDev : public ReplDev { uuid_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + std::strncpy(m_rd_sb->rdev_name, name.c_str(), m_rd_sb->max_name_len - 1); + m_rd_sb->rdev_name[m_rd_sb->max_name_len - 1] = '\0'; + } + repl_lsn_t get_last_commit_lsn() const override { return 0; } uint32_t get_blk_size() const override; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 8fe72ac1d..6b8fb4c35 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -318,7 +318,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { void free_user_snp_ctx(void*& 
user_snp_ctx) override {} - ReplResult get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + ReplResult get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { auto jheader = r_cast(header.cbytes()); Key k{.id_ = jheader->key_id}; auto iter = inmem_db_.find(k); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 3865cd2f3..861e12e5e 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -125,7 +125,7 @@ class SoloReplDevTest : public testing::Test { void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override {} - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { return blk_alloc_hints{}; } From cc7aa0b005624d90d5b06b9fc8a47c8e6c36dd43 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 8 Apr 2025 11:41:37 +0800 Subject: [PATCH 090/130] add grpc message size as config --- conanfile.py | 2 +- src/include/homestore/homestore_decl.hpp | 2 ++ src/lib/common/homestore_config.fbs | 3 +++ src/lib/homestore.cpp | 9 +++++++++ .../replication/service/raft_repl_service.cpp | 4 +++- src/tests/test_common/raft_repl_test_base.hpp | 8 +++++--- src/tests/test_raft_repl_dev.cpp | 16 ++++++++++++++++ 7 files changed, 39 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index 1751a307a..90805c711 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.0" + version = "6.9.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 
db5bfd24c..859b4c59c 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -170,6 +170,8 @@ struct hs_input_params { uint64_t app_mem_size{static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024)}; // memory available for the app (including cache) uint64_t hugepage_size{0}; // memory available for the hugepage + int max_data_size{0}; // max data size in byte on the data plane + int max_snapshot_batch_size{0}; // max snapshot batch size in byte for the raft state machine bool is_read_only{false}; // Is read only bool auto_recovery{true}; // Recovery of data is automatic or controlled by the caller diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 984a471e2..33f0ae77e 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -255,6 +255,9 @@ table Consensus { // Max append batch size max_append_batch_size: int32 = 64; + // Max grpc message size + max_grpc_message_size: int32 = 67108864; + // Threshold of log gap from leader to consider a replica as stale stale_log_gap_hi_threshold: int32 = 200; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 85ca4aa9b..3f8ee3737 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -164,6 +164,15 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HomeStoreDynamicConfig::init_settings_default(); + // Check if the max_grpc_message_size is large enough to hold the data and snapshot batch size + if (HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_data_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size) { + LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {} and max_snapshot_batch_size {}", + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, + input.max_snapshot_batch_size); + throw 
std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); + } + #ifdef _PRERELEASE // Start a default crash simulator which raises SIGKILL, in case user has not provided with_crash_simulator() // callback diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 2b355cebd..abfe84fda 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -79,7 +79,9 @@ void RaftReplService::start() { .ssl_key_ = ioenvironment.get_ssl_key(), .ssl_cert_ = ioenvironment.get_ssl_cert(), .token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()), - .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client())}; + .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client()), + .max_receive_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), + .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size)}; m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */); LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 6b8fb4c35..47778d9a8 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -566,7 +566,8 @@ class RaftReplDevTestBase : public testing::Test { } while (true); } - void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr, + uint64_t* data_size = nullptr) { if (dbs_[0]->repl_dev() == nullptr) return; do { @@ -587,9 
+588,10 @@ class RaftReplDevTestBase : public testing::Test { g_helper->runner().set_num_tasks(num_entries); LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); - g_helper->runner().set_task([this, block_size, db]() { + g_helper->runner().set_task([this, block_size, db, data_size]() { static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); + uint64_t size = data_size == nullptr ? std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; + this->generate_writes(size, block_size, db); }); if (wait_for_commit) { g_helper->runner().execute().get(); } break; diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index cdcfa9b1e..7f7345e10 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -451,6 +451,22 @@ TEST_F(RaftReplDevTest, BaselineTest) { LOGINFO("BaselineTest done"); } +TEST_F(RaftReplDevTest, LargeDataWrite) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + // TODO: Increase the data size (e.g., to 16MB) for testing. + // For now, use 4MB to ensure the test passes since there are issues with larger IO sizes on the uring drive. 
+ uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >(); + uint64_t data_size = 4 * 1024 * 1024; + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */, nullptr, &data_size); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); +} + int main(int argc, char* argv[]) { int parsed_argc = argc; char** orig_argv = argv; From c0ad259a795e95889088048ff67decaf2f01b9d6 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Wed, 9 Apr 2025 14:51:43 +0800 Subject: [PATCH 091/130] add metric for blk usage --- conanfile.py | 2 +- src/lib/device/chunk.cpp | 4 ++++ src/lib/device/chunk.h | 3 +++ src/lib/replication/repl_dev/raft_repl_dev.cpp | 15 +++++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 9 +++++++++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 90805c711..dc657317d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.1" + version = "6.9.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/device/chunk.cpp b/src/lib/device/chunk.cpp index 9eb8563de..4962be386 100644 --- a/src/lib/device/chunk.cpp +++ b/src/lib/device/chunk.cpp @@ -29,6 +29,10 @@ std::string Chunk::to_string() const { vdev_ordinal(), stream_id()); } +float Chunk::get_blk_usage() const { + return s_cast(m_blk_allocator->get_used_blks()) / s_cast(m_blk_allocator->get_total_blks()); +} + void Chunk::set_user_private(const sisl::blob& data) { std::unique_lock lg{m_mgmt_mutex}; m_chunk_info.set_user_private(data); diff --git a/src/lib/device/chunk.h b/src/lib/device/chunk.h index 77b275e4b..b9d84abdb 100644 --- a/src/lib/device/chunk.h +++ b/src/lib/device/chunk.h @@ -27,6 +27,7 @@ class Chunk { const uint32_t m_stream_id; uint32_t m_vdev_ordinal{0}; shared< BlkAllocator > 
m_blk_allocator; + float blk_usage_report_threshold{0.9}; public: static constexpr auto MAX_CHUNK_SIZE = std::numeric_limits< uint32_t >::max(); @@ -66,6 +67,8 @@ class Chunk { nlohmann::json get_status([[maybe_unused]] int log_level) const; const BlkAllocator* blk_allocator() const { return m_blk_allocator.get(); } BlkAllocator* blk_allocator_mutable() { return m_blk_allocator.get(); } + float get_blk_usage_report_threshold() const { return blk_usage_report_threshold; } + float get_blk_usage() const; ////////////// Setters ///////////////////// void set_user_private(const sisl::blob& data); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index ddc8851f2..acecb0e49 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -18,6 +18,7 @@ // #include "common/homestore_flip.hpp" #include "replication/service/raft_repl_service.h" #include "replication/repl_dev/raft_repl_dev.h" +#include "device/chunk.h" #include "device/device.h" #include "push_data_rpc_generated.h" #include "fetch_data_rpc_generated.h" @@ -1385,6 +1386,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); return nuraft::cb_func::ReturnCode::ReturnNull; } + report_blk_metrics_if_needed(req); reqs->emplace_back(std::move(req)); } @@ -1708,4 +1710,17 @@ bool RaftReplDev::is_resync_mode() { return resync_mode; } +void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { + auto chunk_id = rreq->local_blkid().chunk_num(); + auto chunk = hs()->device_mgr()->get_chunk(chunk_id); + if (chunk->get_blk_usage() >= chunk->get_blk_usage_report_threshold()) { + auto local_blk_num = rreq->local_blkid().blk_num(); + auto remote_blk_num = rreq->remote_blkid().blkid.blk_num(); + // Focus only on cases where the locally allocated blocks exceed the proposer's allocated blocks, + // as this indicates 
that the member might encounter NO_SPACE_LEFT before the proposer. + auto blk_diff_with_remote = local_blk_num > remote_blk_num ? local_blk_num - remote_blk_num : 0; + HISTOGRAM_OBSERVE(m_metrics, blk_diff_with_proposer, blk_diff_with_remote); + } +} + } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 666105eac..aedcd8475 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -90,6 +90,13 @@ class RaftReplDevMetrics : public sisl::MetricsGroup { REGISTER_HISTOGRAM(rreq_pieces_per_write, "Number of individual pieces per write", HistogramBucketsType(LinearUpto64Buckets)); + // In the identical layout chunk, the blk num of the follower and leader is expected to be the same. + // However, due to the concurrency between the data channel and the raft channel, there might be some + // allocation differences on the same lsn. When a leader switch occurs, these differences could become garbage. + // This metric can partially reflect the potential amount of garbage. + REGISTER_HISTOGRAM(blk_diff_with_proposer, + "allocated blk num diff on the same lsn with proposer when chunk usage >= 0.9"); + // Raft channel metrics REGISTER_HISTOGRAM(raft_end_of_append_batch_latency_us, "Raft end_of_append_batch latency in us", "raft_logstore_append_latency", {"op", "end_of_append_batch"}); @@ -389,6 +396,8 @@ class RaftReplDev : public ReplDev, void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id); void create_snp_resync_data(raft_buf_ptr_t& data_out); bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); + + void report_blk_metrics_if_needed(repl_req_ptr_t rreq); }; } // namespace homestore From 347c9cc043b2c53f611a3f129b0894ecf63d2114 Mon Sep 17 00:00:00 2001 From: Sanal Date: Thu, 10 Apr 2025 02:41:21 +0530 Subject: [PATCH 092/130] Fix solo repl dev log flush and graceful shutdown. 
Add flush mode to logdev as nublocks uses timer, nuobject uses explicit log flush mode. Flush mode has to be stored in superblk to support recovery. Enable solo repl dev UT. Add graceful shutdown for UT to work. --- conanfile.py | 2 +- .../homestore/logstore/log_store_internal.hpp | 7 +++ src/include/homestore/logstore_service.hpp | 6 +-- src/lib/logstore/log_dev.cpp | 40 ++++++++++++---- src/lib/logstore/log_dev.hpp | 10 ++-- src/lib/logstore/log_store_service.cpp | 17 ++++--- .../log_store/home_raft_log_store.cpp | 8 ++-- .../replication/repl_dev/solo_repl_dev.cpp | 26 +++++++++-- src/lib/replication/repl_dev/solo_repl_dev.h | 2 +- .../replication/service/generic_repl_svc.cpp | 21 ++++++++- src/tests/CMakeLists.txt | 2 +- src/tests/log_store_benchmark.cpp | 2 +- src/tests/test_log_dev.cpp | 46 +++++++++---------- src/tests/test_log_store.cpp | 6 +-- src/tests/test_log_store_long_run.cpp | 6 +-- src/tests/test_solo_repl_dev.cpp | 20 ++++---- 16 files changed, 143 insertions(+), 78 deletions(-) diff --git a/conanfile.py b/conanfile.py index dc657317d..13e1a573b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.2" + version = "6.9.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/logstore/log_store_internal.hpp b/src/include/homestore/logstore/log_store_internal.hpp index 9b7019cfb..7768086ee 100644 --- a/src/include/homestore/logstore/log_store_internal.hpp +++ b/src/include/homestore/logstore/log_store_internal.hpp @@ -52,6 +52,12 @@ typedef std::function< void(std::shared_ptr< HomeLogStore >, logstore_seq_num_t) typedef int64_t logid_t; +VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) + INLINE = 1 << 0, // Allow flush inline with the append + TIMER = 1 << 1, // Allow timer based automatic flush + EXPLICIT = 1 << 2, // Allow explcitly user calling flush +); + struct logdev_key { 
logid_t idx; off_t dev_offset; @@ -172,4 +178,5 @@ struct logstore_superblk { logstore_seq_num_t m_first_seq_num{0}; }; #pragma pack() + } // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index 18c1e75e3..039e14114 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -93,7 +93,7 @@ class LogStoreService { * chunks. Logdev can start with zero chunks and dynamically add chunks based on write request. * @return Newly created log dev id. */ - logdev_id_t create_new_logdev(); + logdev_id_t create_new_logdev(flush_mode_t flush_mode); /** * @brief Open a log dev. @@ -101,7 +101,7 @@ class LogStoreService { * @param logdev_id: Logdev ID * @return Newly created log dev id. */ - void open_logdev(logdev_id_t logdev_id); + void open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode); /** * @brief Destroy a log dev. @@ -177,7 +177,7 @@ class LogStoreService { void delete_unopened_logdevs(); private: - std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id); + std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode); void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); logdev_id_t get_next_logdev_id(); void logdev_super_blk_found(const sisl::byte_view& buf, void* meta_cookie); diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index d932d68c5..cf4309e00 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -65,7 +65,7 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { // First read the info block if (format) { HS_LOG_ASSERT(m_logdev_meta.is_empty(), "Expected meta to be not present"); - m_logdev_meta.create(m_logdev_id); + m_logdev_meta.create(m_logdev_id, m_flush_mode); m_vdev_jd->update_data_start_offset(0); } else { HS_LOG_ASSERT(!m_logdev_meta.is_empty(), 
"Expected meta data to be read already before loading"); @@ -145,9 +145,30 @@ void LogDev::stop() { m_hs.reset(); } -bool LogDev::is_stopped() { - std::unique_lock lg = flush_guard(); - return m_stopped; +void LogDev::stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + { + std::unique_lock lg = flush_guard(); + // waiting under lock to make sure no new flush is started + while (m_pending_callback.load() > 0) { + THIS_LOGDEV_LOG(INFO, "Waiting for pending callbacks to complete, pending callbacks {}", + m_pending_callback.load()); + std::this_thread::sleep_for(std::chrono::milliseconds{1000}); + } + } + + folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); + for (auto& [_, store] : m_id_logstore_map) + store.log_store->stop(); + + // after we call stop, we need to do any pending device truncations + truncate(); + m_id_logstore_map.clear(); + if (allow_timer_flush()) stop_timer(); } void LogDev::destroy() { @@ -504,7 +525,7 @@ void LogDev::on_flush_completion(LogGroup* lg) { // since we support out-of-order lsn write, so no need to guarantee the order of logstore write completion for (auto const& [idx, req] : req_map) { m_pending_callback++; - iomanager.run_on_forget(iomgr::reactor_regex::random_worker, iomgr::fiber_regex::syncio_only, + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, /* iomgr::fiber_regex::syncio_only, */ [this, dev_offset, idx, req]() { auto ld_key = logdev_key{idx, dev_offset}; auto comp_cb = req->log_store->get_comp_cb(); @@ -544,11 +565,13 @@ uint64_t LogDev::truncate() { // Persist the logstore superblock to ensure correct start LSN during recovery. Avoid such scenario: // 1. Follower1 appends logs up to 100, then is stopped by a sigkill. // 2. Upon restart, a baseline resync is triggered using snapshot 2000. - // 3. 
Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a valid + // 3. Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a + // valid // device offset for LSN 2000 to update it. // 4. Follower1 appends logs from 2001 to 2500, making tail_lsn > 2000. // 5. Get m_trunc_ld_key={0,0}, goto here and return 0 without persist. - // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as [1,2500]. + // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as + // [1,2500]. m_logdev_meta.persist(); return 0; } @@ -750,7 +773,7 @@ nlohmann::json LogDev::get_status(int verbosity) const { /////////////////////////////// LogDevMetadata Section /////////////////////////////////////// LogDevMetadata::LogDevMetadata() : m_sb{logdev_sb_meta_name}, m_rollback_sb{logdev_rollback_sb_meta_name} {} -logdev_superblk* LogDevMetadata::create(logdev_id_t id) { +logdev_superblk* LogDevMetadata::create(logdev_id_t id, flush_mode_t flush_mode) { logdev_superblk* sb = m_sb.create(logdev_sb_size_needed(0)); rollback_superblk* rsb = m_rollback_sb.create(rollback_superblk::size_needed(1)); @@ -759,6 +782,7 @@ logdev_superblk* LogDevMetadata::create(logdev_id_t id) { m_id_reserver = std::make_unique< sisl::IDReserver >(); m_sb->logdev_id = id; + m_sb->flush_mode = flush_mode; m_sb.write(); m_rollback_sb->logdev_id = id; diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 5a8fafc2c..43428d07e 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -404,6 +404,8 @@ struct logdev_superblk { uint32_t num_stores{0}; uint64_t start_dev_offset{0}; logid_t key_idx{0}; + flush_mode_t flush_mode; + // The meta data starts immediately after the super block // Equivalent of: // logstore_superblk meta[0]; @@ -481,7 +483,7 @@ class LogDevMetadata { LogDevMetadata& 
operator=(LogDevMetadata&&) noexcept = delete; ~LogDevMetadata() = default; - logdev_superblk* create(logdev_id_t id); + logdev_superblk* create(logdev_id_t id, flush_mode_t); void reset(); std::vector< std::pair< logstore_id_t, logstore_superblk > > load(); void persist(); @@ -572,12 +574,6 @@ struct logstore_info { static std::string const logdev_sb_meta_name{"Logdev_sb"}; static std::string const logdev_rollback_sb_meta_name{"Logdev_rollback_sb"}; -VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) - INLINE = 1 << 0, // Allow flush inline with the append - TIMER = 1 << 1, // Allow timer based automatic flush - EXPLICIT = 1 << 2, // Allow explcitly user calling flush -); - class LogDev : public std::enable_shared_from_this< LogDev > { friend class HomeLogStore; diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index 8d62bdf05..fd1f8df6a 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -135,10 +135,12 @@ logdev_id_t LogStoreService::get_next_logdev_id() { return id; } -logdev_id_t LogStoreService::create_new_logdev() { +logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { + if (is_stopping()) return 0; + incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); logdev_id_t logdev_id = get_next_logdev_id(); - auto logdev = create_new_logdev_internal(logdev_id); + auto logdev = create_new_logdev_internal(logdev_id, flush_mode); logdev->start(true /* format */, m_logdev_vdev); COUNTER_INCREMENT(m_metrics, logdevs_count, 1); HS_LOG(INFO, logstore, "Created log_dev={}", logdev_id); @@ -179,19 +181,19 @@ void LogStoreService::delete_unopened_logdevs() { m_unopened_logdev.clear(); } -std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id) { - auto logdev = std::make_shared< LogDev >(logdev_id); +std::shared_ptr< LogDev > 
LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode) { + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it == m_id_logdev_map.end()), "logdev id {} already exists", logdev_id); m_id_logdev_map.insert(std::make_pair<>(logdev_id, logdev)); return logdev; } -void LogStoreService::open_logdev(logdev_id_t logdev_id) { +void LogStoreService::open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode) { folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { - auto logdev = std::make_shared< LogDev >(logdev_id); + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); m_id_logdev_map.emplace(logdev_id, logdev); LOGDEBUGMOD(logstore, "log_dev={} does not exist, created!", logdev_id); } @@ -224,13 +226,14 @@ void LogStoreService::logdev_super_blk_found(const sisl::byte_view& buf, void* m folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); std::shared_ptr< LogDev > logdev; auto id = sb->logdev_id; + auto flush_mode = sb->flush_mode; const auto it = m_id_logdev_map.find(id); // We could update the logdev map either with logdev or rollback superblks found callbacks. if (it != m_id_logdev_map.end()) { logdev = it->second; HS_LOG(DEBUG, logstore, "Log dev superblk found log_dev={}", id); } else { - logdev = std::make_shared< LogDev >(id); + logdev = std::make_shared< LogDev >(id, flush_mode); m_id_logdev_map.emplace(id, logdev); // when recover logdev meta blk, we get all the logdevs from the superblk. we put them in m_unopened_logdev // too. 
after logdev meta blks are all recovered, when a client opens a logdev, we remove it from diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index e44b94463..37ef04bee 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -90,7 +90,7 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_dummy_log_entry = nuraft::cs_new< nuraft::log_entry >(0, nuraft::buffer::alloc(0), nuraft::log_val_type::app_log); if (logstore_id == UINT32_MAX) { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store = logstore_service().create_new_log_store(m_logdev_id, true); if (!m_log_store) { throw std::runtime_error("Failed to create log store"); } m_logstore_id = m_log_store->get_store_id(); @@ -99,7 +99,7 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_logdev_id = logdev_id; m_logstore_id = logstore_id; LOGDEBUGMOD(replication, "Opening existing home log_dev={} log_store={}", m_logdev_id, logstore_id); - logstore_service().open_logdev(m_logdev_id); + logstore_service().open_logdev(m_logdev_id, flush_mode_t::EXPLICIT); m_log_store_future = logstore_service() .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb) .thenValue([this](auto log_store) { @@ -380,8 +380,8 @@ ulong HomeRaftLogStore::last_durable_index() { void HomeRaftLogStore::purge_all_logs() { auto last_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); - REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", - m_logstore_id, m_logdev_id, last_lsn); + REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", m_logstore_id, + m_logdev_id, last_lsn); m_log_store->truncate(last_lsn, false /* in_memory_truncate_only 
*/); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 93eef117c..58aa69a96 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -10,7 +10,7 @@ namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { - logstore_service().open_logdev(m_rd_sb->logdev_id); + logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER); logstore_service() .open_log_store(m_rd_sb->logdev_id, m_rd_sb->logstore_id, true /* append_mode */) .thenValue([this](auto log_store) { @@ -19,7 +19,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); }); } else { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); m_data_journal = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; @@ -30,6 +30,8 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t rreq, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } + + incr_pending_request_num(); auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, value.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, value.size, m_listener); @@ -58,6 +60,7 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { data_service().commit_blk(rreq->local_blkid()); m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + decr_pending_request_num(); }); } @@ -66,7 +69,6 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx uint32_t remain_size = buf.size() - sizeof(repl_journal_entry); HS_REL_ASSERT_EQ(entry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry found"); - HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_DATA_LINKED, "Found a journal entry which is not data"); uint8_t const* raw_ptr = r_cast< uint8_t const* >(entry) + sizeof(repl_journal_entry); sisl::blob header{raw_ptr, entry->user_header_size}; @@ -93,11 +95,25 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch, trace_id_t tid) { - return data_service().async_read(bid, sgs, size, part_of_batch); + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } + incr_pending_request_num(); + auto result = data_service().async_read(bid, sgs, size, part_of_batch); + decr_pending_request_num(); + return result; } folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { - return data_service().async_free_blk(bid); + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } + incr_pending_request_num(); + auto result = data_service().async_free_blk(bid); + decr_pending_request_num(); + return result; } 
uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size(); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 9bd3040f1..0a06c7203 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -56,7 +56,7 @@ class SoloReplDev : public ReplDev { bool is_ready_for_traffic() const override { return true; } void purge() override {} - std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override { + std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { return nullptr; } diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 067043185..c1263b7fb 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -80,6 +80,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} +SoloReplService::~SoloReplService() {}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -98,8 +99,23 @@ void SoloReplService::start() { } void SoloReplService::stop() { - GenericReplService::stop(); + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + + // stop all repl_devs + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< SoloReplDev >(it->second); + rdev->stop(); + } + } hs()->logstore_service().stop(); + hs()->data_service().stop(); } AsyncReplResult< shared< ReplDev > > 
SoloReplService::create_repl_dev(group_id_t group_id, @@ -112,6 +128,7 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t auto listener = m_repl_app->create_repl_dev_listener(group_id); listener->set_repl_dev(rdev); rdev->attach_listener(std::move(listener)); + incr_pending_request_num(); { std::unique_lock lg(m_rd_map_mtx); @@ -119,10 +136,12 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t if (!happened) { // We should never reach here, as we have failed to emplace in map, but couldn't find entry DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); + decr_pending_request_num(); return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); } } + decr_pending_request_num(); return make_async_success< shared< ReplDev > >(rdev); } diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 6332fd294..95bb695ad 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -127,7 +127,7 @@ if (${io_tests}) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) - # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() can_build_spdk_io_tests(spdk_tests) diff --git a/src/tests/log_store_benchmark.cpp b/src/tests/log_store_benchmark.cpp index a80f67b45..c34db76a3 100644 --- a/src/tests/log_store_benchmark.cpp +++ b/src/tests/log_store_benchmark.cpp @@ -55,7 +55,7 @@ class BenchLogStore { public: friend class SampleDB; BenchLogStore() { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_log_store->register_log_found_cb(bind_this(BenchLogStore::on_log_found, 3)); 
m_nth_entry.store(0); diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index 753bb63c9..871eafdaf 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -157,9 +157,10 @@ class LogDevTest : public ::testing::Test { } } - void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, uint32_t fixed_size = 0) { + void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, + uint32_t fixed_size = 0) { bool io_memory{false}; - std::vector data_vector; + std::vector< test_log_data* > data_vector; for (int64_t i = 0; i < batch; ++i) { auto* d = prepare_data(lsn + i, io_memory, fixed_size); @@ -245,20 +246,16 @@ class LogDevTest : public ::testing::Test { logid_t get_last_truncate_idx(logdev_id_t logdev_id) { auto status = logstore_service().get_logdev(logdev_id)->get_status(0); - if (status.contains("last_truncate_log_idx")) { - return s_cast(status["last_truncate_log_idx"]); - } + if (status.contains("last_truncate_log_idx")) { return s_cast< logid_t >(status["last_truncate_log_idx"]); } LOGERROR("Failed to get last_truncate_log_idx from logdev status for logdev_id {}", logdev_id); - return static_cast(-1); + return static_cast< logid_t >(-1); } logid_t get_current_log_idx(logdev_id_t logdev_id) { auto status = logstore_service().get_logdev(logdev_id)->get_status(0); - if (status.contains("current_log_idx")) { - return s_cast(status["current_log_idx"]); - } + if (status.contains("current_log_idx")) { return s_cast< logid_t >(status["current_log_idx"]); } LOGERROR("Failed to get current_log_idx from logdev status for logdev_id {}", logdev_id); - return static_cast(-1); + return static_cast< logid_t >(-1); } }; @@ -267,7 +264,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - auto logdev_id = logstore_service().create_new_logdev(); + auto 
logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); const auto store_id = log_store->get_store_id(); @@ -287,7 +284,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) { TEST_F(LogDevTest, Rollback) { LOGINFO("Step 1: Create a single logstore to start rollback test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); auto store_id = log_store->get_store_id(); @@ -295,7 +292,7 @@ TEST_F(LogDevTest, Rollback) { auto restart = [&]() { std::promise< bool > p; auto starting_cb = [&]() { - logstore_service().open_logdev(logdev_id); + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -354,7 +351,7 @@ TEST_F(LogDevTest, Rollback) { TEST_F(LogDevTest, ReTruncate) { LOGINFO("Step 1: Create a single logstore to start re-truncate test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); @@ -381,7 +378,7 @@ TEST_F(LogDevTest, ReTruncate) { TEST_F(LogDevTest, TruncateWithExceedingLSN) { LOGINFO("Step 1: Create a single logstore to start truncate with exceeding LSN test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = 
logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); @@ -425,7 +422,7 @@ TEST_F(LogDevTest, TruncateWithExceedingLSN) { TEST_F(LogDevTest, TruncateAfterRestart) { LOGINFO("Step 1: Create a single logstore to start truncate with overlapping LSN test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); auto store_id = log_store->get_store_id(); @@ -433,7 +430,7 @@ TEST_F(LogDevTest, TruncateAfterRestart) { auto restart = [&]() { std::promise< bool > p; auto starting_cb = [&]() { - logstore_service().open_logdev(logdev_id); + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -476,13 +473,12 @@ TEST_F(LogDevTest, TruncateAfterRestart) { TEST_F(LogDevTest, TruncateAcrossMultipleStores) { LOGINFO("Step 1: Create 3 log stores to start truncate across multiple stores test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto store1 = logstore_service().create_new_log_store(logdev_id, false); auto store2 = logstore_service().create_new_log_store(logdev_id, false); auto store3 = logstore_service().create_new_log_store(logdev_id, false); - LOGINFO("Step 2: Insert 100 entries to store {}", store1->get_store_id()); logstore_seq_num_t cur_lsn = 0; kickstart_inserts(store1, cur_lsn, 100); @@ -643,15 +639,15 @@ TEST_F(LogDevTest, TruncateAcrossMultipleStores) { TEST_F(LogDevTest, 
TruncateLogsAfterFlushAndRestart) { LOGINFO("Step 1: Create a single logstore to start truncate-logs-after-flush-and-restart test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); auto store_id = log_store->get_store_id(); auto restart = [&]() { - std::promise < bool > p; + std::promise< bool > p; auto starting_cb = [&]() { - logstore_service().open_logdev(logdev_id); + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -711,7 +707,7 @@ TEST_F(LogDevTest, CreateRemoveLogDev) { ASSERT_EQ(vdev->num_descriptors(), 0); for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); auto store = logstore_service().create_new_log_store(id, false); log_stores.push_back(store); @@ -759,7 +755,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { // Test deletion of unopened logdev. 
std::set< logdev_id_t > id_set, unopened_id_set; for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); id_set.insert(id); if (i >= num_logdev / 2) { unopened_id_set.insert(id); } s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); @@ -783,7 +779,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { auto starting_cb = [&]() { auto it = id_set.begin(); for (uint32_t i{0}; i < id_set.size() / 2; i++, it++) { - logstore_service().open_logdev(*it); + logstore_service().open_logdev(*it, flush_mode_t::EXPLICIT); } }; start_homestore(true /* restart */, starting_cb); diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index 43e57ff7c..1aa580bba 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -455,7 +455,7 @@ class SampleDB { for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -479,7 +479,7 @@ class SampleDB { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) { - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); } for (uint32_t i{0}; i < n_log_stores; ++i) { @@ -1225,7 +1225,7 @@ TEST_F(LogStoreTest, WriteSyncThenRead) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); auto tmp_log_store 
= logstore_service().create_new_log_store(logdev_id, false); const auto store_id = tmp_log_store->get_store_id(); LOGINFO("Created new log store -> id {}", store_id); diff --git a/src/tests/test_log_store_long_run.cpp b/src/tests/test_log_store_long_run.cpp index e9808da65..507e51633 100644 --- a/src/tests/test_log_store_long_run.cpp +++ b/src/tests/test_log_store_long_run.cpp @@ -294,7 +294,7 @@ class LogStoreLongRun : public ::testing::Test { HS_SETTINGS_FACTORY().save(); for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -318,7 +318,7 @@ class LogStoreLongRun : public ::testing::Test { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); for (uint32_t i{0}; i < n_log_stores; ++i) m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( @@ -466,7 +466,7 @@ class LogStoreLongRun : public ::testing::Test { validate_num_stores(); // Create a new logstore. 
- auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( logdev_id, bind_this(LogStoreLongRun::on_log_insert_completion, 3))); validate_num_stores(); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 861e12e5e..a192f54b1 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -98,6 +98,7 @@ class SoloReplDevTest : public testing::Test { void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("Received on_commit lsn={}", lsn); if (ctx == nullptr) { m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); } else { @@ -231,8 +232,8 @@ class SoloReplDevTest : public testing::Test { uint32_t size = blkids.blk_count() * g_block_size; if (size) { auto read_sgs = HSTestHelper::create_sgs(size, size); - LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, - blkids.to_string()); + LOGINFO("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, + blkids.to_string()); rdev.async_read(blkids, read_sgs, size) .thenValue([this, hdr = *jhdr, read_sgs, lsn, blkids, &rdev](auto&& err) { RELEASE_ASSERT(!err, "Error during async_read"); @@ -242,8 +243,8 @@ class SoloReplDevTest : public testing::Test { HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern); iomanager.iobuf_free(uintptr_cast(iov.iov_base)); } - LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully", - boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); + LOGINFO("[{}] Replay of lsn={} blkid={} validated successfully", + boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); m_task_waiter.one_complete(); }); } else { @@ -257,15 +258,15 
@@ class SoloReplDevTest : public testing::Test { req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size); auto const cap = hs()->repl_service().get_cap_stats(); - LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); + LOGINFO("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); rdev.async_read(req->written_blkids, req->read_sgs, req->read_sgs.size) .thenValue([this, &rdev, req](auto&& err) { RELEASE_ASSERT(!err, "Error during async_read"); - LOGDEBUG("[{}] Write complete with lsn={} for size={} blkids={}", - boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size, - req->written_blkids.to_string()); + LOGINFO("[{}] Write complete with lsn={} for size={} blkids={}", + boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size, + req->written_blkids.to_string()); auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size, "journal hdr data size mismatch with actual size"); @@ -297,7 +298,9 @@ TEST_F(SoloReplDevTest, TestRandomSizedDataBlock) { uint32_t key_size = rand() % 512 + 8; this->write_io(key_size, nblks * g_block_size, g_block_size); }); + this->m_io_runner.execute().get(); + LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size); this->m_task_waiter.start([this]() { this->restart(); }).get(); } @@ -305,6 +308,7 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) { LOGINFO("Step 1: run on worker threads to schedule write"); this->m_io_runner.set_task([this]() { this->write_io(0u, 0u, g_block_size); }); this->m_io_runner.execute().get(); + LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size); this->m_task_waiter.start([this]() { this->restart(); }).get(); } From fc768ebd08d540c7f4e641df37b2b198fbea6203 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Thu, 17 Apr 2025 15:41:53 +0800 Subject: [PATCH 093/130] Support 
handling no_space_left error in raft channel (#682) --- conanfile.py | 2 +- .../homestore/replication/repl_decls.h | 1 + src/include/homestore/replication/repl_dev.h | 40 ++- src/lib/blkalloc/append_blk_allocator.cpp | 7 +- .../replication/log_store/repl_log_store.cpp | 6 +- src/lib/replication/push_data_rpc.fbs | 2 +- src/lib/replication/repl_dev/common.cpp | 36 ++- .../replication/repl_dev/raft_repl_dev.cpp | 233 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 83 ++++--- .../repl_dev/raft_state_machine.cpp | 23 +- .../replication/repl_dev/raft_state_machine.h | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 11 + src/tests/test_raft_repl_dev.cpp | 33 ++- 13 files changed, 363 insertions(+), 116 deletions(-) diff --git a/conanfile.py b/conanfile.py index 13e1a573b..e95a96deb 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.3" + version = "6.9.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 1cada6c35..0602eecfa 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -31,6 +31,7 @@ VENUM(ReplServiceError, int32_t, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, DATA_DUPLICATED = -20002, + QUIENCE_STATE = -20003, FAILED = -32768); // clang-format on diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 872738afd..cfb24e48a 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -142,7 +142,6 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: repl_journal_entry const* journal_entry() const { return m_journal_entry; } uint32_t journal_entry_size() const; bool is_localize_pending() const { return 
m_is_jentry_localize_pending; } - bool is_data_inlined() const { return (m_op_code == journal_type_t::HS_DATA_INLINED); } bool has_linked_data() const { return (m_op_code == journal_type_t::HS_DATA_LINKED); } raft_buf_ptr_t& raft_journal_buf(); @@ -336,7 +335,8 @@ class ReplDevListener { /// @return Expected to return blk_alloc_hints for this write. If the hints are not available, then return the /// error. It is to be noted this method should return error only in very abnornal cases as in some code flow, an /// error would result in a crash or stall of the entire commit thread. - virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; + virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources. /// However, it is expected that this call be idempotent. 
It is possible in rare scenarios that this can be called @@ -383,12 +383,12 @@ class ReplDevListener { } /// @brief ask upper layer to handle no_space_left event - virtual folly::Future< std::error_code > on_no_space_left(uint32_t pdev_id, chunk_num_t chunk_id) { - return folly::makeFuture< std::error_code >(std::error_code{}); - } + // @param lsn - on which repl_lsn no_space_left happened + // @param chunk_id - on which chunk no_space_left happened + virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) { return; } /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id) {}; + virtual void on_log_replay_done(const group_id_t& group_id){}; private: std::weak_ptr< ReplDev > m_repl_dev; @@ -469,6 +469,10 @@ class ReplDev { /// @return last_commit_lsn virtual repl_lsn_t get_last_commit_lsn() const = 0; + /// @brief Gets the repl lsn of the last log in log store + /// @return last_append_repl_lsn + virtual repl_lsn_t get_last_append_lsn() = 0; + /// @brief if this replica is ready for accepting client IO. 
/// @return true if ready, false otherwise virtual bool is_ready_for_traffic() const = 0; @@ -489,6 +493,30 @@ class ReplDev { virtual shared< ReplDevListener > get_listener() { return m_listener; } + // we have no shutdown for repl_dev, since shutdown repl_dev is done by repl_service + void stop() { + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + + // pause/resume statemachine(commiting thread) + virtual void pause_statemachine() = 0; + virtual void resume_statemachine() = 0; + + // complete all the requests that are in progress and start refusing new reqs + virtual void quiesce_reqs() = 0; + + // start accepting new reqs + virtual void resume_accepting_reqs() = 0; + + // clear reqs that has allocated blks on the given chunk. + virtual void clear_chunk_req(chunk_num_t chunk_id) = 0; + protected: shared< ReplDevListener > m_listener; }; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 141d09279..2f6cec25c 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -73,12 +73,17 @@ BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hint } if (avail_blks < nblks) { // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved blks): {}", nblks, available_blks(), avail_blks); + LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved " + "blks): {}", + nblks, available_blks(), avail_blks); + // the caller can know in which chunk no_space_left happened; + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. 
// COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::FAILED; } diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index ca62c3197..f9b3d454e 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -16,9 +16,9 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT_NE(nullptr != rreq, "Failed to localize journal entry before appending log"); ulong lsn = HomeRaftLogStore::append(entry); m_sm.link_lsn_to_req(rreq, int64_cast(lsn)); - RD_LOGT(rreq->traceID(), "Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string()); return lsn; } @@ -31,6 +31,7 @@ void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT(nullptr != rreq, "Failed to localize journal entry before overwriting log at index {}", index); HomeRaftLogStore::write_at(index, entry); m_sm.link_lsn_to_req(rreq, int64_cast(index)); RD_LOGT(rreq->traceID(), "Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); @@ -66,7 +67,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { // Wait for the fetch and write to be completed successfully. // It is essential to complete the data write before appending to the log. If the logs are flushed // before the data is written, a restart and subsequent log replay occurs, as the in-memory state is lost, - // it leaves us uncertain about whether the data was actually written, potentially leading to data inconsistency. 
+ // it leaves us uncertain about whether the data was actually written, potentially leading to data + // inconsistency. std::move(fut).wait(); HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); } diff --git a/src/lib/replication/push_data_rpc.fbs b/src/lib/replication/push_data_rpc.fbs index 279fefcb5..d9a981e7c 100644 --- a/src/lib/replication/push_data_rpc.fbs +++ b/src/lib/replication/push_data_rpc.fbs @@ -2,7 +2,7 @@ native_include "boost/uuid/uuid.hpp"; namespace homestore; table PushDataRequest { - traceID: uint64; // traceID for the REQ + trace_id: uint64; // traceID for the REQ issuer_replica_id : int32; // Replica id of the issuer raft_term : uint64; // Raft term number dsn : uint64; // Data Sequence number diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 8cea3cc5a..6a39256f9 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -10,8 +10,9 @@ namespace homestore { -ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener) { +ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener) { m_rkey = std::move(rkey); #ifndef NDEBUG if (data_size > 0) { @@ -26,17 +27,34 @@ ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool m_key = key; m_is_jentry_localize_pending = (!is_proposer && (data_size > 0)); // Pending on the applier and with linked data - // We need to allocate the block if the req has data linked, since entry doesn't exist or if it exist, two threads(data channel and raft channel) are trying to do the same - // thing. 
So take state mutex and allocate the blk + // We need to allocate the block if the req has data linked, since entry doesn't exist or if it exist, two + // threads(data channel and raft channel) are trying to do the same thing. So take state mutex and allocate the blk std::unique_lock< std::mutex > lg(m_state_mtx); if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { - auto alloc_status = alloc_local_blks(listener, data_size); + ReplServiceError alloc_status; +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("simulate_no_space_left") && !is_proposer) { + LOGERROR("Simulate no space left on follower for testing purposes"); + // TODO: support `simulate_no_space_left` for the leader, do not throw exception in on-error in the test + // framework, it will cause the leader to fail and exit. + alloc_status = ReplServiceError::NO_SPACE_LEFT; + } else { + alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, + alloc_status); + } + } +#else + alloc_status = alloc_local_blks(listener, data_size); if (alloc_status != ReplServiceError::OK) { LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, alloc_status); } +#endif return alloc_status; } + return ReplServiceError::OK; } @@ -107,8 +125,9 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list if (hints_result.hasError()) { return hints_result.error(); } if (hints_result.value().committed_blk_id.has_value()) { - //if the committed_blk_id is already present, use it and skip allocation and commitment - LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, rkey().to_string()); + // if the committed_blk_id is already present, use it and skip allocation and commitment + LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, 
skip", rkey().traceID, + rkey().to_string()); m_local_blkid = hints_result.value().committed_blk_id.value(); add_state(repl_req_state_t::BLK_ALLOCATED); add_state(repl_req_state_t::DATA_RECEIVED); @@ -122,7 +141,8 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints_result.value(), m_local_blkid); if (status != BlkAllocStatus::SUCCESS) { - LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status); + LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, + rkey(), status); DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index acecb0e49..99e61332d 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -15,7 +15,6 @@ #include "common/homestore_assert.hpp" #include "common/homestore_config.hpp" -// #include "common/homestore_flip.hpp" #include "replication/service/raft_repl_service.h" #include "replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" @@ -200,18 +199,27 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ members.replica_in = member_in; sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); - rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = trace_id}, - journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); - - auto err = m_state_machine->propose_to_raft(std::move(rreq)); - if (err != ReplServiceError::OK) { - RD_LOGE(trace_id, "Replace member propose to raft failed {}", err); + auto status = 
init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + if (status != ReplServiceError::OK) { + // Failed to initialize the repl_req_ctx for replace member. + RD_LOGE(trace_id, "Failed to initialize repl_req_ctx for replace member, error={}", status); reset_quorum_size(0, trace_id); decr_pending_request_num(); - return make_async_error<>(std::move(err)); + return make_async_error<>(std::move(status)); + } + + status = m_state_machine->propose_to_raft(std::move(rreq)); + if (status != ReplServiceError::OK) { + RD_LOGE(trace_id, "Replace member propose to raft failed {}", status); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(status)); } RD_LOGI(trace_id, "Replace member proposed to raft group_id={}", group_id_str()); @@ -273,13 +281,20 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // here, we set the dsn to a new one , which is definitely unique in the follower, so that the new rreq will not // have a conflict with the old rreq. - rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = std::numeric_limits< uint64_t >::max()}, - journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); + auto err = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = std::numeric_limits< uint64_t >::max()}, + journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); + + if (err != ReplServiceError::OK) { + // Failed to initialize the repl_req_ctx for destroying the group. 
+ LOGERROR("Failed to initialize repl_req_ctx for destorying group, error={}", err); + return folly::makeSemiFuture< ReplServiceError >(std::move(err)); + } - auto err = m_state_machine->propose_to_raft(std::move(rreq)); + err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::ACTIVE; }); return folly::makeSemiFuture< ReplServiceError >(std::move(err)); @@ -318,12 +333,16 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - auto status = rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = tid}, - data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, - true /* is_proposer */, header, key, data.size, m_listener); + auto status = init_req_ctx( + rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, + key, data.size, m_listener); + + if (status != ReplServiceError::OK) { + RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); + handle_error(rreq, status); + return; + } RD_LOGD(tid, "repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size [{}] bytes", rreq->rkey(), header.size(), key.size(), data.size); @@ -332,12 +351,6 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); - if (status != ReplServiceError::OK) { - RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); - handle_error(rreq, status); - return; - } - // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { @@ -456,7 +469,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn(), - .traceID = push_req->traceID()}; + .traceID = push_req->trace_id()}; auto const req_orig_time_ms = push_req->time_ms(); RD_LOGD(rkey.traceID, "Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); @@ -534,8 +547,10 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d } repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, - [[maybe_unused]] bool is_data_channel) { + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn) { + if (is_data_channel) RD_DBG_ASSERT(-1 == lsn, "lsn from data channel should always be -1 , got 
lsn {}", lsn); + auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); auto rreq = it->second; @@ -555,30 +570,29 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } // rreq->init will allocate the block if it has linked data. - auto status = rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); - if (!rreq->has_linked_data()) { return rreq; } -#ifdef _PRERELEASE - if (is_data_channel) { - if (iomgr_flip::instance()->test_flip("fake_reject_append_data_channel")) { - LOGINFO("Data Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - status = ReplServiceError::NO_SPACE_LEFT; - } - } else { - if (iomgr_flip::instance()->test_flip("fake_reject_append_raft_channel")) { - LOGINFO("Raft Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - status = ReplServiceError::NO_SPACE_LEFT; - } - } -#endif + auto status = init_req_ctx(rreq, rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); + if (status != ReplServiceError::OK) { RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), status); + if (status == ReplServiceError::NO_SPACE_LEFT && !is_data_channel && !rreq->is_proposer()) { + const auto& chunk_id = rreq->local_blkid().chunk_num(); + RD_LOGD(rkey.traceID, + "For Repl_key=[{}] alloc hints returned error={} when trying to allocate blk on chunk={}", + rkey.to_string(), status, chunk_id); + m_listener->on_no_space_left(lsn, chunk_id); + } else { + RD_LOGD( + rkey.traceID, + "For Repl_key=[{}] alloc hints returned error={}, failing this req, data_channl: {}, is_proposer: {} ", + rkey.to_string(), status, is_data_channel, rreq->is_proposer()); + } // Do not call handle_error here, because handle_error is for rreq which needs to be 
terminated. This one can be // retried. return nullptr; } - RD_LOGD(rreq->traceID(), "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(), + RD_LOGD(rkey.traceID, , "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(), reinterpret_cast< uintptr_t >(rreq.get())); return rreq; } @@ -1008,7 +1022,8 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { "Out of order commit of lsns, it is not expected in RaftReplDev. cur_lsns={}, prev_lsns={}", rreq->lsn(), prev_lsn); } - if (!rreq->is_proposer()) { rreq->clear(); } + + if (!rreq->is_proposer()) rreq->clear(); } void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) { @@ -1345,7 +1360,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, auto raft_req = r_cast< nuraft::req_msg* >(param->ctx); auto const& entries = raft_req->log_entries(); - auto start_lsn = raft_req->get_last_log_idx() + 1; + auto start_lsn = to_repl_lsn(raft_req->get_last_log_idx() + 1); if (entries.size() == 0) { RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}", raft_req->get_commit_idx()); @@ -1374,7 +1389,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, } // Those LSNs already in logstore but not yet committed, will be dedup here, // applier_create_req will return same req as previous one - auto req = m_state_machine->localize_journal_entry_prepare(*entry); + auto req = m_state_machine->localize_journal_entry_prepare(*entry, lsn); if (req == nullptr) { sisl::VectorPool< repl_req_ptr_t >::free(reqs); // The hint set here will be used by the next after next appendEntry, the next one @@ -1560,7 +1575,8 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); if (need_skip_processing(repl_lsn)) { - RD_LOGI(NO_TRACE_ID, "Raft 
Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn); + RD_LOGI(NO_TRACE_ID, + "Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn); return; } @@ -1619,8 +1635,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx rreq->set_lsn(repl_lsn); // keep lentry in scope for the lyfe cycle of the rreq rreq->set_lentry(lentry); - auto status = rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), - data_size, m_listener); + auto status = init_req_ctx(rreq, rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), + entry_to_key(jentry), data_size, m_listener); if (status != ReplServiceError::OK) { RD_LOGE(jentry->traceID, "Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); } @@ -1723,4 +1739,113 @@ void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { } } +void RaftReplDev::pause_statemachine() { + if (!raft_server()->is_state_machine_execution_paused()) { + raft_server()->pause_state_machine_exeuction(); + while (!raft_server()->wait_for_state_machine_pause(100)) { + RD_LOGD(NO_TRACE_ID, "wait for statemachine pause!"); + } + } +} + +void RaftReplDev::resume_statemachine() { + if (raft_server()->is_state_machine_execution_paused()) { + raft_server()->resume_state_machine_execution(); + RD_LOGD(NO_TRACE_ID, "statemachine is resumed!"); + } +} + +void RaftReplDev::quiesce_reqs() { + // all the block allocation happens in rreq->init. so after we wait for all the pending req has been initialized we + // can make sure + // 1 all the pending reqs has allocated their blocks + // 2 no new pending reqs will be initialized again. 
+ m_in_quience.store(true, std::memory_order_release); + RD_LOGD(NO_TRACE_ID, "enter quience state, waiting for all the pending req to be initialized"); + while (true) { + uint64_t pending_req_num = get_pending_init_req_num(); + if (pending_req_num) { + RD_LOGD(NO_TRACE_ID, "wait for {} pending create_req requests to be completed", pending_req_num); + std::this_thread::sleep_for(std::chrono::microseconds(1)); + } else + break; + } +} + +void RaftReplDev::resume_accepting_reqs() { m_in_quience.store(false, std::memory_order_release); } + +void RaftReplDev::clear_chunk_req(chunk_num_t chunk_id) { + RD_LOGD(NO_TRACE_ID, + "start cleaning all the in-memory rreqs, which has allocated blk on the emergent chunk={} before handling " + "no_space_left error", + chunk_id); + std::vector< folly::Future< folly::Unit > > futs; + for (auto& [key, rreq] : m_repl_key_req_map) { + if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = rreq->local_blkid(); + if (chunk_id == blkid.chunk_num()) { + // only clean the rreqs which has allocated blks on the emergent chunk + futs.emplace_back( + std::move(data_service().async_free_blk(blkid).thenValue([this, &blkid, &key](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD(NO_TRACE_ID, "blkid={} freed successfully for handling no_space_left error", + blkid.to_string()); + m_repl_key_req_map.erase(key); // remove from the req map after freeing the blk + }))); + } + } + } + + folly::collectAllUnsafe(futs) + .thenValue([this](auto&& vf) { + // TODO:: handle the error in freeing blk if necessary in the future. 
+ // for nuobject case, error for freeing blk in the emergent chunk can be ignored + RD_LOGD( + NO_TRACE_ID, + "all the necessary in-memory rreqs which has allocated blks on the emergent chunk have been cleaned up " + "successfully, continue to handle no_space_left error."); + }) + // need to wait for the completion + .wait(); +} + +ReplServiceError RaftReplDev::init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener) { + if (!rreq) { + RD_LOGD(rkey.traceID, "got nullptr for initing req, rkey=[{}]", rkey.to_string()); + return ReplServiceError::CANCELLED; + } + + init_req_counter counter(m_pending_init_req_num); + if (is_in_quience()) { + // In quience state, reject any new requests. + RD_LOGD(rkey.traceID, "Rejecting new request in quience state, rkey=[{}]", rkey.to_string()); + return ReplServiceError::QUIENCE_STATE; + } + + return rreq->init(rkey, op_code, is_proposer, user_header, key, data_size, m_listener); +} + +void RaftReplDev::become_leader_cb() { + auto new_gate = raft_server()->get_last_log_idx(); + repl_lsn_t existing_gate = 0; + if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) { + // was a follower, m_traffic_ready_lsn should be zero on follower. 
+ RD_REL_ASSERT(!existing_gate, "existing gate should be zero"); + } + RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); +} + +bool RaftReplDev::is_ready_for_traffic() const { + if (is_stopping()) return false; + auto committed_lsn = m_commit_upto_lsn.load(); + auto gate = m_traffic_ready_lsn.load(); + bool ready = committed_lsn >= gate; + if (!ready) { + RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); + } + return ready; +} } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index aedcd8475..e8aee7f34 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -156,16 +156,29 @@ class nuraft_snapshot_context : public snapshot_context { class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr, public std::enable_shared_from_this< RaftReplDev > { +private: + class init_req_counter { + public: + init_req_counter(std::atomic_uint64_t& counter) : my_counter(counter) { + my_counter.fetch_add(1, std::memory_order_acq_rel); + } + + ~init_req_counter() { my_counter.fetch_sub(1, std::memory_order_acq_rel); } + + private: + std::atomic_uint64_t& my_counter; + }; + private: shared< RaftStateMachine > m_state_machine; RaftReplService& m_repl_svc; folly::ConcurrentHashMap< repl_key, repl_req_ptr_t, repl_key::Hasher > m_repl_key_req_map; nuraft_mesg::Manager& m_msg_mgr; - group_id_t m_group_id; // Replication Group id - std::string m_rdev_name; // Short name for the group for easy debugging + group_id_t m_group_id; // Replication Group id + std::string m_rdev_name; // Short name for the group for easy debugging std::string m_identify_str; // combination of rdev_name:group_id - replica_id_t m_my_repl_id; // This replica's uuid - int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) + replica_id_t 
m_my_repl_id; // This replica's uuid + int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) shared< ReplLogStore > m_data_journal; shared< HomeLogStore > m_free_blks_journal; sisl::urcu_scoped_ptr< repl_dev_stage_t > m_stage; @@ -176,7 +189,7 @@ class RaftReplDev : public ReplDev, mutable folly::SharedMutexWritePriority m_sb_lock; // Lock to protect staged sb and persisting sb raft_repl_dev_superblk m_sb_in_mem; // Cached version which is used to read and for staging - std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly written, to track flushes + std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly committed, to track flushes std::atomic< repl_lsn_t > m_compact_lsn{0}; // LSN upto which it was compacted, it is used to track where to // The `traffic_ready_lsn` variable holds the Log Sequence Number (LSN) up to which // the state machine should committed to before accepting traffic. This threshold ensures that @@ -199,6 +212,10 @@ class RaftReplDev : public ReplDev, static std::atomic< uint64_t > s_next_group_ordinal; bool m_log_store_replay_done{false}; + // pending create requests, including both raft and data channel + std::atomic_uint64_t m_pending_init_req_num; + std::atomic< bool > m_in_quience; + public: friend class RaftStateMachine; @@ -236,22 +253,14 @@ class RaftReplDev : public ReplDev, uint32_t get_blk_size() const override; repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } + repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx() + 1; /*to_repl_lsn*/ } bool is_destroy_pending() const; bool is_destroyed() const; + Clock::time_point destroyed_time() const { return m_destroyed_time; } - bool is_ready_for_traffic() const override { - auto committed_lsn = m_commit_upto_lsn.load(); - auto gate = m_traffic_ready_lsn.load(); - bool ready = committed_lsn >= 
gate; - if (!ready) { - RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); - } - return ready; - } + bool is_ready_for_traffic() const override; // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. - void purge() override { - RD_REL_ASSERT(false, "NOT SUPPORTED YET"); - } + void purge() override { RD_REL_ASSERT(false, "NOT SUPPORTED YET"); } std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { return std::make_shared< nuraft_snapshot_context >(snp_ctx); @@ -270,22 +279,17 @@ class RaftReplDev : public ReplDev, void handle_rollback(repl_req_ptr_t rreq); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, bool is_data_channel); + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn = -1 /*init lsn*/); folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs); void cp_flush(CP* cp, cshared< ReplDevCPContext > ctx); cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); - void become_leader_cb() { - auto new_gate = raft_server()->get_last_log_idx(); - repl_lsn_t existing_gate = 0; - if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) { - // was a follower, m_traffic_ready_lsn should be zero on follower. - RD_REL_ASSERT(existing_gate == 0, "existing gate should be zero"); - } - RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); - }; + + void become_leader_cb(); + void become_follower_cb() { // m_traffic_ready_lsn should be zero on follower. 
m_traffic_ready_lsn.store(0); @@ -344,15 +348,23 @@ class RaftReplDev : public ReplDev, /** * \brief This method is called to check if the given LSN is within the last snapshot LSN received from the leader. - * All logs with LSN less than or equal to the last snapshot LSN are considered as part of the baseline resync, which - * doesn't need any more operations (e.g., replay, commit). + * All logs with LSN less than or equal to the last snapshot LSN are considered as part of the baseline resync, + * which doesn't need any more operations (e.g., replay, commit). * * \param lsn The LSN to be checked. * \return true if the LSN is within the last snapshot LSN, false otherwise. */ - bool need_skip_processing(const repl_lsn_t lsn) { - return lsn <= m_rd_sb->last_snapshot_lsn; - } + bool need_skip_processing(const repl_lsn_t lsn) { return lsn <= m_rd_sb->last_snapshot_lsn; } + + // pause/resume statemachine(committing thread) + void pause_statemachine(); + void resume_statemachine(); + + void quiesce_reqs(); + void resume_accepting_reqs(); + + // clear reqs that have allocated blks on the given chunk. 
+ void clear_chunk_req(chunk_num_t chunk_id); protected: //////////////// All nuraft::state_mgr overrides /////////////////////// @@ -398,6 +410,13 @@ class RaftReplDev : public ReplDev, bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); void report_blk_metrics_if_needed(repl_req_ptr_t rreq); + ReplServiceError init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener); + + bool is_in_quience() { return m_in_quience.load(std::memory_order_acquire); } + + uint64_t get_pending_init_req_num() { return m_pending_init_req_num.load(std::memory_order_acquire); } }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index ebf262e8a..0e211212a 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -48,7 +48,7 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { return ReplServiceError::OK; } -repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry) { +repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn /*repl_lsn*/) { // Validate the journal entry and see if it needs to be transformed repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry.get_buf().data_begin()); RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, @@ -83,8 +83,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); - rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */); + rreq = + 
m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), + (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */, lsn); if (rreq == nullptr) { goto out; } rreq->set_remote_blkid(RemoteBlkId{jentry->server_id, entry_blkid}); @@ -109,7 +110,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr std::memcpy(blkid_location, rreq->local_blkid().serialize().cbytes(), local_size); } else { rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - jentry->value_size, false /* is_data_channel */); + jentry->value_size, false /* is_data_channel */, lsn); + if (rreq == nullptr) goto out; } // We might have localized the journal entry with new blkid. We need to also update the header/key pointers pointing @@ -154,7 +156,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry auto rreq = m_rd.repl_key_to_req(rkey); if ((rreq == nullptr) || (rreq->is_localize_pending())) { - rreq = localize_journal_entry_prepare(lentry); + rreq = localize_journal_entry_prepare(lentry, + -1 /* lsn=-1, since this is a finish call and we don't have lsn yet */); if (rreq == nullptr) { RELEASE_ASSERT(rreq != nullptr, "We get an linked data for rkey=[{}], jentry=[{}] not as part of Raft Append but " @@ -207,7 +210,8 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt // when reaching here, the config change log has already been committed, and the new config has been applied to the // cluster if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) { - RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. Skipping commit.", log_idx); + RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. 
Skipping commit.", + log_idx); return; } @@ -333,9 +337,10 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, // uncommitted logs may or may not included in the snapshot data sent by leader, // depending on the racing of commit vs snapshot read, leading to data inconsistency. if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) { - RD_LOGW(NO_TRACE_ID, "not ready to read because there are some uncommitted logs in snapshot, " - "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", - s.get_last_log_idx(), m_rd.get_last_commit_lsn()); + RD_LOGW(NO_TRACE_ID, + "not ready to read because there are some uncommitted logs in snapshot, " + "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", + s.get_last_log_idx(), m_rd.get_last_commit_lsn()); return -1; } diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 7da37d5c5..0de9b2744 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -122,7 +122,7 @@ class RaftStateMachine : public nuraft::state_machine { ////////// APIs outside of nuraft::state_machine requirements //////////////////// ReplServiceError propose_to_raft(repl_req_ptr_t rreq); - repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry); + repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn = -1); repl_req_ptr_t localize_journal_entry_finish(nuraft::log_entry& lentry); void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn); void unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 0a06c7203..8a2e2e6b6 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -68,9 +68,20 @@ class SoloReplDev : public ReplDev { } repl_lsn_t 
get_last_commit_lsn() const override { return 0; } + repl_lsn_t get_last_append_lsn() override { return 0; }; uint32_t get_blk_size() const override; + // pause/resume statemachine(commiting thread) + void pause_statemachine() override { return; } + void resume_statemachine() override { return; } + + void quiesce_reqs() override { return; } + void resume_accepting_reqs() override { return; } + + // clear reqs that has allocated blks on the given chunk. + void clear_chunk_req(chunk_num_t chunk_id) override { return; } + void cp_flush(CP* cp); void cp_cleanup(CP* cp); diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 7f7345e10..ab40e9ea5 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -111,7 +111,7 @@ TEST_F(RaftReplDevTest, Follower_Fetch_OnActive_ReplicaGroup) { } TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { - g_helper->set_basic_flip("disable_leader_push_data"); + g_helper->set_basic_flip("disable_leader_push_data", std::numeric_limits< int >::max(), 100); LOGINFO("Homestore replica={} setup completed, all the push_data from leader are disabled", g_helper->replica_num()); LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); @@ -125,6 +125,37 @@ TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { this->validate_data(); g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("disable_leader_push_data"); +} + +TEST_F(RaftReplDevTest, Write_With_Handling_No_Space_Left) { + g_helper->set_basic_flip("simulate_no_space_left", std::numeric_limits< int >::max(), 50); + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + // this test is slow, so use a smaller number of entries to write in each attempt + uint64_t entries_per_attempt = 50; + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written 
so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + + LOGINFO("Restart all the homestore replicas"); + g_helper->restart(); + g_helper->sync_for_test_start(); + + // Reassign the leader to replica 0, in case restart switched leaders + this->assign_leader(0); + + LOGINFO("Post restart write the data again on the leader"); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + LOGINFO("Validate all data written (including pre-restart data) by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("simulate_no_space_left"); } #endif From f8f6a24d0701d9baca197e4616c099745039c8cc Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Fri, 18 Apr 2025 09:57:26 +0800 Subject: [PATCH 094/130] Adjust grpc message size according to fetch data limit as well (#691) Co-authored-by: yawzhang --- conanfile.py | 2 +- src/lib/common/homestore_config.fbs | 5 +++-- src/lib/homestore.cpp | 10 +++++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index e95a96deb..30b17ff19 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.5" + version = "6.9.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 33f0ae77e..7996332d2 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -255,8 +255,9 @@ table Consensus { // Max append batch size max_append_batch_size: int32 = 64; - // Max grpc message size - max_grpc_message_size: int32 = 67108864; + // Max grpc message size, use 64M (max data size on data channel) + 128M (max snasphot batch size) + 1M + // Please adjust it if data_fetch_max_size_kb is increased as well + max_grpc_message_size: int32 = 202375168; // Threshold of log gap from leader 
to consider a replica as stale stale_log_gap_hi_threshold: int32 = 200; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 3f8ee3737..58ef4d9b8 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -165,11 +165,15 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HomeStoreDynamicConfig::init_settings_default(); // Check if the max_grpc_message_size is large enough to hold the data and snapshot batch size + auto data_fetch_max_size_in_byte = HS_DYNAMIC_CONFIG(consensus.data_fetch_max_size_kb) * 1024ull; + RELEASE_ASSERT(data_fetch_max_size_in_byte <= INT_MAX, "data fetch size is larger than the grpc limit"); if (HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_data_size || - HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size) { - LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {} and max_snapshot_batch_size {}", + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < s_cast< int >(data_fetch_max_size_in_byte)) { + LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {}, max_snapshot_batch_size {} and " + "data_fetch_max_size {}", HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, - input.max_snapshot_batch_size); + input.max_snapshot_batch_size, data_fetch_max_size_in_byte); throw std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); } From d25e679c297aa1bda50de26d4ddeecd3d72e418c Mon Sep 17 00:00:00 2001 From: Sanal Date: Mon, 21 Apr 2025 23:55:52 +0530 Subject: [PATCH 095/130] Add additional on_commit repldev listener api's. (#692) Add additional on_commit to support vector of blkids. 
--- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 21 ++++++++-- .../replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- .../replication/repl_dev/solo_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 2 +- src/tests/test_common/raft_repl_test_base.hpp | 39 ++++++++++--------- src/tests/test_solo_repl_dev.cpp | 6 ++- 8 files changed, 49 insertions(+), 27 deletions(-) diff --git a/conanfile.py b/conanfile.py index 30b17ff19..aaaf1a1cd 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.6" + version = "6.10.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index cfb24e48a..4e3d4c428 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -270,6 +270,20 @@ class ReplDevListener { virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief Called when the log entry has been committed in the replica set. + /// + /// This function is called from a dedicated commit thread which is different from the original thread calling + /// replica_set::write(). There is only one commit thread, and lsn is guaranteed to be monotonically increasing. + /// + /// @param lsn - The log sequence number + /// @param header - Header originally passed with replica_set::write() api + /// @param key - Key originally passed with replica_set::write() api + /// @param blkids - List of independent blkids where data is written to the storage engine. 
+ /// @param ctx - Context passed as part of the replica_set::write() api + /// + virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief Called when the log entry has been received by the replica dev. /// /// On recovery, this is called from a random worker thread before the raft server is started. It is @@ -416,10 +430,11 @@ class ReplDev { /// cases /// @param value - vector of io buffers that contain value for the key. It is an optional field and if the value /// list size is 0, then only key is written to replicadev without data. - /// @param ctx - User supplied context which will be passed to listener - /// callbacks + /// @param ctx - User supplied context which will be passed to listener callbacks + /// @param part_of_batch Is write is part of a batch. If part of the batch, then submit_batch needs to be called at + /// the end virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx, trace_id_t tid = 0) = 0; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) = 0; /// @brief Reads the data and returns a future to continue on /// @param bid Block id to read diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 99e61332d..2322b7721 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -319,7 +319,7 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< } void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, - repl_req_ptr_t rreq, trace_id_t tid) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h 
b/src/lib/replication/repl_dev/raft_repl_dev.h index e8aee7f34..696e98737 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -230,7 +230,7 @@ class RaftReplDev : public ReplDev, //////////////// All ReplDev overrides/implementation /////////////////////// void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx, trace_id_t tid = 0) override; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 58aa69a96..f09796352 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -28,7 +28,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t rreq, trace_id_t tid) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } incr_pending_request_num(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 8a2e2e6b6..f4572ce0e 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -40,7 +40,7 @@ class SoloReplDev : public ReplDev { virtual ~SoloReplDev() = default; void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx, trace_id_t tid = 0) override; + repl_req_ptr_t ctx, bool 
part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false, trace_id_t tid = 0) override; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 47778d9a8..636fa5f7c 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -94,7 +94,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { struct journal_header { uint64_t data_size; uint64_t data_pattern; - uint64_t key_id; //put it in header to test duplication in alloc_local_blks + uint64_t key_id; // put it in header to test duplication in alloc_local_blks }; journal_header jheader; uint64_t key_id; @@ -151,6 +151,9 @@ class TestReplicatedDB : public homestore::ReplDevListener { if (ctx->is_proposer()) { g_helper->runner().next_task(); } } + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} + bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, @@ -172,7 +175,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), *(r_cast< uint64_t const* >(key.cbytes()))); - g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper(error)); + g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper< ReplServiceError >(error)); } AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { @@ -318,8 +321,9 @@ class TestReplicatedDB : public homestore::ReplDevListener { void 
free_user_snp_ctx(void*& user_snp_ctx) override {} - ReplResult get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { - auto jheader = r_cast(header.cbytes()); + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) override { + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); Key k{.id_ = jheader->key_id}; auto iter = inmem_db_.find(k); if (iter != inmem_db_.end()) { @@ -357,7 +361,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); } - repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, s_uniq_num); + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, false, s_uniq_num); } void validate_db_data() { @@ -590,7 +594,8 @@ class RaftReplDevTestBase : public testing::Test { LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); g_helper->runner().set_task([this, block_size, db, data_size]() { static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - uint64_t size = data_size == nullptr ? std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; + uint64_t size = + data_size == nullptr ? 
std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; this->generate_writes(size, block_size, db); }); if (wait_for_commit) { g_helper->runner().execute().get(); } @@ -631,11 +636,11 @@ class RaftReplDevTestBase : public testing::Test { auto data_size = std::max(1L, std::abs(std::lround(num_blks_gen(g_re)))) * block_size; ASSERT_GT(data_size, 0); LOGINFO("data_size larger than 0, go ahead, data_size= {}.", data_size); - static std::atomic s_uniq_num{0}; + static std::atomic< uint32_t > s_uniq_num{0}; auto req = intrusive(new TestReplicatedDB::test_req()); req->jheader.data_size = data_size; req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; - //overwrite the key_id with the id passed in + // overwrite the key_id with the id passed in req->jheader.key_id = id; req->key_id = id; @@ -650,17 +655,15 @@ class RaftReplDevTestBase : public testing::Test { db->repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); }); - if (!wait_for_commit) { - return ReplServiceError::OK; + if (!wait_for_commit) { return ReplServiceError::OK; } + try { + g_helper->runner().execute().get(); + LOGDEBUG("write data task complete, id={}", id) + } catch (const ReplServiceError& e) { + LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), id, + enum_name(e)); + return e; } - try { - g_helper->runner().execute().get(); - LOGDEBUG("write data task complete, id={}", id) - } catch (const ReplServiceError& e) { - LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), - id, enum_name(e)); - return e; - } written_entries_ += 1; LOGINFO("wait_for_commit={}", written_entries_); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index a192f54b1..2ec091795 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -108,6 +108,9 @@ class SoloReplDevTest : public testing::Test { } } 
+ void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} + AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } @@ -126,7 +129,8 @@ class SoloReplDevTest : public testing::Test { void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override {} - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) override { return blk_alloc_hints{}; } From 1455adc69e94115ca63858a9451c630802fa3b71 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 23 Apr 2025 00:39:15 +0800 Subject: [PATCH 096/130] reduce io number in simulate_no_space_left and disable_leader_push_data flip test (#694) --- conanfile.py | 2 +- src/tests/test_raft_repl_dev.cpp | 20 +++----------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/conanfile.py b/conanfile.py index aaaf1a1cd..1b0d7477c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.0" + version = "6.10.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index ab40e9ea5..6e21a64e8 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -117,7 +117,7 @@ TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); - this->write_on_leader(100, true /* wait_for_commit */); + this->write_on_leader(20, true /* wait_for_commit */); 
g_helper->sync_for_verify_start(); @@ -133,27 +133,13 @@ TEST_F(RaftReplDevTest, Write_With_Handling_No_Space_Left) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); - // this test is slow, so use a smaller number of entries to write in each attempt - uint64_t entries_per_attempt = 50; - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + this->write_on_leader(20, true /* wait_for_commit */); g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); this->validate_data(); - g_helper->sync_for_cleanup_start(); - - LOGINFO("Restart all the homestore replicas"); - g_helper->restart(); - g_helper->sync_for_test_start(); - // Reassign the leader to replica 0, in case restart switched leaders - this->assign_leader(0); - - LOGINFO("Post restart write the data again on the leader"); - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); - - LOGINFO("Validate all data written (including pre-restart data) by reading them"); - this->validate_data(); g_helper->sync_for_cleanup_start(); g_helper->remove_flip("simulate_no_space_left"); } From c3292a40865bb994a00e0782a0b9d76102d14a7c Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:41:13 -0700 Subject: [PATCH 097/130] Fix nightly Jenkins project (#697) --- .jenkins/jenkinsfile_nightly | 38 +++++++++++++++++++++--------------- conanfile.py | 2 +- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/.jenkins/jenkinsfile_nightly b/.jenkins/jenkinsfile_nightly index 7efd9b935..7100a0230 100644 --- a/.jenkins/jenkinsfile_nightly +++ b/.jenkins/jenkinsfile_nightly @@ -1,5 +1,5 @@ pipeline { - agent { label 'sds-builder-2204' } + agent { label 'sds-builder-v5' } triggers { cron('TZ=US/Pacific\nH H(0-2) * * *') } @@ -8,7 +8,7 @@ pipeline { ORG = 'sds' ECR_URL = 'hub.tess.io' ARTIFACTORY_PASS = 
credentials('ARTIFACTORY_PASS') - CONAN_USER = 'sds' + CONAN_USER = 'oss' failed_stage = "" } stages { @@ -26,6 +26,7 @@ pipeline { VER = sh(script: "grep -m 1 ' version =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) NIGHTLY_TAG = "master-nightly-debug-4.0" ECR_PATH = "${ECR_URL}/${ORG}/${PROJECT}" + CONAN_FLAGS="--name ${PROJECT} --user ${CONAN_USER} --channel ${NIGHTLY_TAG}" failed_stage = "" } } @@ -40,20 +41,25 @@ pipeline { } stage("Build") { steps { - sh "conan create --build missing -o homestore:sanitize=True -pr debug . ${PROJECT}/${VER}@" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_btree' -exec cp {} .jenkins/test_index_btree \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store' -exec cp {} .jenkins/test_log_store \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_data_service' -exec cp {} .jenkins/test_data_service \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; " - sh "find 
${CONAN_USER_HOME} -type f -wholename '*bin/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; " + sh ''' + hostname + echo $NODE_NAME + conan create --build missing -s:h build_type=Debug -o ${PROJECT}/*:sanitize=True ${CONAN_FLAGS} . + + find /home/jenkins -type f -wholename '*/test_index_btree' -exec cp {} .jenkins/test_index_btree \\; + find /home/jenkins -type f -wholename '*/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\; + find /home/jenkins -type f -wholename '*/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\; + find /home/jenkins -type f -wholename '*/test_log_store' -exec cp {} .jenkins/test_log_store \\; + find /home/jenkins -type f -wholename '*/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\; + find /home/jenkins -type f -wholename '*/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\; + find /home/jenkins -type f -wholename '*/test_data_service' -exec cp {} .jenkins/test_data_service \\; + find /home/jenkins -type f -wholename '*/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\; + find /home/jenkins -type f -wholename '*/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\; + find /home/jenkins -type f -wholename '*/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; + find /home/jenkins -type f -wholename '*/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; + find /home/jenkins -type f -wholename '*/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; + find /home/jenkins -type f -wholename '*/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; + ''' } post { failure { diff --git a/conanfile.py b/conanfile.py index 1b0d7477c..0fcf06b92 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 
+9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.1" + version = "6.10.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From a1e636731737ae92396a6c6ee698e294cf96df80 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 23 Apr 2025 16:48:34 +0800 Subject: [PATCH 098/130] fix repl lsn (#699) --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- src/lib/replication/repl_dev/raft_state_machine.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index 0fcf06b92..646100c57 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.2" + version = "6.10.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2322b7721..06a56083b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1360,7 +1360,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, auto raft_req = r_cast< nuraft::req_msg* >(param->ctx); auto const& entries = raft_req->log_entries(); - auto start_lsn = to_repl_lsn(raft_req->get_last_log_idx() + 1); + auto start_lsn = raft_req->get_last_log_idx() + 1; if (entries.size() == 0) { RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}", raft_req->get_commit_idx()); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 696e98737..19b672e7b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -253,7 +253,7 @@ class RaftReplDev : public ReplDev, uint32_t get_blk_size() const override; repl_lsn_t get_last_commit_lsn() const 
override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } - repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx() + 1; /*to_repl_lsn*/ } + repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } bool is_destroy_pending() const; bool is_destroyed() const; diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 0e211212a..b5f9099f4 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -48,7 +48,7 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { return ReplServiceError::OK; } -repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn /*repl_lsn*/) { +repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn) { // Validate the journal entry and see if it needs to be transformed repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry.get_buf().data_begin()); RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, From ba4553caa40cdf413ceeee728710de9455851f17 Mon Sep 17 00:00:00 2001 From: Sanal Date: Thu, 24 Apr 2025 09:33:15 -0700 Subject: [PATCH 099/130] Make a single on_commit listener function. (#700) Make a single on_commit listener function. List of multiblkids could point to different contiguous areas of data. 
--- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 16 +--------------- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.cpp | 4 ++-- src/tests/test_common/raft_repl_test_base.hpp | 10 ++++------ src/tests/test_solo_repl_dev.cpp | 12 +++++------- 6 files changed, 14 insertions(+), 32 deletions(-) diff --git a/conanfile.py b/conanfile.py index 646100c57..02639fa95 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.3" + version = "6.11.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 4e3d4c428..c0ac700fd 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -256,20 +256,6 @@ class ReplDevListener { void set_repl_dev(shared< ReplDev > rdev) { m_repl_dev = rdev; } shared< ReplDev > repl_dev() { return m_repl_dev.lock(); } - /// @brief Called when the log entry has been committed in the replica set. - /// - /// This function is called from a dedicated commit thread which is different from the original thread calling - /// replica_set::write(). There is only one commit thread, and lsn is guaranteed to be monotonically increasing. - /// - /// @param lsn - The log sequence number - /// @param header - Header originally passed with replica_set::write() api - /// @param key - Key originally passed with replica_set::write() api - /// @param blkids - List of blkids where data is written to the storage engine. - /// @param ctx - Context passed as part of the replica_set::write() api - /// - virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) = 0; - /// @brief Called when the log entry has been committed in the replica set. 
/// /// This function is called from a dedicated commit thread which is different from the original thread calling @@ -402,7 +388,7 @@ class ReplDevListener { virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) { return; } /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id){}; + virtual void on_log_replay_done(const group_id_t& group_id) {}; private: std::weak_ptr< ReplDev > m_repl_dev; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 06a56083b..db12a27f6 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1013,7 +1013,7 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { replace_member(rreq); } else { - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); } if (!recovery) { diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index f09796352..7c57ef322 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -59,7 +59,7 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } data_service().commit_blk(rreq->local_blkid()); - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); decr_pending_request_num(); }); } @@ -90,7 +90,7 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { 
m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - m_listener->on_commit(lsn, header, key, blkid, nullptr); + m_listener->on_commit(lsn, header, key, {blkid}, nullptr); } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 636fa5f7c..6a4be1b41 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -125,16 +125,17 @@ class TestReplicatedDB : public homestore::ReplDevListener { TestReplicatedDB() = default; virtual ~TestReplicatedDB() = default; - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + ASSERT_EQ(blkids.size(), 1); auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; Value v{.lsn_ = lsn, .data_size_ = jheader->data_size, .data_pattern_ = jheader->data_pattern, - .blkid_ = blkids, + .blkid_ = blkids[0], .id_ = k.id_}; LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", @@ -151,9 +152,6 @@ class TestReplicatedDB : public homestore::ReplDevListener { if (ctx->is_proposer()) { g_helper->runner().next_task(); } } - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, - std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} - bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} 
dsn={}", g_helper->replica_num(), lsn, diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 2ec091795..5064f738a 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -96,21 +96,19 @@ class SoloReplDevTest : public testing::Test { Listener(SoloReplDevTest& test) : m_test{test} {} virtual ~Listener() = default; - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received on_commit lsn={}", lsn); + HS_REL_ASSERT(!blkids.empty(), "Invalid blkids size"); if (ctx == nullptr) { - m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); + m_test.validate_replay(*repl_dev(), lsn, header, key, blkids[0]); } else { auto req = boost::static_pointer_cast< test_repl_req >(ctx); - req->written_blkids = std::move(blkids); + req->written_blkids = blkids[0]; m_test.on_write_complete(*repl_dev(), req); } } - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, - std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} - AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } From e344d4edbab9fa9e79159db10d29bc7855dbedfe Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Mon, 21 Apr 2025 14:42:55 -0700 Subject: [PATCH 100/130] Add unit test to trigger eviction --- conanfile.py | 2 +- src/lib/common/homestore_config.fbs | 6 ++++- src/lib/homestore.cpp | 2 +- src/lib/index/inplace_btree/index_cp.hpp | 10 +++++--- src/lib/index/inplace_btree/wb_cache.cpp | 4 ++-- src/tests/test_btree_long_running | 30 +++++++++++++++++++++++- 6 files changed, 45 insertions(+), 9 deletions(-) diff --git a/conanfile.py b/conanfile.py index 
02639fa95..675a0fd91 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.11.0" + version = "6.11.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 7996332d2..df950c608 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -143,7 +143,11 @@ table Generic { cache_max_throttle_cnt : uint32 = 4; // writeback cache max q depth - cache_min_throttle_cnt : uint32 = 4; // writeback cache min q deoth + cache_min_throttle_cnt : uint32 = 4; // writeback cache min q depth + + cache_hashmap_nbuckets : uint32 = 1000000; // num buckets for sisl::SimpleHashmap used in wbcache + + cache_evictor_npartitions: uint32 = 1000; // num partitions for lru evictor in the cache // if this value is set to 0, no sanity check will be run; sanity_check_level: uint32 = 1 (hotswap); diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 58ef4d9b8..403bad2f4 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -286,7 +286,7 @@ void HomeStore::do_start() { const auto& inp_params = HomeStoreStaticConfig::instance().input; uint64_t cache_size = resource_mgr().get_cache_size(); - m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, 1000); + m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, HS_DYNAMIC_CONFIG(generic.cache_evictor_npartitions)); if (m_before_services_starting_cb) { m_before_services_starting_cb(); } diff --git a/src/lib/index/inplace_btree/index_cp.hpp b/src/lib/index/inplace_btree/index_cp.hpp index b04b8f052..619e4c82c 100644 --- a/src/lib/index/inplace_btree/index_cp.hpp +++ b/src/lib/index/inplace_btree/index_cp.hpp @@ -131,13 +131,15 @@ struct IndexCPContext : public VDevCPContext { }; #pragma pack() + using dirty_buf_entry_t = std::pair< IndexBufferPtr, BtreeNodePtr >; + public: std::atomic< 
uint64_t > m_num_nodes_added{0}; std::atomic< uint64_t > m_num_nodes_removed{0}; - sisl::ConcurrentInsertVector< IndexBufferPtr > m_dirty_buf_list; + sisl::ConcurrentInsertVector< dirty_buf_entry_t > m_dirty_buf_list; sisl::atomic_counter< int64_t > m_dirty_buf_count{0}; std::mutex m_flush_buffer_mtx; - sisl::ConcurrentInsertVector< IndexBufferPtr >::iterator m_dirty_buf_it; + sisl::ConcurrentInsertVector< dirty_buf_entry_t >::iterator m_dirty_buf_it; iomgr::FiberManagerLib::mutex m_txn_journal_mtx; sisl::io_blob_safe m_txn_journal_buf; @@ -154,7 +156,9 @@ struct IndexCPContext : public VDevCPContext { sisl::io_blob_safe const& journal_buf() const { return m_txn_journal_buf; } - void add_to_dirty_list(const IndexBufferPtr& buf); + // The BtreeNodePtr is added added only to increment the ref count + // which is used by the wbcache to evict the node + void add_to_dirty_list(const IndexBufferPtr& buf, const BtreeNodePtr& node); bool any_dirty_buffers() const; void prepare_flush_iteration(); std::optional< IndexBufferPtr > next_dirty(); diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 1f9563060..ad9eb1be7 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -43,7 +43,7 @@ IndexWBCacheBase& wb_cache() { IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size) : m_vdev{vdev}, - m_cache{evictor, 100000, node_size, + m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size, [](const BtreeNodePtr& node) -> BlkId { return static_cast< IndexBtreeNode* >(node.get())->m_idx_buf->m_blkid; }, @@ -130,7 +130,7 @@ void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf } else { if (node != nullptr) { m_cache.upsert(node); } LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string()); - 
r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); + r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf, node); resource_mgr().inc_dirty_buf_size(m_node_size); } } diff --git a/src/tests/test_btree_long_running b/src/tests/test_btree_long_running index 2e24d18bf..9146064e5 100644 --- a/src/tests/test_btree_long_running +++ b/src/tests/test_btree_long_running @@ -39,7 +39,7 @@ SISL_OPTION_GROUP( (num_iters, "", "num_iters", "number of iterations for rand ops", ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", @@ -269,6 +269,34 @@ TYPED_TEST(BtreeTest, RandomInsert) { this->get_all(); } +TYPED_TEST(BtreeTest, TriggerCacheEviction) { + // restart homestore with smaller cache % + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 1u; + HS_SETTINGS_FACTORY().save(); + }); + + this->restart_homestore(); + + LOGINFO("TriggerCacheEviction test start"); + const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + LOGINFO("Step 1: Do insert for {} entries", num_entries); + for (uint32_t i{0}; i < num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + // this->print(); + } + + this->get_all(); + + // reset cache pct + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 65u; + HS_SETTINGS_FACTORY().save(); + }); + + LOGINFO("TriggerCacheEviction test end"); +} + TYPED_TEST(BtreeTest, SequentialRemove) { 
LOGINFO("SequentialRemove test start"); // Forward sequential insert From 3bcb2ffddc966f3fa838baa6ac0e46283af483d6 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Thu, 24 Apr 2025 15:29:38 -0700 Subject: [PATCH 101/130] Use the index_buffer.is_clean() to determine if it is safe to evict a btree node from cache --- src/lib/index/inplace_btree/index_cp.hpp | 10 +++------- src/lib/index/inplace_btree/wb_cache.cpp | 4 ++-- src/tests/test_btree_long_running | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/lib/index/inplace_btree/index_cp.hpp b/src/lib/index/inplace_btree/index_cp.hpp index 619e4c82c..b04b8f052 100644 --- a/src/lib/index/inplace_btree/index_cp.hpp +++ b/src/lib/index/inplace_btree/index_cp.hpp @@ -131,15 +131,13 @@ struct IndexCPContext : public VDevCPContext { }; #pragma pack() - using dirty_buf_entry_t = std::pair< IndexBufferPtr, BtreeNodePtr >; - public: std::atomic< uint64_t > m_num_nodes_added{0}; std::atomic< uint64_t > m_num_nodes_removed{0}; - sisl::ConcurrentInsertVector< dirty_buf_entry_t > m_dirty_buf_list; + sisl::ConcurrentInsertVector< IndexBufferPtr > m_dirty_buf_list; sisl::atomic_counter< int64_t > m_dirty_buf_count{0}; std::mutex m_flush_buffer_mtx; - sisl::ConcurrentInsertVector< dirty_buf_entry_t >::iterator m_dirty_buf_it; + sisl::ConcurrentInsertVector< IndexBufferPtr >::iterator m_dirty_buf_it; iomgr::FiberManagerLib::mutex m_txn_journal_mtx; sisl::io_blob_safe m_txn_journal_buf; @@ -156,9 +154,7 @@ struct IndexCPContext : public VDevCPContext { sisl::io_blob_safe const& journal_buf() const { return m_txn_journal_buf; } - // The BtreeNodePtr is added added only to increment the ref count - // which is used by the wbcache to evict the node - void add_to_dirty_list(const IndexBufferPtr& buf, const BtreeNodePtr& node); + void add_to_dirty_list(const IndexBufferPtr& buf); bool any_dirty_buffers() const; void prepare_flush_iteration(); std::optional< IndexBufferPtr > next_dirty(); diff --git 
a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index ad9eb1be7..793c2e8ef 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -49,7 +49,7 @@ IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< }, [](const sisl::CacheRecord& rec) -> bool { const auto& hnode = (sisl::SingleEntryHashNode< BtreeNodePtr >&)rec; - return (hnode.m_value->m_refcount.test_le(1)); + return static_cast< IndexBtreeNode* >(hnode.m_value.get())->m_idx_buf->is_clean(); }}, m_node_size{node_size}, m_meta_blk{sb.first} { @@ -130,7 +130,7 @@ void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf } else { if (node != nullptr) { m_cache.upsert(node); } LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string()); - r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf, node); + r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); resource_mgr().inc_dirty_buf_size(m_node_size); } } diff --git a/src/tests/test_btree_long_running b/src/tests/test_btree_long_running index 9146064e5..380a906ab 100644 --- a/src/tests/test_btree_long_running +++ b/src/tests/test_btree_long_running @@ -39,7 +39,7 @@ SISL_OPTION_GROUP( (num_iters, "", "num_iters", "number of iterations for rand ops", ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("7000"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", From 7e8316acb924d6f3a676ddae7025a18504c09404 Mon Sep 17 00:00:00 
2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Thu, 24 Apr 2025 21:16:21 -0700 Subject: [PATCH 102/130] Issue 696 Support remove_repl_dev for solo repl dev (#698) * Issue 696 Support remove_repl_dev for solo repl dev * trigger destroy repl dev listener in solo repl dev remove api --- conanfile.py | 2 +- src/include/homestore/replication_service.hpp | 4 +++ .../replication/repl_dev/raft_repl_dev.cpp | 1 + .../replication/service/generic_repl_svc.cpp | 27 +++++++++++++++++-- .../replication/service/raft_repl_service.cpp | 7 ++++- src/tests/test_common/hs_repl_test_common.hpp | 2 ++ src/tests/test_solo_repl_dev.cpp | 1 + 7 files changed, 40 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index 675a0fd91..1cd3fdbb9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.11.1" + version = "6.12.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index bac805dd5..448bb9afe 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -75,6 +75,10 @@ class ReplApplication { // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; + // Called when the repl dev is destroyed. This interface provides the application a chance to cleanup any resources + // assocated with this listener; + virtual void destroy_repl_dev_listener(group_id_t group_id) = 0; + // Called after all the repl devs are found upon restart of the homestore instance. 
// it is a nice place for upper layer to recovery anything depends on repl_devs virtual void on_repl_devs_init_completed() = 0; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index db12a27f6..ff77e718a 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1320,6 +1320,7 @@ void RaftReplDev::permanent_destroy() { m_data_journal->remove_store(); logstore_service().destroy_log_dev(m_data_journal->logdev_id()); m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::PERMANENT_DESTROYED; }); + // we should destroy repl_dev superblk only after all the resources are cleaned up, so that is crash recovery // occurs, we have a chance to find the stale repl_dev and reclaim all the stale resources. m_rd_sb.destroy(); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index c1263b7fb..b5e1f15c0 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "common/homestore_assert.hpp" #include "replication/service/generic_repl_svc.h" #include "replication/service/raft_repl_service.h" @@ -80,7 +81,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService() {}; +SoloReplService::~SoloReplService(){}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -146,7 +147,29 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t } folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_t group_id) { - return folly::makeSemiFuture< ReplServiceError 
>(ReplServiceError::NOT_IMPLEMENTED); + // RD_LOGI("Removing repl dev for group_id={}", boost::uuids::to_string(group_id)); + auto rdev = get_repl_dev(group_id); + if (rdev.hasError()) { return folly::makeSemiFuture(rdev.error()); } + + auto rdev_ptr = rdev.value(); + + // 1. Firstly stop the repl dev which waits for any outstanding requests to finish + rdev_ptr->stop(); + + // 2. detaches both ways: + // detach rdev from its listener and listener from rdev; + rdev_ptr->detach_listener(); + { + // 3. remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to + // this instance; + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.erase(group_id); + } + + // 4. now destroy the upper layer's listener instance; + m_repl_app->destroy_repl_dev_listener(group_id); + + return folly::makeSemiFuture(ReplServiceError::OK); } void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index abfe84fda..244570c24 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -392,7 +392,10 @@ folly::SemiFuture< ReplServiceError > RaftReplService::remove_repl_dev(group_id_ auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::SERVER_NOT_FOUND); } - return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + auto ret = std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + + decr_pending_request_num(); + return ret; } void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { @@ -559,6 +562,8 @@ void RaftReplService::gc_repl_devs() { // Therefore, we perform it outside the lock scope and then remove group from m_rd_map. 
for (const auto& group_id : groups_to_leave) { m_msg_mgr->leave_group(group_id); + // notify consumer to cleanup any resources associated with the listener itself; + m_repl_app->destroy_repl_dev_listener(group_id); { std::unique_lock lg(m_rd_map_mtx); m_rd_map.erase(group_id); diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 7b93cccb2..4393b13d5 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -115,6 +115,8 @@ class HSReplTestHelper : public HSTestHelper { create_repl_dev_listener(homestore::group_id_t group_id) override { return helper_.get_listener(group_id); } + void destroy_repl_dev_listener(homestore::group_id_t) override {} + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override { diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 5064f738a..9c891f0a0 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -155,6 +155,7 @@ class SoloReplDevTest : public testing::Test { shared< ReplDevListener > create_repl_dev_listener(uuid_t) override { return std::make_shared< Listener >(m_test); } + void destroy_repl_dev_listener(uuid_t) override {} void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); } replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); } From 91442fd99bc91503d1378ba47f1df071fa041b19 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Fri, 25 Apr 2025 15:09:31 +0800 Subject: [PATCH 103/130] Set priority when create RaftReplDev (#695) --- conanfile.py | 2 +- .../homestore/replication/repl_decls.h | 6 ++ 
src/lib/common/homestore_config.fbs | 6 ++ .../replication/repl_dev/raft_repl_dev.cpp | 9 ++- src/lib/replication/repl_dev/solo_repl_dev.h | 7 +- .../replication/service/generic_repl_svc.cpp | 4 +- .../replication/service/raft_repl_service.cpp | 21 +++++- .../replication/service/raft_repl_service.h | 4 + src/tests/test_common/hs_repl_test_common.hpp | 17 +++++ src/tests/test_raft_repl_dev.cpp | 74 +++++++++++++++++++ 10 files changed, 140 insertions(+), 10 deletions(-) diff --git a/conanfile.py b/conanfile.py index 1cd3fdbb9..f73e1da09 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.1" + version = "6.12.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 0602eecfa..bd18d4765 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -74,6 +74,12 @@ struct peer_info { uint64_t replication_idx_; // The elapsed time since the last successful response from this peer, set to 0 on leader uint64_t last_succ_resp_us_; + // The priority for leader election + uint32_t priority_; + // The peer is learner or not + bool is_learner_; + // The peer is new joiner or not + bool is_new_joiner_; }; struct replica_member_info { diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index df950c608..df90c1342 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -304,6 +304,12 @@ table Consensus { // Reading snapshot objects will be done by a background thread asynchronously // instead of synchronous read by Raft worker threads use_bg_thread_for_snapshot_io: bool = true; + + // Maximum number of election timeout rounds to wait during a prioritized leader election process. 
+ // Every election timeout will compare its priority with the target_priority(max priority of the peers initially) + // then decay the target_priority and wait again until its priority >= target_priority. This setting helps us to set proper priority for peers. + // 0 means all members have the same priority. + max_wait_rounds_of_priority_election: uint32 = 2; } table HomeStoreSettings { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index ff77e718a..88db1263b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1148,7 +1148,10 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { for (auto const& pinfo : rep_status) { pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_}); + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_, + .is_learner_ = pinfo.is_learner_, + .is_new_joiner_ = pinfo.is_new_joiner_}); } return pi; } @@ -1250,8 +1253,8 @@ nuraft::ptr< nuraft::cluster_config > RaftReplDev::load_config() { if (!js.contains("config")) { auto cluster_conf = nuraft::cs_new< nuraft::cluster_config >(); - cluster_conf->get_servers().push_back( - nuraft::cs_new< nuraft::srv_config >(m_raft_server_id, my_replica_id_str())); + cluster_conf->get_servers().push_back(nuraft::cs_new< nuraft::srv_config >( + m_raft_server_id, 0, my_replica_id_str(), "", false, raft_leader_priority)); js["config"] = serialize_cluster_config(*cluster_conf); } return deserialize_cluster_config(js["config"]); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index f4572ce0e..80bfc54bf 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -51,7 +51,12 @@ class SoloReplDev : public ReplDev { bool 
is_leader() const override { return true; } replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { - return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}}; + return std::vector< peer_info >{peer_info{.id_ = m_group_id, + .replication_idx_ = 0, + .last_succ_resp_us_ = 0, + .priority_ = 1, + .is_learner_ = false, + .is_new_joiner_ = false}}; } bool is_ready_for_traffic() const override { return true; } void purge() override {} diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index b5e1f15c0..4c76bef75 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -119,8 +119,8 @@ void SoloReplService::stop() { hs()->data_service().stop(); } -AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, - std::set< replica_id_t > const& members) { +AsyncReplResult< shared< ReplDev > > +SoloReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.create(); rd_sb->group_id = group_id; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 244570c24..bda83e600 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -59,6 +59,17 @@ ReplServiceError RaftReplService::to_repl_error(nuraft::cmd_result_code code) { return ret; } +// NuRaft priority decay coefficient is set to 0.8(currently not configurable). 
For more details, please refer to +// https://github.com/eBay/NuRaft/blob/master/docs/leader_election_priority.md +int32_t RaftReplService::compute_raft_follower_priority() { + auto max_wait_round = std::min(raft_priority_election_round_upper_limit, + HS_DYNAMIC_CONFIG(consensus.max_wait_rounds_of_priority_election)); + if (max_wait_round == 0) { return raft_leader_priority; } + auto priority = 1 + static_cast< int32_t >( + std::ceil(raft_leader_priority * std::pow(raft_priority_decay_coefficient, max_wait_round))); + return priority; +} + RaftReplService::RaftReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} { m_config_sb_bufs.reserve(100); meta_service().register_handler( @@ -333,14 +344,18 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t return make_async_error< shared< ReplDev > >(to_repl_error(status.error())); } + auto follower_priority = compute_raft_follower_priority(); + auto my_id = m_repl_app->get_my_repl_id(); for (auto& member : members) { if (member == my_id) { continue; } // Skip myself do { - auto const result = m_msg_mgr->add_member(group_id, member).get(); + auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, boost::uuids::to_string(member), "", + false, follower_priority); + auto const result = m_msg_mgr->add_member(group_id, srv_config).get(); if (result) { - LOGINFOMOD(replication, "Groupid={}, new member={} added", boost::uuids::to_string(group_id), - boost::uuids::to_string(member)); + LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", boost::uuids::to_string(group_id), + boost::uuids::to_string(member), follower_priority); break; } else if (result.error() != nuraft::CONFIG_CHANGING) { LOGWARNMOD(replication, "Groupid={}, add member={} failed with error={}", diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 9a53ad07d..84affd264 100644 --- 
a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -33,6 +33,9 @@ namespace homestore { constexpr auto cert_change_timeout = std::chrono::seconds(1200); constexpr auto cert_check_sleep = std::chrono::seconds(1); +constexpr int32_t raft_leader_priority = 100; +constexpr double raft_priority_decay_coefficient = 0.8; +constexpr uint32_t raft_priority_election_round_upper_limit = 5; struct repl_dev_superblk; class RaftReplDev; @@ -56,6 +59,7 @@ class RaftReplService : public GenericReplService, RaftReplService(cshared< ReplApplication >& repl_app); static ReplServiceError to_repl_error(nuraft::cmd_result_code code); + int32_t compute_raft_follower_priority(); ///////////////////// Overrides of nuraft_mesg::MessagingApplication //////////////////// std::string lookup_peer(nuraft_mesg::peer_id_t const&) override; diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 4393b13d5..92ff45a69 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -17,6 +17,8 @@ */ #pragma once +#include "raft_repl_test_base.hpp" + #include #include #include @@ -35,6 +37,8 @@ #include #include "test_common/homestore_test_common.hpp" +#include + SISL_OPTION_GROUP(test_repl_common_setup, (replicas, "", "replicas", "Total number of replicas", ::cxxopts::value< uint32_t >()->default_value("3"), "number"), @@ -298,6 +302,19 @@ class HSReplTestHelper : public HSTestHelper { auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); ASSERT_EQ(v.hasValue(), true) << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + auto follower_priority = raftService.compute_raft_follower_priority(); + auto repl_dev = v.value(); + ASSERT_EQ(my_replica_id_, repl_dev->get_leader_id()); + auto peer_info = 
repl_dev->get_replication_status(); + for (auto pinfo : peer_info) { + LOGINFO("Replica={} has priority={}", boost::uuids::to_string(pinfo.id_), pinfo.priority_); + if (pinfo.id_ == my_replica_id_) { + ASSERT_EQ(raft_leader_priority, pinfo.priority_); + } else { + ASSERT_EQ(follower_priority, pinfo.priority_); + } + } } } diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 6e21a64e8..f6d458943 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -484,6 +484,80 @@ TEST_F(RaftReplDevTest, LargeDataWrite) { g_helper->sync_for_cleanup_start(); } +TEST_F(RaftReplDevTest, PriorityLeaderElection) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >(); + if (g_helper->replica_num() == 0) { + auto leader = this->wait_and_get_leader_id(); + ASSERT_EQ(leader, g_helper->my_replica_id()); + } + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + + LOGINFO("Restart leader"); + if (g_helper->replica_num() == 0) { g_helper->restart_homestore(); } + g_helper->sync_for_test_start(); + + LOGINFO("Validate leader switched"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + auto leader = this->wait_and_get_leader_id(); + if (g_helper->replica_num() == 0) { ASSERT_NE(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + if (leader == g_helper->my_replica_id()) { + LOGINFO("Resign and trigger a priority leader election"); + // resign and trigger a priority leader election + g_helper->restart_homestore(); + } + g_helper->sync_for_test_start(); + + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + leader = this->wait_and_get_leader_id(); + 
LOGINFO("Validate leader switched back to initial replica"); + if (g_helper->replica_num() == 0) { ASSERT_EQ(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + LOGINFO("Post restart write the data again on the leader"); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + LOGINFO("Validate all data written (including pre-restart data) by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); +} + +TEST_F(RaftReplDevTest, ComputePriority) { + g_helper->sync_for_test_start(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 0; }); + HS_SETTINGS_FACTORY().save(); + ASSERT_EQ(raftService.compute_raft_follower_priority(), raft_leader_priority); + + for (auto i = 1; i <= int(raft_priority_election_round_upper_limit); i++) { + HS_SETTINGS_FACTORY().modifiable_settings( + [i](auto& s) { s.consensus.max_wait_rounds_of_priority_election = i; }); + HS_SETTINGS_FACTORY().save(); + auto follower_priority = raftService.compute_raft_follower_priority(); + // Simulate nuraft algorithm + auto decayed_priority = raft_leader_priority; + for (auto j = 1; j <= i; j++) { + int gap = std::max((int)10, decayed_priority / 5); + decayed_priority = std::max(1, decayed_priority - gap); + } + LOGINFO("Follower priority={} decayed_priority={}", follower_priority, decayed_priority); + ASSERT_TRUE(follower_priority >= decayed_priority); + } + // Set back to default value + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 2; }); + HS_SETTINGS_FACTORY().save(); + g_helper->sync_for_cleanup_start(); +} + int main(int argc, char* argv[]) { int parsed_argc = argc; char** orig_argv = argv; From f07eca2dafc2ed38d129f86c81d256da082238d4 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Fri, 25 Apr 2025 18:22:10 +0800 Subject: [PATCH 
104/130] add traceid for replace member --- conanfile.py | 2 +- src/include/homestore/replication_service.hpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 8 +++----- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- src/lib/replication/service/generic_repl_svc.cpp | 3 ++- src/lib/replication/service/generic_repl_svc.h | 3 ++- src/lib/replication/service/raft_repl_service.cpp | 7 +++++-- src/lib/replication/service/raft_repl_service.h | 3 ++- 8 files changed, 17 insertions(+), 13 deletions(-) diff --git a/conanfile.py b/conanfile.py index f73e1da09..605f5f84d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.2" + version = "6.12.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 448bb9afe..23ee2422c 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -43,7 +43,7 @@ class ReplicationService { virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, - uint32_t commit_quorum = 0) const = 0; + uint32_t commit_quorum = 0, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 88db1263b..f9f9b96e6 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -137,12 +137,10 @@ bool RaftReplDev::join_group() { } AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum) { - // Fixme: traceID for replace member - uint64_t trace_id = 
0; - + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { if (is_stopping()) { - LOGINFO("repl dev is being shutdown!"); + LOGINFO("repl dev is being shutdown! trace_id={}", trace_id); return make_async_error<>(ReplServiceError::STOPPING); } incr_pending_request_num(); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 19b672e7b..0dec2c45b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -225,7 +225,7 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); AsyncReplResult<> replace_member(const replica_member_info& member_out, const replica_member_info& member_in, - uint32_t commit_quorum); + uint32_t commit_quorum, uint64_t trace_id = 0); folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 4c76bef75..082ef746b 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -193,7 +193,8 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum) const { + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index acdff7bd4..8fc33064c 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -74,7 +74,8 @@ class SoloReplService : public GenericReplService { 
folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0) const override; + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index bda83e600..e434f716b 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -458,12 +458,15 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum) const { + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->replace_member(member_out, member_in, commit_quorum) + ->replace_member(member_out, member_in, commit_quorum, trace_id) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { return make_async_error<>(e.error()); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 84affd264..27bad10f0 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -78,7 +78,8 @@ class 
RaftReplService : public GenericReplService, folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0) const override; + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); From 7ba29b7c046b4536f0e132c65bd1dd112ac4113d Mon Sep 17 00:00:00 2001 From: Yaming Kuang Date: Mon, 28 Apr 2025 11:12:18 -0700 Subject: [PATCH 105/130] adopt api signature change --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 605f5f84d..0a50658d5 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.3" + version = "6.12.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f9f9b96e6..13064c954 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1743,7 +1743,7 @@ void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { void RaftReplDev::pause_statemachine() { if (!raft_server()->is_state_machine_execution_paused()) { - raft_server()->pause_state_machine_exeuction(); + raft_server()->pause_state_machine_execution(); while (!raft_server()->wait_for_state_machine_pause(100)) { RD_LOGD(NO_TRACE_ID, "wait for statemachine pause!"); } From 1a669fe4e3abb398e9cc7b4f2850814147218d66 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 29 Apr 2025 12:43:34 +0800 Subject: [PATCH 
106/130] support handling config rollback and add periodical notification of the latest committed lsn to upper layer (#703) --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 20 +++++++++----- .../replication/repl_dev/raft_repl_dev.cpp | 27 +++++++------------ src/lib/replication/repl_dev/raft_repl_dev.h | 5 +--- .../repl_dev/raft_state_machine.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 4 --- src/tests/test_common/raft_repl_test_base.hpp | 12 +++++++++ src/tests/test_solo_repl_dev.cpp | 3 +++ 8 files changed, 42 insertions(+), 33 deletions(-) diff --git a/conanfile.py b/conanfile.py index 0a50658d5..83709b445 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.4" + version = "6.13.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index c0ac700fd..dfe4f1122 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -270,6 +270,14 @@ class ReplDevListener { virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief periodically called to notify the lastest committed lsn to the listener. + /// NOTE: this callback will block the thread of flushing the latest committed lsn into repl_dev superblk as DC_LSN, + /// pls take care if there is any heavy or blocking operation in this callback. + /// + /// @param lsn - The lasted committed log sequence number so far + /// + virtual void notify_committed_lsn(int64_t lsn) = 0; + /// @brief Called when the log entry has been received by the replica dev. /// /// On recovery, this is called from a random worker thread before the raft server is started.
It is @@ -307,6 +315,10 @@ class ReplDevListener { virtual void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief Called when the config log entry has been rolled back. + /// @param lsn - The log sequence number getting rolled back + virtual void on_config_rollback(int64_t lsn) = 0; + /// @brief Called when the replDev is created after restart. The consumer is expected to recover all the modules /// necessary to replay/commit the logs. virtual void on_restart() = 0; @@ -385,10 +397,10 @@ class ReplDevListener { /// @brief ask upper layer to handle no_space_left event // @param lsn - on which repl_lsn no_space_left happened // @param chunk_id - on which chunk no_space_left happened - virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) { return; } + virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id) {}; + virtual void on_log_replay_done(const group_id_t& group_id){}; private: std::weak_ptr< ReplDev > m_repl_dev; @@ -505,10 +517,6 @@ class ReplDev { } } - // pause/resume statemachine(commiting thread) - virtual void pause_statemachine() = 0; - virtual void resume_statemachine() = 0; - // complete all the requests that are in progress and start refusing new reqs virtual void quiesce_reqs() = 0; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 13064c954..d005e5c9d 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1036,6 +1036,13 @@ void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config } } +void RaftReplDev::handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& conf) { + RD_LOGD(NO_TRACE_ID, "roll back config on lsn 
{}", lsn); + // keep this variable in case it is needed later + (void)conf; + m_listener->on_config_rollback(lsn); +} + void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); @@ -1444,12 +1451,14 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, } void RaftReplDev::flush_durable_commit_lsn() { + auto const lsn = m_commit_upto_lsn.load(); + m_listener->notify_committed_lsn(lsn); + if (is_destroyed()) { RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore flush durable commit lsn"); return; } - auto const lsn = m_commit_upto_lsn.load(); RD_LOGT(NO_TRACE_ID, "Flushing durable commit lsn to {}", lsn); std::unique_lock lg{m_sb_mtx}; m_rd_sb->durable_commit_lsn = lsn; @@ -1741,22 +1750,6 @@ void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { } } -void RaftReplDev::pause_statemachine() { - if (!raft_server()->is_state_machine_execution_paused()) { - raft_server()->pause_state_machine_execution(); - while (!raft_server()->wait_for_state_machine_pause(100)) { - RD_LOGD(NO_TRACE_ID, "wait for statemachine pause!"); - } - } -} - -void RaftReplDev::resume_statemachine() { - if (raft_server()->is_state_machine_execution_paused()) { - raft_server()->resume_state_machine_execution(); - RD_LOGD(NO_TRACE_ID, "statemachine is resumed!"); - } -} - void RaftReplDev::quiesce_reqs() { // all the block allocation happens in rreq->init. 
so after we wait for all the pending req has been initialized we // can make sure diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0dec2c45b..bd6a6c448 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -277,6 +277,7 @@ class RaftReplDev : public ReplDev, void handle_commit(repl_req_ptr_t rreq, bool recovery = false); void handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf); void handle_rollback(repl_req_ptr_t rreq); + void handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& old_conf); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, bool is_data_channel, @@ -356,10 +357,6 @@ class RaftReplDev : public ReplDev, */ bool need_skip_processing(const repl_lsn_t lsn) { return lsn <= m_rd_sb->last_snapshot_lsn; } - // pause/resume statemachine(commiting thread) - void pause_statemachine(); - void resume_statemachine(); - void quiesce_reqs(); void resume_accepting_reqs(); diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index b5f9099f4..b2cce85bb 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -244,7 +244,7 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { RD_LOGD(NO_TRACE_ID, "Raft channel: Rollback cluster conf , log_idx = {}", log_idx); - // TODO:add more logic here if necessary + m_rd.handle_config_rollback(s_cast< repl_lsn_t >(log_idx), conf); } void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& params) { diff --git 
a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 80bfc54bf..5174e5edd 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -77,10 +77,6 @@ class SoloReplDev : public ReplDev { uint32_t get_blk_size() const override; - // pause/resume statemachine(commiting thread) - void pause_statemachine() override { return; } - void resume_statemachine() override { return; } - void quiesce_reqs() override { return; } void resume_accepting_reqs() override { return; } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 6a4be1b41..0dbd539e3 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -176,6 +176,18 @@ class TestReplicatedDB : public homestore::ReplDevListener { g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper< ReplServiceError >(error)); } + void notify_committed_lsn(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received notify_committed_lsn={}", g_helper->replica_num(), lsn); + } + + void on_config_rollback(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received config rollback at lsn={}", g_helper->replica_num(), lsn); + } + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override { + LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}, chunk_id={}", g_helper->replica_num(), + lsn, chunk_id); + } + AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { std::lock_guard< std::mutex > lock(m_snapshot_lock); auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 9c891f0a0..dbfd304cf 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -140,6 +140,9 @@ class SoloReplDevTest : 
public testing::Test { } void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {} void on_destroy(const group_id_t& group_id) override {} + void notify_committed_lsn(int64_t lsn) override {} + void on_config_rollback(int64_t lsn) override {} + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override {} }; class Application : public ReplApplication { From 58586c4b7762aca5acdf471d5b819ece32a6fc0d Mon Sep 17 00:00:00 2001 From: yawzhang Date: Mon, 28 Apr 2025 16:09:41 +0800 Subject: [PATCH 107/130] fix: init rkey with trace id --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 83709b445..b517fe2de 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.1" + version = "6.13.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index d005e5c9d..b2696d19b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -332,7 +332,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } auto status = init_req_ctx( - rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), .traceID = tid}, data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, key, data.size, m_listener); From 3d87d6c347965b0825a24d8640c1b236e53e6df0 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:53:44 -0700 Subject: [PATCH 108/130] Bump up hub.tess.io/sds/sds_develop in DockerFile (#709) --- .jenkins/Dockerfile | 2 +- conanfile.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.jenkins/Dockerfile b/.jenkins/Dockerfile index 20c4489b0..dcfdd9d65 100644 --- a/.jenkins/Dockerfile +++ b/.jenkins/Dockerfile @@ -1,5 +1,5 @@ # ########## ####### ############ -FROM hub.tess.io/sds/sds_develop:4.x-latest +FROM hub.tess.io/sds/sds_develop:7.x-latest LABEL description="Automated HomeStore compilation" WORKDIR /output diff --git a/conanfile.py b/conanfile.py index b517fe2de..fa63d4d8b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.2" + version = "6.13.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From 30f6b6808cdd29c1b6bb0241e5495fa77796e7e1 Mon Sep 17 00:00:00 2001 From: Brian Szmyd Date: Wed, 30 Apr 2025 17:26:44 -0600 Subject: [PATCH 109/130] Move sanitizer builds to its own location. (#710) --- conanfile.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index fa63d4d8b..ab9c470b8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -65,7 +65,12 @@ def imports(self): def layout(self): self.folders.source = "." 
- self.folders.build = join("build", str(self.settings.build_type)) + if self.options.get_safe("sanitize"): + self.folders.build = join("build", "Sanitized") + elif self.options.get_safe("coverage"): + self.folders.build = join("build", "Coverage") + else: + self.folders.build = join("build", str(self.settings.build_type)) self.folders.generators = join(self.folders.build, "generators") self.cpp.source.includedirs = ["src/include"] From 7e64bd554014f6b6acce8538ee1ddef5afa365d1 Mon Sep 17 00:00:00 2001 From: Sanal Date: Thu, 1 May 2025 10:09:18 -0700 Subject: [PATCH 110/130] Add async_write, alloc blks for solo repl dev. (#706) Add support for async write of data and journal, and alloc blks, for solo repl dev. Raft repl dev doesn't support these operations. This is needed for nublocks, where it needs to also write free blkids to the journal. Free blocks are obtained after writing the new blkids to the index. Add APIs for allocation and write for a vector of blkids. Raft repl dev currently uses only a single blkid. Change the solo repl dev test to support a vector of blkids.
--- conanfile.py | 2 +- src/include/homestore/blkdata_service.hpp | 26 ++- src/include/homestore/replication/repl_dev.h | 57 +++++- src/lib/blkdata_svc/blkdata_service.cpp | 42 +++- src/lib/replication/repl_dev/common.cpp | 41 +++- .../replication/repl_dev/raft_repl_dev.cpp | 13 +- src/lib/replication/repl_dev/raft_repl_dev.h | 18 ++ .../replication/repl_dev/solo_repl_dev.cpp | 102 +++++++++- src/lib/replication/repl_dev/solo_repl_dev.h | 9 + src/tests/test_solo_repl_dev.cpp | 179 ++++++++++++------ 10 files changed, 399 insertions(+), 90 deletions(-) diff --git a/conanfile.py b/conanfile.py index ab9c470b8..3699ed10c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.4" + version = "6.13.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index fff670f44..786375d4f 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -114,6 +114,18 @@ class BlkDataService { folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, MultiBlkId const& in_blkids, bool part_of_batch = false); + /** + * @brief : asynchronous write with input block ids; + * + * @param sgs : the data buffer that needs to be written + * @param hints : blk alloc hints + * @param in_blkids : input block ids that this write should be written to; + * @param cb : callback that will be triggered after write completes + * @param part_of_batch : is this write part of a batch; + */ + folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& in_blkids, + bool part_of_batch = false); + /** * @brief Asynchronously reads data from the specified block ID into the provided buffer. 
* @@ -147,7 +159,8 @@ class BlkDataService { BlkAllocStatus commit_blk(MultiBlkId const& bid); /** - * @brief Allocates a contiguous block of disk space of the given size. + * @brief Allocates a contiguous block of disk space of the given size. This API should be called that when consumer + * is expecting blks only allocated on same chunk. * * @param size The size of the block to allocate, in bytes. * @param hints Hints for how to allocate the block. @@ -156,6 +169,17 @@ class BlkDataService { */ BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, MultiBlkId& out_blkids); + /** + * @brief Allocates blocks of disk space of the given size.This API should be called when consumer is expecting blk + * allocation happen on different chunks is possible and acceptable. + * + * @param size The size of the block to allocate, in bytes. + * @param hints Hints for how to allocate the block. + * @param out_blkids Output parameter that will be filled with the IDs of the allocated blocks. + * @return The status of the block allocation attempt. + */ + BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, std::vector< BlkId >& out_blkids); + /** * @brief Asynchronously frees the specified block IDs. * It is asynchronous because it might need to wait for pending read to complete if same block is being read and not diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index dfe4f1122..d6caf2711 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -130,7 +130,16 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: sisl::blob const& header() const { return m_header; } sisl::blob const& key() const { return m_key; } - MultiBlkId const& local_blkid() const { return m_local_blkid; } + MultiBlkId const& local_blkid() const { + // Currently used by raft repl dev only where a single blob is expected. 
+ // Code checks if its a valid blkid so return a dummy blkid. + if (!m_local_blkids.empty()) + return m_local_blkids[0]; + else + return dummy_blkid; + } + + std::vector< MultiBlkId >& local_blkids() { return m_local_blkids; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } const char* data() const { DEBUG_ASSERT(m_data != nullptr, @@ -141,6 +150,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool has_state(repl_req_state_t s) const { return m_state.load() & uint32_cast(s); } repl_journal_entry const* journal_entry() const { return m_journal_entry; } uint32_t journal_entry_size() const; + uint32_t blkids_serialized_size() const; bool is_localize_pending() const { return m_is_jentry_localize_pending; } bool has_linked_data() const { return (m_op_code == journal_type_t::HS_DATA_LINKED); } @@ -149,6 +159,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: /////////////////////// Non modifiers methods ////////////////// std::string to_string() const; std::string to_compact_string() const; + std::string blkids_to_string() const; Clock::time_point created_time() const { return m_start_time; } void set_created_time() { m_start_time = Clock::now(); } bool is_expired() const; @@ -195,7 +206,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool save_fetched_data(sisl::GenericClientResponse const& fetched_data, uint8_t const* data, uint32_t data_size); void set_remote_blkid(RemoteBlkId const& rbid) { m_remote_blkid = rbid; } - void set_local_blkid(MultiBlkId const& lbid) { m_local_blkid = lbid; } // Only used during recovery + void set_local_blkids(std::vector< MultiBlkId > const& lbids) { m_local_blkids = std::move(lbids); } void set_is_volatile(bool is_volatile) { m_is_volatile.store(is_volatile); } void set_lsn(int64_t lsn); void add_state(repl_req_state_t s); @@ -226,9 +237,10 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< 
repl_req_ctx, boost:: std::atomic< bool > m_is_volatile{true}; // Is the log still in memory and not flushed to disk yet /////////////// Data related section ///////////////// - MultiBlkId m_local_blkid; // Local BlkId for the data - RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data - uint8_t const* m_data; // Raw data pointer containing the actual data + static inline MultiBlkId dummy_blkid; + std::vector< MultiBlkId > m_local_blkids; // Local BlkId for the data + RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data + uint8_t const* m_data; // Raw data pointer containing the actual data /////////////// Journal/Buf related section ///////////////// std::variant< std::unique_ptr< uint8_t[] >, raft_buf_ptr_t > m_journal_buf; // Buf for the journal entry @@ -400,7 +412,7 @@ class ReplDevListener { virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id){}; + virtual void on_log_replay_done(const group_id_t& group_id) {}; private: std::weak_ptr< ReplDev > m_repl_dev; @@ -411,6 +423,39 @@ class ReplDev { ReplDev() = default; virtual ~ReplDev() { detach_listener(); } + /// @brief Allocates blkids from the storage engine to write the value into. Storage + /// engine returns a blkid_list in cases where single contiguous blocks are not + /// available. + /// + /// @param data_size - Size of the data. + /// @param hints - Specify block allocation hints. + /// @param out_blkids - List of bilkid's which may not be contiguous. + virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) = 0; + + /// @brief Write data locally using the specified blkid's. Data is split across the blkids. + /// @param blkids - List of blkid's where data will be written. 
+ /// @param value - vector of io buffers that contain value for the key. + /// @param part_of_batch - Whether the write is part of a batch. If part of the batch, then submit_batch needs to be called + /// at the end + /// @return A Future with std::error_code to notify whether the data was written successfully, or any error code in case + /// of failure + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) = 0; + + /// @brief Creates a log/journal entry and calls the on_commit listener callback. + /// @param blkids - List of blkid's where data was written. + /// @param header - Blob representing the header (it is opaque and will be copied + /// as-is to the journal entry) + /// @param key - Blob representing the key (it is opaque and will be copied as-is to + /// the journal entry). + /// @param data_size - Size of the data. + /// @param ctx - User supplied context which will be passed to listener callbacks + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) = 0; + /// @brief Replicate the data to the replica set. This method goes through the /// following steps: /// Step 1: Allocates blkid from the storage engine to write the value into.
Storage diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 5e80ac7e0..1decfb2a2 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -189,8 +189,35 @@ folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const } } +folly::Future< std::error_code > +BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& blkids, bool part_of_batch) { + if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + incr_pending_request_num(); + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); + for (const auto& blkid : blkids) { + s_futs.emplace_back(async_write(sgs, blkid, part_of_batch)); + } + decr_pending_request_num(); + return collect_all_futures(s_futs); +} + BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { - HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested"); + if (is_stopping()) return BlkAllocStatus::FAILED; + incr_pending_request_num(); + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); + blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); + + auto ret = m_vdev->alloc_blks(nblks, hints, out_blkids); + decr_pending_request_num(); + return ret; +} + +BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, + std::vector< BlkId >& out_blkids) { + if (is_stopping()) return BlkAllocStatus::FAILED; + incr_pending_request_num(); + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); return m_vdev->alloc_blks(nblks, hints, out_blkids); @@ -235,6 +262,19 @@ void BlkDataService::start() { std::move(std::make_unique< 
DataSvcCPCallbacks >(m_vdev))); } +void BlkDataService::stop() { + start_stopping(); + // we have no way to track the completion of each async io in detail which should be done at the iomanager level, so + // we just wait for 3 seconds, and we expect each io will be completed within this time. + + // TODO: find a better solution to track the completion of these async calls + std::this_thread::sleep_for(std::chrono::milliseconds(3000)); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } +} + uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); } diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 6a39256f9..2782a36a5 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -63,7 +63,7 @@ repl_req_ctx::~repl_req_ctx() { } void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { - uint32_t val_size = has_linked_data() ? m_local_blkid.serialized_size() : 0; + uint32_t val_size = has_linked_data() ? blkids_serialized_size() : 0; uint32_t entry_size = sizeof(repl_journal_entry) + m_header.size() + m_key.size() + val_size; if (is_raft_buf) { @@ -94,14 +94,25 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { } if (has_linked_data()) { - auto const b = m_local_blkid.serialize(); - std::memcpy(raw_ptr, b.cbytes(), b.size()); + for (const auto& blkid : m_local_blkids) { + auto const b = blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); + } } } uint32_t repl_req_ctx::journal_entry_size() const { return sizeof(repl_journal_entry) + m_header.size() + m_key.size() + - (has_linked_data() ? m_local_blkid.serialized_size() : 0); + (has_linked_data() ?
blkids_serialized_size() : 0); +} + +uint32_t repl_req_ctx::blkids_serialized_size() const { + uint32_t blkids_serialized_size = 0; + for (const auto& blkid : m_local_blkids) { + blkids_serialized_size += blkid.serialized_size(); + } + return blkids_serialized_size; } void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_hdr_key) { @@ -128,7 +139,7 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list // if the committed_blk_id is already present, use it and skip allocation and commitment LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, rkey().to_string()); - m_local_blkid = hints_result.value().committed_blk_id.value(); + m_local_blkids.emplace_back(hints_result.value().committed_blk_id.value()); add_state(repl_req_state_t::BLK_ALLOCATED); add_state(repl_req_state_t::DATA_RECEIVED); add_state(repl_req_state_t::DATA_WRITTEN); @@ -138,14 +149,19 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list return ReplServiceError::OK; } + std::vector< BlkId > blkids; auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), - hints_result.value(), m_local_blkid); + hints_result.value(), blkids); if (status != BlkAllocStatus::SUCCESS) { LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status); DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } + + for (auto& blkid : blkids) { + m_local_blkids.emplace_back(blkid); + } add_state(repl_req_state_t::BLK_ALLOCATED); return ReplServiceError::OK; } @@ -246,7 +262,7 @@ std::string repl_req_ctx::to_string() const { return fmt::format("repl_key=[{}], lsn={} state=[{}] m_headersize={} m_keysize={} is_proposer={} " "local_blkid={} remote_blkid={}", m_rkey.to_string(), m_lsn, req_state_name(uint32_cast(state())), 
m_header.size(), m_key.size(), - m_is_proposer, m_local_blkid.to_string(), m_remote_blkid.blkid.to_string()); + m_is_proposer, blkids_to_string(), m_remote_blkid.blkid.to_string()); } std::string repl_req_ctx::to_compact_string() const { @@ -255,7 +271,16 @@ std::string repl_req_ctx::to_compact_string() const { } return fmt::format("dsn={} term={} lsn={} op={} local_blkid={} state=[{}]", m_rkey.dsn, m_rkey.term, m_lsn, - enum_name(m_op_code), m_local_blkid.to_string(), req_state_name(uint32_cast(state()))); + enum_name(m_op_code), blkids_to_string(), req_state_name(uint32_cast(state()))); +} + +std::string repl_req_ctx::blkids_to_string() const { + std::string str = fmt::format("["); + for (const auto& blkid : m_local_blkids) { + fmt::format_to(std::back_inserter(str), "{} ", blkid.to_string()); + } + fmt::format_to(std::back_inserter(str), "]"); + return str; } bool repl_req_ctx::is_expired() const { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index b2696d19b..dd42dd4cd 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -331,10 +331,13 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - auto status = init_req_ctx( - rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), .traceID = tid}, - data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, - key, data.size, m_listener); + auto status = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = tid}, + data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, + true /* is_proposer */, header, key, data.size, m_listener); if (status != ReplServiceError::OK) { RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); @@ -1638,7 +1641,7 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); data_size = entry_blkid.blk_count() * get_blk_size(); - rreq->set_local_blkid(entry_blkid); + rreq->set_local_blkids({entry_blkid}); rreq->add_state(repl_req_state_t::BLK_ALLOCATED); rreq->add_state(repl_req_state_t::DATA_RECEIVED); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index bd6a6c448..42d100ebb 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -229,6 +229,24 @@ class RaftReplDev : public ReplDev, folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// + virtual std::error_code alloc_blks(uint32_t size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return std::make_error_code(std::errc::operation_not_supported); + } + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_not_supported)); + } + + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + } + void async_alloc_write(sisl::blob const& header, sisl::blob const& 
key, sisl::sg_list const& value, repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 7c57ef322..131c09abb 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -39,7 +39,7 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& // If it is header only entry, directly write to the journal if (rreq->has_linked_data() && !rreq->has_state(repl_req_state_t::DATA_WRITTEN)) { // Write the data - data_service().async_write(value, rreq->local_blkid()).thenValue([this, rreq = std::move(rreq)](auto&& err) { + data_service().async_write(value, rreq->local_blkids()).thenValue([this, rreq = std::move(rreq)](auto&& err) { HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); @@ -58,12 +58,92 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - data_service().commit_blk(rreq->local_blkid()); - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); + for (const auto& blkid : rreq->local_blkids()) { + data_service().commit_blk(blkid); + } + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkids(), rreq); decr_pending_request_num(); }); } +std::error_code SoloReplDev::alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) { + if (is_stopping()) { return std::make_error_code(std::errc::operation_canceled); } + + incr_pending_request_num(); + std::vector< BlkId > blkids; + auto status = + 
data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints, blkids); + if (status != BlkAllocStatus::SUCCESS) { + DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); + decr_pending_request_num(); + return std::make_error_code(std::errc::no_space_on_device); + } + for (auto& blkid : blkids) { + out_blkids.emplace_back(blkid); + } + decr_pending_request_num(); + return std::error_code{}; +} + +folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch, + trace_id_t tid) { + if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } + + incr_pending_request_num(); + HS_REL_ASSERT_GT(blkids.size(), 0, "Empty blkid vec"); + std::vector< folly::Future< std::error_code > > futs; + futs.reserve(blkids.size()); + sisl::sg_iterator sg_it{value.iovs}; + + for (const auto& blkid : blkids) { + auto sgs_size = blkid.blk_count() * data_service().get_blk_size(); + const auto iovs = sg_it.next_iovs(sgs_size); + uint32_t total_size = 0; + for (auto& iov : iovs) { + total_size += iov.iov_len; + } + if (total_size != sgs_size) { + LOGINFO("Block size mismatch total_size={} sgs_size={}", total_size, sgs_size); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::invalid_argument)); + } + sisl::sg_list sgs{sgs_size, iovs}; + futs.emplace_back(data_service().async_write(sgs, blkid, part_of_batch)); + } + + return folly::collectAllUnsafe(futs).thenValue([this](auto&& v_res) { + for (const auto& err_c : v_res) { + if (sisl_unlikely(err_c.value())) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::io_error)); + } + } + + decr_pending_request_num(); + return folly::makeFuture< std::error_code >(std::error_code{}); + }); +} + +void SoloReplDev::async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob 
const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t rreq, trace_id_t tid) { + if (is_stopping()) { return; } + incr_pending_request_num(); + + // We expect clients to provide valid repl req ctx with blocks allocated. + HS_REL_ASSERT(rreq, "Invalid repl req ctx"); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->set_local_blkids(blkids); + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, + data_size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, + key, data_size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in initializing repl req context."); + + // Write to journal. + write_journal(std::move(rreq)); +} + void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { repl_journal_entry const* entry = r_cast< repl_journal_entry const* >(buf.bytes()); uint32_t remain_size = buf.size() - sizeof(repl_journal_entry); @@ -81,22 +161,27 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx raw_ptr += entry->key_size; remain_size -= entry->key_size; - sisl::blob value_blob{raw_ptr, remain_size}; - MultiBlkId blkid; - if (remain_size) { blkid.deserialize(value_blob, true /* copy */); } + std::vector< MultiBlkId > blkids; + while (remain_size > 0) { + MultiBlkId blkid; + sisl::blob value_blob{raw_ptr, sizeof(BlkId)}; + blkid.deserialize(value_blob, true /* copy */); + raw_ptr += sizeof(BlkId); + remain_size -= sizeof(BlkId); + blkids.push_back(blkid); + } m_listener->on_pre_commit(lsn, header, key, nullptr); auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - m_listener->on_commit(lsn, header, key, {blkid}, nullptr); + m_listener->on_commit(lsn, header, key, blkids, nullptr); } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch, trace_id_t tid) { if 
(is_stopping()) { - LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); } incr_pending_request_num(); @@ -107,7 +192,6 @@ folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { if (is_stopping()) { - LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); } incr_pending_request_num(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 5174e5edd..35f089ec5 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -39,6 +39,15 @@ class SoloReplDev : public ReplDev { SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~SoloReplDev() = default; + virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override; + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override; + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override; + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index dbfd304cf..4310d81de 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -62,22 +62,15 @@ struct test_repl_req : public repl_req_ctx { sisl::byte_array header; sisl::byte_array 
key; sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - MultiBlkId written_blkids; + std::vector< MultiBlkId > written_blkids; - test_repl_req() { - write_sgs.size = 0; - read_sgs.size = 0; - } + test_repl_req() { write_sgs.size = 0; } ~test_repl_req() { for (auto const& iov : write_sgs.iovs) { iomanager.iobuf_free(uintptr_cast(iov.iov_base)); } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } } + struct journal_header { uint32_t key_size; uint64_t key_pattern; @@ -99,12 +92,11 @@ class SoloReplDevTest : public testing::Test { void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received on_commit lsn={}", lsn); - HS_REL_ASSERT(!blkids.empty(), "Invalid blkids size"); if (ctx == nullptr) { - m_test.validate_replay(*repl_dev(), lsn, header, key, blkids[0]); + m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); } else { auto req = boost::static_pointer_cast< test_repl_req >(ctx); - req->written_blkids = blkids[0]; + req->written_blkids = std::move(blkids); m_test.on_write_complete(*repl_dev(), req); } } @@ -230,60 +222,116 @@ class SoloReplDevTest : public testing::Test { rdev->async_alloc_write(*req->header, req->key ? *req->key : sisl::blob{}, req->write_sgs, req); } + void async_write_data_and_journal(uint32_t key_size, uint64_t data_size, uint32_t max_size_per_iov) { + data_size = data_size == 0 ? 
g_block_size : data_size; + auto req = intrusive< test_repl_req >(new test_repl_req()); + req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header)); + auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); + hdr->key_size = key_size; + hdr->key_pattern = ((long long)rand() << 32) | rand(); + hdr->data_size = data_size; + hdr->data_pattern = ((long long)rand() << 32) | rand(); + + if (key_size != 0) { + req->key = sisl::make_byte_array(key_size); + HSTestHelper::fill_data_buf(req->key->bytes(), key_size, hdr->key_pattern); + } + + req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern); + + auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; + + auto const cap = hs()->repl_service().get_cap_stats(); + LOGDEBUG("Before write, cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); + + std::vector< MultiBlkId > blkids; + blk_alloc_hints hints; + auto err = rdev->alloc_blks(data_size, hints, blkids); + RELEASE_ASSERT(!err, "Error during alloc_blks"); + RELEASE_ASSERT(!blkids.empty(), "Empty blkids"); + + rdev->async_write(blkids, req->write_sgs).thenValue([this, rdev, blkids, data_size, req](auto&& err) { + RELEASE_ASSERT(!err, "Error during async_write"); + rdev->async_write_journal(blkids, *req->header, req->key ? 
*req->key : sisl::blob{}, data_size, req); + }); + } + void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key, - MultiBlkId const& blkids) { + std::vector< MultiBlkId > const& blkids) { + if (blkids.empty()) { + m_task_waiter.one_complete(); + return; + } + auto const jhdr = r_cast< test_repl_req::journal_header const* >(header.cbytes()); HSTestHelper::validate_data_buf(key.cbytes(), key.size(), jhdr->key_pattern); - - uint32_t size = blkids.blk_count() * g_block_size; - if (size) { - auto read_sgs = HSTestHelper::create_sgs(size, size); - LOGINFO("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, - blkids.to_string()); - rdev.async_read(blkids, read_sgs, size) - .thenValue([this, hdr = *jhdr, read_sgs, lsn, blkids, &rdev](auto&& err) { - RELEASE_ASSERT(!err, "Error during async_read"); - HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size, "journal hdr data size mismatch with actual size"); - - for (auto const& iov : read_sgs.iovs) { - HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern); - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - LOGINFO("[{}] Replay of lsn={} blkid={} validated successfully", - boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); - m_task_waiter.one_complete(); - }); - } else { - m_task_waiter.one_complete(); + uint64_t total_io = blkids.size(); + auto io_count = std::make_shared< std::atomic< uint64_t > >(0); + for (const auto& blkid : blkids) { + uint32_t size = blkid.blk_count() * g_block_size; + if (size) { + auto read_sgs = HSTestHelper::create_sgs(size, size); + LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, + blkid.to_string()); + rdev.async_read(blkid, read_sgs, size) + .thenValue([this, io_count, total_io, hdr = *jhdr, read_sgs, lsn, blkid, &rdev](auto&& err) { + RELEASE_ASSERT(!err, "Error during async_read"); + // 
HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size, + // "journal hdr data size mismatch with actual size"); + + for (auto const& iov : read_sgs.iovs) { + HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully", + boost::uuids::to_string(rdev.group_id()), lsn, blkid.to_string()); + + io_count->fetch_add(1); + if (*io_count == total_io) { m_task_waiter.one_complete(); } + }); + } else { + m_task_waiter.one_complete(); + } } } void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) { - // If we did send some data to the repl_dev, validate it by doing async_read - if (req->write_sgs.size != 0) { - req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size); - - auto const cap = hs()->repl_service().get_cap_stats(); - LOGINFO("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); - - rdev.async_read(req->written_blkids, req->read_sgs, req->read_sgs.size) - .thenValue([this, &rdev, req](auto&& err) { - RELEASE_ASSERT(!err, "Error during async_read"); - - LOGINFO("[{}] Write complete with lsn={} for size={} blkids={}", - boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size, - req->written_blkids.to_string()); - auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); - HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size, - "journal hdr data size mismatch with actual size"); - - for (auto const& iov : req->read_sgs.iovs) { - HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern); - } - m_io_runner.next_task(); - }); - } else { + if (req->written_blkids.empty()) { m_io_runner.next_task(); + return; + } + + // If we did send some data to the repl_dev, validate it by doing async_read + auto io_count = std::make_shared< std::atomic< uint64_t > >(0); + for (const auto blkid : 
req->written_blkids) { + if (req->write_sgs.size != 0) { + auto const cap = hs()->repl_service().get_cap_stats(); + LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); + + auto sgs_size = blkid.blk_count() * g_block_size; + auto read_sgs = HSTestHelper::create_sgs(sgs_size, sgs_size); + rdev.async_read(blkid, read_sgs, read_sgs.size) + .thenValue([this, io_count, blkid, &rdev, sgs_size, read_sgs, req](auto&& err) { + RELEASE_ASSERT(!err, "Error during async_read"); + + LOGINFO("[{}] Write complete with lsn={} for size={} blkid={}", + boost::uuids::to_string(rdev.group_id()), req->lsn(), sgs_size, blkid.to_string()); + auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); + // HS_REL_ASSERT_EQ(hdr->data_size, read_sgs.size, + // "journal hdr data size mismatch with actual size"); + + for (auto const& iov : read_sgs.iovs) { + LOGDEBUG("Read data blkid={} len={} data={}", blkid.to_integer(), iov.iov_len, + *(uint64_t*)iov.iov_base); + HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + io_count->fetch_add(1); + if (*io_count == req->written_blkids.size()) { m_io_runner.next_task(); } + }); + } else { + m_io_runner.next_task(); + } } } }; @@ -318,6 +366,19 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) { this->m_task_waiter.start([this]() { this->restart(); }).get(); } +TEST_F(SoloReplDevTest, TestAsyncWriteJournal) { + LOGINFO("Step 1: run on worker threads to schedule write for random bytes ranging {}-{}.", 0, 1 * Mi); + this->m_io_runner.set_task([this]() { + uint32_t nblks = rand() % ((1 * Mi) / g_block_size); + uint32_t key_size = rand() % 512 + 8; + this->async_write_data_and_journal(key_size, nblks * g_block_size, g_block_size); + }); + + this->m_io_runner.execute().get(); + LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size); + this->m_task_waiter.start([this]() { 
this->restart(); }).get(); +} + SISL_OPTION_GROUP(test_solo_repl_dev, (block_size, "", "block_size", "block size to io", ::cxxopts::value< uint32_t >()->default_value("4096"), "number")); From 9428bb647261a4651afe007f77108f4503534340 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Thu, 1 May 2025 11:12:20 -0700 Subject: [PATCH 111/130] Revert btree config file (#711) --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 3699ed10c..6df4e92e8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.5" + version = "6.13.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From 0a40669364c95fe2fe60288709a05a2d74d0f888 Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Mon, 5 May 2025 19:04:13 -0700 Subject: [PATCH 112/130] Issue 713: Fix index table destroy race with wb_cache cp flush (#714) --- conanfile.py | 2 +- src/lib/index/inplace_btree/wb_cache.cpp | 61 ++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6df4e92e8..aa0a5563d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.6" + version = "6.13.8" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 793c2e8ef..092dc9e5b 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -43,7 +43,7 @@ IndexWBCacheBase& wb_cache() { IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size) : 
m_vdev{vdev}, - m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size, + m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size, [](const BtreeNodePtr& node) -> BlkId { return static_cast< IndexBtreeNode* >(node.get())->m_idx_buf->m_blkid; }, @@ -590,6 +590,33 @@ void IndexWBCache::recover(sisl::byte_view sb) { if (buf->m_created_cp_id == icp_ctx->id()) { // New nodes need to be commited first m_vdev->commit_blk(buf->m_blkid); +<<<<<<< HEAD:src/lib/index/inplace_btree/wb_cache.cpp +======= + // it can happen when children moved to one of right parent sibling and then the previous node is + // deleted but not commited during crash (upbuffer is not committed). but its children already + // committed. and freed (or changed) + if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); } + } else { + LOGINFO("deleting and creating new buf {}", buf->to_string()); + deleted_bufs.push_back(buf); + } + // 1- upbuffer was dirtied by the same cp, so it is not commited, so we don't need to repair it. + // remove it from down_waiting list (probably recursively going up) 2- upbuffer was created and + // freed at the same cp, so it is not commited, so we don't need to repair it. + if (buf->m_up_buffer) { + LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(), + buf->m_up_buffer->to_string()); + buf->m_up_buffer->remove_down_buffer(buf); + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + LOGINFOMOD(wbcache, + "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}", + buf->m_up_buffer ? 
buf->m_up_buffer->to_string() : std::string("nullptr"), + buf->to_string()); + update_up_buffer_counters(buf->m_up_buffer /*,visited_bufs*/); + } + buf->m_up_buffer = nullptr; +>>>>>>> f30f0d44 (Issue 713: Fix index table destroy race with wb_cache cp flush (#714)):src/lib/index/wb_cache.cpp } pending_bufs.push_back(buf); buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency @@ -608,6 +635,10 @@ void IndexWBCache::recover(sisl::byte_view sb) { // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers update_up_buffer_counters(buf->m_up_buffer); } +<<<<<<< HEAD:src/lib/index/inplace_btree/wb_cache.cpp +======= + // buf->m_up_buffer = nullptr; +>>>>>>> f30f0d44 (Issue 713: Fix index table destroy race with wb_cache cp flush (#714)):src/lib/index/wb_cache.cpp } } } @@ -618,12 +649,33 @@ void IndexWBCache::recover(sisl::byte_view sb) { LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); #endif +<<<<<<< HEAD:src/lib/index/inplace_btree/wb_cache.cpp for (auto const& buf : pending_bufs) { recover_buf(buf); if (buf->m_bytes != nullptr && r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) { // This buffer was marked as deleted during repair, so we also need to free it deleted_bufs.push_back(buf); +======= + uint32_t cnt = 0; + LOGTRACEMOD(wbcache, "Potential parent recovered bufs (#of bufs = {})", potential_parent_recovered_bufs.size()); + for (auto const& buf : potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - check stale recovered buf {}", cnt++, buf->to_string()); + } + // This step is needed since there is a case where all(or some) children of an interior node is freed (after moving + // to a previous sibling parent) and after crash, this node has stale links to its children + cnt = 0; + std::vector< IndexBufferPtr > buffers_to_repair; + for (auto const& buf : 
potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - potential parent recovered buf {}", cnt, buf->to_string()); + parent_recover(buf); + if (buf->m_bytes == nullptr || r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) { + // This buffer was marked as deleted during repair, so we also need to free it + deleted_bufs.push_back(buf); + } else { + // This buffer was not marked as deleted during repair, so we need to repair it + buffers_to_repair.push_back(buf); +>>>>>>> f30f0d44 (Issue 713: Fix index table destroy race with wb_cache cp flush (#714)):src/lib/index/wb_cache.cpp } } @@ -773,8 +825,11 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const if (buf->is_meta_buf()) { LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); - auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; - if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } + auto const sb_buf = r_cast< MetaIndexBuffer* >(buf.get()); + if (sb_buf->m_valid) { + auto const& sb = sb_buf->m_sb; + if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } + } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), From cee9bac28267b22056441639974cc8b7b90142b3 Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Thu, 8 May 2025 10:00:23 -0700 Subject: [PATCH 113/130] =?UTF-8?q?Issue=20716:=20Fix=20log=20periodic=20c?= =?UTF-8?q?ancelt=5Fimer=20issue=20and=20solo=20repl=20dev=20init/destroy?= =?UTF-8?q?=20ra=E2=80=A6=20(#715)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix log periodic cancelt_imer issue and solo repl dev init/destroy race issue --- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 7 ++++--- 
src/lib/replication/repl_dev/solo_repl_dev.cpp | 18 +++++++++++++++++- src/lib/replication/repl_dev/solo_repl_dev.h | 5 ++++- .../replication/service/generic_repl_svc.cpp | 17 ++++++++++------- 5 files changed, 36 insertions(+), 13 deletions(-) diff --git a/conanfile.py b/conanfile.py index aa0a5563d..880a2bcc9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.8" + version = "6.13.9" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index cf4309e00..49d040c41 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -188,9 +188,10 @@ void LogDev::start_timer() { void LogDev::stop_timer() { if (m_flush_timer_hdl != iomgr::null_timer_handle) { - // cancel the timer - iomanager.run_on_wait(logstore_service().flush_thread(), - [this]() { iomanager.cancel_timer(m_flush_timer_hdl, true); }); + iomanager.run_on_forget(logstore_service().flush_thread(), [this]() { + iomanager.cancel_timer(m_flush_timer_hdl, true); + m_flush_timer_hdl = iomgr::null_timer_handle; + }); } } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 131c09abb..7ec093062 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -1,6 +1,7 @@ #include #include "replication/repl_dev/solo_repl_dev.h" #include "replication/repl_dev/common.h" +#include #include #include #include @@ -10,6 +11,7 @@ namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { + m_logdev_id = m_rd_sb->logdev_id; logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER); logstore_service() .open_log_store(m_rd_sb->logdev_id, m_rd_sb->logstore_id, true /* 
append_mode */) @@ -17,6 +19,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi m_data_journal = std::move(log_store); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); + m_is_recovered = true; }); } else { m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); @@ -24,6 +27,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; m_rd_sb.write(); + m_is_recovered = true; } } @@ -46,6 +50,17 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } else { write_journal(std::move(rreq)); } } +// destroy is only called in worker thread; +void SoloReplDev::destroy() { + HS_REL_ASSERT(iomanager.am_i_worker_reactor(), "Destroy should be called in worker thread"); + while (!m_is_recovered) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + hs()->logstore_service().remove_log_store(m_logdev_id, m_data_journal->get_store_id()); + hs()->logstore_service().destroy_log_dev(m_logdev_id); +} + void SoloReplDev::write_journal(repl_req_ptr_t rreq) { rreq->create_journal_entry(false /* raft_buf */, 1); @@ -209,6 +224,7 @@ void SoloReplDev::cp_flush(CP*) { m_rd_sb.write(); } -void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ } +void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ +} } // namespace homestore diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 35f089ec5..a690c4bc0 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -30,10 +30,11 @@ class CP; class SoloReplDev : public ReplDev { private: logdev_id_t m_logdev_id; - std::shared_ptr< HomeLogStore > m_data_journal; + 
std::shared_ptr< HomeLogStore > m_data_journal{nullptr}; superblk< repl_dev_superblk > m_rd_sb; uuid_t m_group_id; std::atomic< logstore_seq_num_t > m_commit_upto{-1}; + std::atomic< bool > m_is_recovered{false}; public: SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); @@ -95,6 +96,8 @@ class SoloReplDev : public ReplDev { void cp_flush(CP* cp); void cp_cleanup(CP* cp); + void destroy(); + private: void write_journal(repl_req_ptr_t rreq); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 082ef746b..05945d3a7 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -119,8 +119,8 @@ void SoloReplService::stop() { hs()->data_service().stop(); } -AsyncReplResult< shared< ReplDev > > -SoloReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { +AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) { superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.create(); rd_sb->group_id = group_id; @@ -156,17 +156,20 @@ folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_ // 1. Firstly stop the repl dev which waits for any outstanding requests to finish rdev_ptr->stop(); - // 2. detaches both ways: + // 2. Destroy the repl dev which will remove the logstore and free the memory; + dp_cast< SoloReplDev >(rdev_ptr)->destroy(); + + // 3. detaches both ways: // detach rdev from its listener and listener from rdev; rdev_ptr->detach_listener(); { - // 3. remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to + // 4. 
remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to // this instance; std::unique_lock lg(m_rd_map_mtx); m_rd_map.erase(group_id); } - // 4. now destroy the upper layer's listener instance; + // 5. now destroy the upper layer's listener instance; m_repl_app->destroy_repl_dev_listener(group_id); return folly::makeSemiFuture(ReplServiceError::OK); @@ -204,14 +207,14 @@ std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_ folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } }); return folly::makeFuture< bool >(true); } void SoloReplServiceCPHandler::cp_cleanup(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } }); } From a302aa6b525843df1ab630702245c3a01aa748e3 Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Mon, 12 May 2025 14:34:36 -0700 Subject: [PATCH 114/130] Issue 717: expose data service drive type (#718) --- conanfile.py | 2 +- src/include/homestore/blkdata_service.hpp | 34 +++++++++++++++++++ src/lib/blkdata_svc/blkdata_service.cpp | 3 ++ src/lib/device/virtual_dev.hpp | 1 + src/lib/logstore/log_dev.cpp | 3 +- src/lib/logstore/log_store_service.cpp | 9 +++++ .../replication/repl_dev/solo_repl_dev.cpp | 2 ++ 7 files changed, 52 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 880a2bcc9..55f6ddb24 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.9" + version = "6.13.10" homepage = 
"https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index 786375d4f..4f77af760 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -220,10 +220,44 @@ class BlkDataService { */ void start(); + /** + * @brief Gets the total capacity of the block data service. + * + * This function returns the total capacity of the block data service, in bytes. + * + * @return The total capacity of the block data service, in bytes. + */ uint64_t get_total_capacity() const; + /** + * @brief Gets the used capacity of the block data service. + * + * This function returns the used capacity of the block data service, in bytes. + * + * @return The used capacity of the block data service, in bytes. + */ uint64_t get_used_capacity() const; + /** + * @brief Gets the drive type of the data service. + * + * Data Service doesn't support mixed drive types. + * + * @return The drive type of the data service, HDD or NVME. + */ + HSDevType get_dev_type() const; + + /** + * @brief Gets the drive type of the data service. + * + * Data Service doesn't support mixed drive types. + * + * @return The drive type of the data service, HDD or NVME. + */ + HSDevType get_dev_type() const; + + void stop(); + private: /** * @brief Initializes the block data service. 
diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 1decfb2a2..f56f36ed5 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -34,6 +34,7 @@ BlkDataService::BlkDataService(shared< ChunkSelector > chunk_selector) : m_custom_chunk_selector{std::move(chunk_selector)} { m_blk_read_tracker = std::make_unique< BlkReadTracker >(); } + BlkDataService::~BlkDataService() = default; // first-time boot path @@ -279,6 +280,8 @@ uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); } +HSDevType BlkDataService::get_dev_type() const { return static_cast< HSDevType >(m_vdev->get_dev_type()); } + uint32_t BlkDataService::get_align_size() const { return m_vdev->align_size(); } } // namespace homestore diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index 36032954e..eb6b63192 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -292,6 +292,7 @@ class VirtualDev { virtual nlohmann::json get_status(int log_level) const; virtual uint64_t get_total_chunk_num() const { return m_total_chunk_num; } + uint8_t get_dev_type() const { return m_vdev_info.hs_dev_type; } uint32_t align_size() const; uint32_t optimal_page_size() const; uint32_t atomic_page_size() const; diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 49d040c41..93cd8456f 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -68,7 +68,8 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { m_logdev_meta.create(m_logdev_id, m_flush_mode); m_vdev_jd->update_data_start_offset(0); } else { - HS_LOG_ASSERT(!m_logdev_meta.is_empty(), "Expected meta data to be read already before loading"); + HS_LOG_ASSERT(!m_logdev_meta.is_empty(), + "Expected meta data to be read already before loading this log dev 
id: {}", m_logdev_id); auto const store_list = m_logdev_meta.load(); // Notify to the caller that a new log store was reserved earlier and it is being loaded, with its meta info diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index fd1f8df6a..1392a27b7 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -148,6 +148,9 @@ logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { } void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { + if (is_stopping()) return; + HS_LOG(INFO, logstore, "Destroying logdev {}", logdev_id); + incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { @@ -186,6 +189,7 @@ std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_ const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it == m_id_logdev_map.end()), "logdev id {} already exists", logdev_id); m_id_logdev_map.insert(std::make_pair<>(logdev_id, logdev)); + LOGINFO("Created logdev {}", logdev_id); return logdev; } @@ -285,6 +289,9 @@ folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_i } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { + if (is_stopping()) return; + HS_LOG(INFO, logstore, "Removing logstore {} from logdev {}", store_id, logdev_id); + incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); @@ -293,6 +300,8 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor return; } it->second->remove_log_store(store_id); + HS_LOG(INFO, logstore, "Successfully removed logstore {} from logdev {}", store_id, logdev_id); + decr_pending_request_num(); 
COUNTER_DECREMENT(m_metrics, logstores_count, 1); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 7ec093062..dbf56a3c2 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -59,6 +59,8 @@ void SoloReplDev::destroy() { hs()->logstore_service().remove_log_store(m_logdev_id, m_data_journal->get_store_id()); hs()->logstore_service().destroy_log_dev(m_logdev_id); + + m_rd_sb.destroy(); } void SoloReplDev::write_journal(repl_req_ptr_t rreq) { From f46994b5e99ae44e89c2eeea6b9045f7cfa12373 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 13 May 2025 13:25:26 -0700 Subject: [PATCH 115/130] Fix occupied_size for prefix (#719) --- conanfile.py | 2 +- src/include/homestore/btree/node_variant/prefix_node.hpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 55f6ddb24..1cbc478b6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.10" + version = "6.13.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/node_variant/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp index 2892aec63..ab2f2eebe 100644 --- a/src/include/homestore/btree/node_variant/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -346,6 +346,11 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + uint32_t occupied_size() const override { + return (this->node_data_size() - sizeof(prefix_node_header) - (prefix_bitset_.size() / 8) - + this->available_size()); + } + bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { return has_room(1u); } uint32_t get_nth_key_size(uint32_t) const override { return dummy_key< K >.serialized_size(); 
} From b8b1a6d2cf1574de434e50d94540c1b42125c4c7 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Wed, 7 May 2025 16:05:34 -0700 Subject: [PATCH 116/130] add long running test with put and remove --- conanfile.py | 2 +- src/tests/test_index_crash_recovery.cpp | 406 ++++++++++++++++-------- src/tests/test_scripts/index_test.py | 35 +- 3 files changed, 300 insertions(+), 143 deletions(-) diff --git a/conanfile.py b/conanfile.py index 1cbc478b6..dc22220d2 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.11" + version = "6.13.12" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 3e75854e6..719698be8 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -109,6 +109,17 @@ class SequenceGenerator { OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } + if(putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("All keys are in use, skipping operation generation. end_range_ {} start_range_ {} in_use_key_cnt_ {}, numOperations {}", + end_range_, start_range_, in_use_key_cnt_.load(), numOperations); + return operations; + } + if(removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("Not enough keys are in use, skipping operation generation. 
in_use_key_cnt_ {} numOperations {}", + in_use_key_cnt_.load(), numOperations); + return operations; + } + while (operations.size() < numOperations) { uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); @@ -119,9 +130,11 @@ class SequenceGenerator { if (operation == OperationType::Put && !inUse) { operations.emplace_back(key, OperationType::Put); inUse = true; + in_use_key_cnt_.fetch_add(1); } else if (operation == OperationType::Remove && inUse) { operations.emplace_back(key, OperationType::Remove); inUse = false; + in_use_key_cnt_.fetch_sub(1); } } @@ -222,6 +235,7 @@ class SequenceGenerator { std::uniform_int_distribution<> keyDist_; std::discrete_distribution<> opTypeDist_; std::map< uint64_t, bool > keyStates; + std::atomic< uint64_t > in_use_key_cnt_{0}; void updateOperationTypeDistribution() { opTypeDist_ = @@ -230,6 +244,19 @@ class SequenceGenerator { }; #ifdef _PRERELEASE + +struct long_running_crash_options { + uint32_t put_freq; + std::vector< std::string > put_flips{}; + std::vector< std::string > remove_flips{}; + uint32_t num_entries{SISL_OPTIONS["num_entries"].as< uint32_t >()}; + uint32_t preload_size{SISL_OPTIONS["preload_size"].as< uint32_t >()}; + uint32_t rounds{SISL_OPTIONS["num_rounds"].as< uint32_t >()}; + uint32_t num_entries_per_rounds{SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >()}; + bool load_mode{SISL_OPTIONS.count("load_from_file") > 0}; + bool save_mode{SISL_OPTIONS.count("save_to_file") > 0}; +}; + template < typename TestType > struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { using T = TestType; @@ -440,8 +467,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Sanity check passed for {} keys!", count); } - void crash_and_recover(OperationList& operations, std::string filename = "") { - // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + void 
crash_and_recover_common(OperationList& operations, std::string filename = "") { + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", this->m_shadow_map.size(), tree_key_count(), operations.size()); @@ -481,8 +508,218 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT << "shadow map size and tree size mismatch"; } + void crash_and_recover(std::string& flip, OperationList& operations, std::string filename = "") { + this->remove_flip(flip); + this->crash_and_recover_common(operations, filename); + } + + void crash_and_recover(std::vector< std::string >& flips, OperationList& operations, std::string filename = "") { + for (auto const& flip : flips) { + this->remove_flip(flip); + } + this->crash_and_recover_common(operations, filename); + } + uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); } + void long_running_crash(long_running_crash_options const& crash_test_options) { + // set putFreq 100 for the initial load + SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, crash_test_options.num_entries - 1 /*end_range*/); + + std::vector< std::string > flips; + OperationList operations; + auto m_start_time = Clock::now(); + auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; + double elapsed_time, progress_percent, last_progress_time = 0; + bool renew_btree_after_crash = false; + auto cur_put_flip_idx = 0; + auto cur_remove_flip_idx = 0; + std::uniform_int_distribution<> dis(1, 100); + int flip_percentage = 90; // Set the desired percentage here + bool normal_execution = true; + bool clean_shutdown = true; + // if it is safe then delete all previous save files + if (crash_test_options.save_mode) { + std::filesystem::remove_all("/tmp/operations_*.txt"); + 
std::filesystem::remove_all("/tmp/flips_history.txt"); + } + // init tree + LOGINFO("Step 0: Fill up the tree with {} entries", crash_test_options.preload_size); + if (crash_test_options.load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); + } else { + operations = generator.generateOperations(crash_test_options.preload_size, true /* reset */); + if (crash_test_options.save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } + } + + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + uint32_t num_keys{0}; + + for (auto [k, _] : operations) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + num_keys++; + } + + generator.setPutFrequency(crash_test_options.put_freq); + generator.setRemoveFrequency(100 - crash_test_options.put_freq); + + // Trigger the cp to make sure middle part is successful + LOGINFO("Step 0-1: Flush all the entries so far"); + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + this->m_shadow_map.save(this->m_shadow_filename); + // this->print_keys("reapply: after preload"); + this->visualize_keys("tree_after_preload.dot"); + + for (uint32_t round = 1; + round <= crash_test_options.rounds && !time_to_stop(); round++) { + LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); + bool print_time = false; + elapsed_time = get_elapsed_time_sec(m_start_time); + + if (crash_test_options.load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, renew_btree_after_crash /* reset */); + if (crash_test_options.save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + if(operations.empty()) { + LOGDEBUG("No operations generated, skipping round {}", round); + 
continue; + } + + flips.clear(); + if (crash_test_options.load_mode) { + std::ifstream file("/tmp/flips_history.txt"); + std::string line; + bool found = false; + for (uint32_t i = 0; i < round && std::getline(file, line); i++) { + if (i == round - 1) { + found = true; + break; + } + } + if (found && !line.empty()) { + if (line == "normal") { + normal_execution = true; + } else { + normal_execution = false; + std::istringstream iss(line); + std::string flip; + while (iss >> flip) { + flips.emplace_back(flip); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for(auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + } + } + file.close(); + } else { + if (dis(g_re) <= flip_percentage) { + if(!crash_test_options.put_flips.empty()) { + flips.emplace_back(crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); + } + if(!crash_test_options.remove_flips.empty()) { + flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % crash_test_options.remove_flips.size()]); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for(auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + normal_execution = false; + } else { + normal_execution = true; + LOGINFO("Step 1-{}: No flip set", round); + } + if (crash_test_options.save_mode) { + // save the filp name to a file for later use + std::ofstream file("/tmp/flips_history.txt", std::ios::app); + if (file.is_open()) { + std::string out_line{"normal"}; + if (!normal_execution) { + out_line = flips[0]; + for (size_t i = 1; i < flips.size(); i++) { + out_line += " " + flips[i]; + } + } + file << out_line << "\n"; + } + file.close(); + } + } + + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + + for (auto [k, op] : operations) { + if (op == OperationType::Remove) { + 
if(num_keys < 1) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Removing key {}", k); + this->remove_one(k, true /* expect_success */); + num_keys--; + } else { + if (num_keys >= crash_test_options.num_entries) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Inserting key {}", k); + this->put(k, btree_put_type::INSERT, true /* expect_success */); + num_keys++; + } + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + // remove the flips so that they do not get triggered erroneously + this->crash_and_recover(flips, operations, fmt::format("long_tree_{}", round)); + } + if (elapsed_time - last_progress_time > 30) { + last_progress_time = elapsed_time; + print_time = true; + } + if (print_time) { + LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, elapsed_time, this->m_run_time, + elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), crash_test_options.num_entries, + this->tree_key_count() * 100.0 / crash_test_options.num_entries); + } + // this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { this->reset_btree(); }; + } + this->destroy_btree(); + log_obj_life_counter(); + } + protected: const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt"; }; @@ -579,152 +816,39 @@ 
TYPED_TEST(IndexCrashTest, SplitCrash1) { // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); this->put(k, btree_put_type::INSERT, true /* expect_success */); } - this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + this->crash_and_recover(flips[i], operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); if (renew_btree_after_crash) { this->reset_btree(); }; } } TYPED_TEST(IndexCrashTest, long_running_put_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 100, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + }; + this->long_running_crash(crash_test_options); +} - // Define the lambda function - auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - auto const preload_size = SISL_OPTIONS["preload_size"].as< uint32_t >(); - auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >(); - auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >(); - bool load_mode = SISL_OPTIONS.count("load_from_file"); - bool save_mode = SISL_OPTIONS.count("save_to_file"); - SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); - vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", - "crash_flush_on_split_at_right_child"}; - - std::string flip = ""; - OperationList operations; - auto m_start_time = Clock::now(); - auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; - double elapsed_time, progress_percent, last_progress_time = 0; - bool renew_btree_after_crash = false; - auto cur_flip_idx = 0; - std::uniform_int_distribution<> dis(1, 100); - int flip_percentage = 90; // Set the desired percentage here - bool normal_execution = true; - bool clean_shutdown = true; - // if it is safe then delete all previous save 
files - if (save_mode) { - std::filesystem::remove_all("/tmp/operations_*.txt"); - std::filesystem::remove_all("/tmp/flips_history.txt"); - } - // init tree - LOGINFO("Step 0: Fill up the tree with {} entries", preload_size); - if (load_mode) { - operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); - } else { - operations = generator.generateOperations(preload_size, true /* reset */); - if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } - } - auto opstr = SequenceGenerator::printOperations(operations); - LOGINFO("Lets before crash print operations\n{}", opstr); - - for (auto [k, _] : operations) { - this->put(k, btree_put_type::INSERT, true /* expect_success */); - } +TYPED_TEST(IndexCrashTest, long_running_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 0, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + .preload_size = SISL_OPTIONS["num_entries"].as< uint32_t >(), + }; + this->long_running_crash(crash_test_options); +} - // Trigger the cp to make sure middle part is successful - LOGINFO("Step 0-1: Flush all the entries so far"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - this->m_shadow_map.save(this->m_shadow_filename); - // this->print_keys("reapply: after preload"); - this->visualize_keys("tree_after_preload.dot"); - - for (uint32_t round = 1; - round <= rounds && !time_to_stop() && this->tree_key_count() < num_entries - num_entries_per_rounds; round++) { - LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds); - bool print_time = false; - elapsed_time = get_elapsed_time_sec(m_start_time); - if (load_mode) { - std::ifstream file("/tmp/flips_history.txt"); - std::string line; - bool found = false; - for (uint32_t i = 0; i < round && std::getline(file, line); i++) { - if (i == round - 1) { - found = true; - break; - } - } - if (found 
&& !line.empty()) { - if (line == "normal") { - normal_execution = true; - } else { - normal_execution = false; - flip = line; - LOGINFO("Step 1-{}: Set flag {}", round, flip); - this->set_basic_flip(flip, 1, 100); - } - } - file.close(); - } else { - if (dis(g_re) <= flip_percentage) { - flip = flips[cur_flip_idx++ % flips.size()]; - LOGINFO("Step 1-{}: Set flag {}", round, flip); - this->set_basic_flip(flip, 1, 100); - normal_execution = false; - } else { - normal_execution = true; - LOGINFO("Step 1-{}: No flip set", round); - } - if (save_mode) { - // save the filp name to a file for later use - std::ofstream file("/tmp/flips_history.txt", std::ios::app); - if (file.is_open()) { file << (normal_execution ? "normal" : flip) << "\n"; } - file.close(); - } - } - if (load_mode) { - operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); - } else { - operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); - if (save_mode) { - SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); - } - } - LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); - for (auto [k, _] : operations) { - this->put(k, btree_put_type::INSERT, true /* expect_success */); - if (!time_to_stop()) { - static bool print_alert = false; - if (print_alert) { - LOGINFO("It is time to stop but let's finish this round and then stop!"); - print_alert = false; - } - } - } - if (normal_execution) { - if (clean_shutdown) { - this->m_shadow_map.save(this->m_shadow_filename); - this->restart_homestore(); - } else { - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - } - } else { - // remove the flips so that they do not get triggered erroneously - this->remove_flip(flip); - this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); - } - if (elapsed_time - last_progress_time > 30) { - last_progress_time = elapsed_time; 
- print_time = true; - } - if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " - "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", - round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time, - elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, - this->tree_key_count() * 100.0 / num_entries); - } - // this->print_keys(fmt::format("reapply: after round {}", round)); - if (renew_btree_after_crash) { this->reset_btree(); }; - } +TYPED_TEST(IndexCrashTest, long_running_put_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 50, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + }; + this->long_running_crash(crash_test_options); } // Basic reverse and forward order remove with different flip points diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 564bd61c5..9ea87432b 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -99,9 +99,33 @@ def long_running_crash_put(options): options['run_time'] = 14400 # 4 hours options['preload_size'] = 1024 print(f"options: {options}") - run_crash_test(options) + run_crash_test(options, 'put', 0) print("Long running crash put completed") +def long_running_crash_remove(options): + print("Long running crash remove started") + options['num_entries'] = 1000 + options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['num_entries_per_rounds'] = 100 + options['min_keys_in_node'] = 2 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'remove', 0) + print("Long running crash put completed") + +def 
long_running_crash_put_remove(options): + print("Long running crash put_remove started") + options['num_entries'] = 2000 # 1280K + options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['preload_size'] = 1024 + options['min_keys_in_node'] = 3 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'put_remove', 0) + print("Long running crash put_remove completed") + def main(): options = parse_arguments() @@ -120,6 +144,15 @@ def main(): def long_running(*args): options = parse_arguments() + for i in range(20): + print(f"Iteration {i + 1}") + long_running_crash_put_remove(options) + for i in range(50): + print(f"Iteration {i + 1}") + long_running_crash_remove(options) + for i in range(5): + print(f"Iteration {i + 1}") + long_running_crash_put(options) long_runnig_index(options) long_running_clean_shutdown(options) long_running_crash_put(options) From 2be54cdd4dcf0f3daf77101398c0c513403cf143 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Fri, 16 May 2025 11:15:59 -0700 Subject: [PATCH 117/130] Fix prefix - reload compactbitset after updating node phys buffer (#724) --- conanfile.py | 3 +-- src/include/homestore/btree/node_variant/prefix_node.hpp | 5 ++++- src/include/homestore/btree/node_variant/variant_node.hpp | 1 + src/tests/test_mem_btree.cpp | 4 ++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index dc22220d2..73684a334 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,8 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.12" - + version = "6.13.13" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" topics = ("ebay", "nublox") diff --git a/src/include/homestore/btree/node_variant/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp index ab2f2eebe..795a1a78e 100644 --- 
a/src/include/homestore/btree/node_variant/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -164,7 +164,10 @@ class FixedPrefixNode : public VariantNode< K, V > { } virtual ~FixedPrefixNode() = default; - + virtual void on_update_phys_buf() override { + // Update the prefix bitset with the new buffer + prefix_bitset_ = sisl::CompactBitSet{sisl::blob{bitset_area(), prefix_bitset_.size() / 8}, false}; + } ///////////////////////////// All overrides of BtreeIntervalNode /////////////////////////////////// /// @brief Upserts a batch of entries into a prefix node. /// diff --git a/src/include/homestore/btree/node_variant/variant_node.hpp b/src/include/homestore/btree/node_variant/variant_node.hpp index 332402b5a..77ae054bc 100644 --- a/src/include/homestore/btree/node_variant/variant_node.hpp +++ b/src/include/homestore/btree/node_variant/variant_node.hpp @@ -311,5 +311,6 @@ class VariantNode : public BtreeNode { } return ret; } + virtual void on_update_phys_buf() override {}; }; } // namespace homestore \ No newline at end of file diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index af50c12c2..5c6a15b59 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -46,8 +46,8 @@ SISL_OPTION_GROUP( ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), + ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) 
From 8c61a8a47b84ac8f15e6a519fa0e0e95dd91ca9c Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Sat, 17 May 2025 07:24:52 -0700 Subject: [PATCH 118/130] [Solo repl dev] Fix log dev flush timer cancel race (#723) * Wait on cancel_timer during stop logdev --- conanfile.py | 3 ++- src/lib/logstore/log_dev.cpp | 24 +++++++++++++++++------- src/lib/logstore/log_dev.hpp | 2 +- src/tests/test_meta_blk_mgr.cpp | 6 ++++-- src/tests/test_scripts/log_meta_test.py | 2 +- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/conanfile.py b/conanfile.py index 73684a334..33464ed56 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,8 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.13" + version = "6.13.14" + homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" topics = ("ebay", "nublox") diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 93cd8456f..1fa262dc7 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -163,13 +163,17 @@ void LogDev::stop() { } folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); - for (auto& [_, store] : m_id_logstore_map) + for (auto& [_, store] : m_id_logstore_map) { store.log_store->stop(); + } // after we call stop, we need to do any pending device truncations truncate(); m_id_logstore_map.clear(); - if (allow_timer_flush()) stop_timer(); + if (allow_timer_flush()) { + auto f = stop_timer(); + std::move(f).get(); + } } void LogDev::destroy() { @@ -187,13 +191,19 @@ void LogDev::start_timer() { }); } -void LogDev::stop_timer() { - if (m_flush_timer_hdl != iomgr::null_timer_handle) { - iomanager.run_on_forget(logstore_service().flush_thread(), [this]() { +folly::Future< int > LogDev::stop_timer() { + // return future to the caller; + // this future will be completed when the timer is stopped + auto p = std::make_shared< folly::Promise< int > >(); + auto f = 
p->getFuture(); + iomanager.run_on_forget(logstore_service().flush_thread(), [this, p]() mutable { + if (m_flush_timer_hdl != iomgr::null_timer_handle) { iomanager.cancel_timer(m_flush_timer_hdl, true); m_flush_timer_hdl = iomgr::null_timer_handle; - }); - } + } + p->setValue(0); + }); + return f; } void LogDev::do_load(off_t device_cursor) { diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 43428d07e..f3cc03f1d 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -727,7 +727,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { private: void start_timer(); - void stop_timer(); + folly::Future< int > stop_timer(); bool allow_inline_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::INLINE); } bool allow_timer_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::TIMER); } diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index 8d47cb24a..d3c5401e9 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -185,7 +185,7 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t total_size_written(const void* cookie) { return m_mbm->meta_size(cookie); } void do_write_to_full() { - static constexpr uint64_t blkstore_overhead = 4 * 1024ul * 1024ul; // 4MB + static constexpr uint64_t blkstore_overhead = 256 * 1024ul * 1024ul; // 256MB ssize_t free_size = uint64_cast(m_mbm->total_size() - m_mbm->used_size() - blkstore_overhead); HS_REL_ASSERT_GT(free_size, 0); @@ -193,8 +193,10 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t size_written{0}; while (free_size > 0) { + LOGDEBUG("free size: {}, total size: {}, used size: {}, available blks: {}", free_size, m_mbm->total_size(), + m_mbm->used_size(), m_mbm->available_blks()); // if it is overflow, 2 extra blocks are needed for ovf blk header and meta blk; - if (free_size - 2 * m_mbm->block_size() >= gp.max_wrt_sz) { + if (free_size - 2 * 
m_mbm->block_size() >= gp.max_wrt_sz) { size_written = do_sb_write(do_overflow(), 0); } else { size_written = do_sb_write(false, m_mbm->meta_blk_context_sz()); diff --git a/src/tests/test_scripts/log_meta_test.py b/src/tests/test_scripts/log_meta_test.py index 5ffda0018..83c8f994f 100755 --- a/src/tests/test_scripts/log_meta_test.py +++ b/src/tests/test_scripts/log_meta_test.py @@ -85,7 +85,7 @@ def meta_nightly(options, addln_opts): subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) - cmd_opts = "--gtest_filter=VMetaBlkMgrTest.random_load_test --gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; + cmd_opts = "--gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) From 417929e184cf113c7ddfe76d923be1b77113073f Mon Sep 17 00:00:00 2001 From: Sanal Date: Tue, 20 May 2025 11:11:28 -0700 Subject: [PATCH 119/130] Add submit_io_batch api in repl dev. (#725) Use submit_io_batch when part_of_batch is set to true for read/write. 
--- conanfile.py | 2 +- src/include/homestore/blkdata_service.hpp | 7 +++++++ src/lib/blkdata_svc/blkdata_service.cpp | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 33464ed56..2befa139d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.14" + version = "6.14.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index 4f77af760..1147b169f 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -151,6 +151,13 @@ class BlkDataService { folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false); + /** + * @brief Submit the io batch. This method must be called if reads/writes were issued with part_of_batch + * set to true; in those cases, without this call, IOs might not even be issued. No-op if previous io requests + * are not part of a batch. + * */ + void submit_io_batch(); + /** * @brief Commits the block with the given MultiBlkId. 
* diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index f56f36ed5..579930a63 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -203,6 +203,8 @@ BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > return collect_all_futures(s_futs); } +void BlkDataService::submit_io_batch() { m_vdev->submit_batch(); } + BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { if (is_stopping()) return BlkAllocStatus::FAILED; incr_pending_request_num(); From f254520b1477111434c1fc6d444da11f74f560e1 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 15 May 2025 11:38:30 +0800 Subject: [PATCH 120/130] Redesign replacemember API --- conanfile.py | 2 +- .../homestore/replication/repl_decls.h | 20 +- src/include/homestore/replication/repl_dev.h | 12 +- src/include/homestore/replication_service.hpp | 12 +- src/lib/common/homestore_config.fbs | 9 +- src/lib/common/homestore_utils.hpp | 4 + src/lib/replication/repl_dev/common.cpp | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 499 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 23 +- src/lib/replication/repl_dev/solo_repl_dev.h | 9 +- .../replication/service/generic_repl_svc.cpp | 17 +- .../replication/service/generic_repl_svc.h | 12 +- .../replication/service/raft_repl_service.cpp | 55 +- .../replication/service/raft_repl_service.h | 13 +- src/tests/test_common/raft_repl_test_base.hpp | 41 +- src/tests/test_raft_repl_dev_dynamic.cpp | 111 +++- src/tests/test_solo_repl_dev.cpp | 3 +- 17 files changed, 672 insertions(+), 172 deletions(-) diff --git a/conanfile.py b/conanfile.py index 2befa139d..213c86544 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.14.0" + version = "6.15.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" 
diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index bd18d4765..16c9f2a14 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -33,6 +33,12 @@ VENUM(ReplServiceError, int32_t, DATA_DUPLICATED = -20002, QUIENCE_STATE = -20003, FAILED = -32768); + +VENUM(PeerRole, uint8_t, + UNKNOWN = 0, + LEADER = 1, + FOLLOWER = 2, + LEARNER = 3); // clang-format on template < typename V, typename E > @@ -71,15 +77,15 @@ struct peer_info { // Peer ID. replica_id_t id_; // The last replication index that the peer has, from this server's point of view. - uint64_t replication_idx_; + uint64_t replication_idx_ = 0; // The elapsed time since the last successful response from this peer, set to 0 on leader - uint64_t last_succ_resp_us_; + uint64_t last_succ_resp_us_ = 0; // The priority for leader election - uint32_t priority_; - // The peer is learner or not - bool is_learner_; - // The peer is new joiner or not - bool is_new_joiner_; + uint32_t priority_ = 0; + // The peer role in replication group + PeerRole role_ = PeerRole::UNKNOWN; + // If this peer is myself + bool is_self_ = false; }; struct replica_member_info { diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index d6caf2711..ea7c156a9 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -46,7 +46,8 @@ VENUM(journal_type_t, uint16_t, HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev - HS_CTRL_REPLACE = 3, // Control message to replace a member + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member ) // magic num comes 
from the first 8 bytes of 'echo homestore_resync_data | md5sum' @@ -367,8 +368,13 @@ class ReplDevListener { /// after restart in case crash happened during the destroy. virtual void on_destroy(const group_id_t& group_id) = 0; - /// @brief Called when replace member is performed. - virtual void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) = 0; + /// @brief Called when start replace member. + virtual void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; + + /// @brief Called when complete replace member. + virtual void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 23ee2422c..56154226b 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,10 +41,16 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, - uint32_t commit_quorum = 0, uint64_t trace_id = 0) const = 0; + virtual AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + virtual AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t 
commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + + virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index df90c1342..61aba97dd 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -270,7 +270,8 @@ table Consensus { stale_log_gap_lo_threshold: int32 = 30; // Minimum log gap a replica has to be from leader before joining the replica set. - min_log_gap_to_join: int32 = 2147483647; + // 0 indicates the new member will join in cluster immediately. + min_log_gap_to_join: int32 = 0; // amount of time in millis to wait on data write before fetch data from remote; wait_data_write_timer_ms: uint64 = 1500 (hotswap); @@ -310,6 +311,12 @@ table Consensus { // then decay the target_priority and wait again until its priority >= target_priority. This setting helps us to set proper priority for peers. // 0 means all members have the same priority. 
max_wait_rounds_of_priority_election: uint32 = 2; + + // Maximum number of retries when raft is undergoing config changing + config_changing_error_retries: int32 = 3; + + // The time to wait for config change to be applied in ms + wait_for_config_change_ms: uint32 = 500; } table HomeStoreSettings { diff --git a/src/lib/common/homestore_utils.hpp b/src/lib/common/homestore_utils.hpp index 2ee51b03d..b6989ff48 100644 --- a/src/lib/common/homestore_utils.hpp +++ b/src/lib/common/homestore_utils.hpp @@ -53,4 +53,8 @@ class hs_utils { static bool topological_sort(std::unordered_map< std::string, std::vector< std::string > >& DAG, std::vector< std::string >& ordered_entries); }; + +static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, + uint32_t interval_ms = 100); + } // namespace homestore diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 2782a36a5..3b44600ca 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -266,7 +266,7 @@ std::string repl_req_ctx::to_string() const { } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_REPLACE) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_START_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index dd42dd4cd..08062b5df 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -15,6 +15,7 @@ #include "common/homestore_assert.hpp" #include "common/homestore_config.hpp" +#include "common/homestore_utils.hpp" #include "replication/service/raft_repl_service.h" #include 
"replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" @@ -136,16 +137,16 @@ bool RaftReplDev::join_group() { return true; } -AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) { +AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { if (is_stopping()) { - LOGINFO("repl dev is being shutdown! trace_id={}", trace_id); + RD_LOGI(trace_id, "repl dev is being shutdown!"); return make_async_error<>(ReplServiceError::STOPPING); } incr_pending_request_num(); - RD_LOGI(trace_id, "Replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), + RD_LOGI(trace_id, "Start replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { @@ -153,105 +154,375 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ reset_quorum_size(commit_quorum, trace_id); } - // Step 1: Check if leader itself is requested to move out. + // Step1, validate request + auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id)); + if (!out_srv_cfg) { + RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found"); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + // Check if leader itself is requested to move out. if (m_my_repl_id == member_out.id && m_my_repl_id == get_leader_id()) { - // If leader is the member requested to move out, then give up leadership and return error. - // Client will retry replace_member request to the new leader. 
+ // If leader is the member requested to move out, then set priority to 0(or it will be elected as leader again) + // and give up leadership and return error. Client will retry start_replace_member request to the new leader. + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out, member_out={}", + boost::uuids::to_string(member_out.id)); + if (out_srv_cfg->get_priority() != 0) { + auto ret = set_priority(member_out, 0, trace_id); + if (ret != ReplServiceError::OK) { + // Actually this is the expected path, because nuraft will BROADCAST error if we are trying to set + // leader's priority=0 + RD_LOGE(trace_id, "Step1. Replace member, set leader's priority to 0, failed {}", ret); + } + } raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); - RD_LOGI(trace_id, "Replace member leader is the member_out so yield leadership"); + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership"); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } - // Step 2. Add the new member. - return m_msg_mgr.add_member(m_group_id, member_in.id) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_in, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> { - // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout - // when adding member. Member is added to cluster config until member syncs fully - // with atleast stop gap. This will take a lot of time for block or - // object storage. - if (e.hasError()) { - // Ignore the server already exists as server already added to the cluster. - // The pg member change requests from control path are idemepotent and request - // can be resend and one of the add or remove can failed and has to retried. 
- if (e.error() == nuraft::cmd_result_code::CANCELLED || - e.error() == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { - RD_LOGI(trace_id, "Ignoring error returned from nuraft add_member {}", e.error()); - } else { - RD_LOGE(trace_id, "Replace member error in add member : {}", e.error()); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(RaftReplService::to_repl_error(e.error())); + // Step 2: Handle out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_set_learner_failure")) { + RD_LOGE(trace_id, "Simulating set member to learner failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step2. Replace member flip member to learner"); + auto learner_ret = do_flip_learner(member_out, true, true, trace_id); + if (learner_ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step2. Replace member set learner failed {}", learner_ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error(std::move(learner_ret)); + } + RD_LOGI(trace_id, "Step2. Replace member flip out member to learner and set priority to 0"); + + // Step 3. Append log entry to mark the old member is out and new member is added. + RD_LOGI(trace_id, "Step3. Replace member propose to raft for HS_CTRL_START_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + start_replace_members_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_START_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step3. 
Replace member propose to raft for HS_CTRL_START_REPLACE req failed {}", err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } + + // Step 4. Add the new member, new member will inherit the priority of the out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_add_member_failure")) { + RD_LOGE(trace_id, "Simulating add member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step4. Replace member propose to raft to add new member, group_id={}", group_id_str()); + auto ret = do_add_member(member_in, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step4. Proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id)); + + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_success<>(); +} + +AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + RD_LOGI(trace_id, "Complete replace member, member={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_out.id)); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + + // Step 5: Remove member + RD_LOGI(trace_id, "Step5. 
Replace member, remove old member, member={}", boost::uuids::to_string(member_out.id)); +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) { + RD_LOGE(trace_id, "Simulating remove member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + auto ret = do_remove_member(member_out, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, member={}, err={}", + boost::uuids::to_string(member_out.id), ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, member={}", + boost::uuids::to_string(member_out.id)); + auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); + // TODO Move wait logic to nuraft_mesg + if (!wait_and_check( + [&]() { + auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member_out.id)); + if (srv_conf) { + RD_LOGD(trace_id, "out member still exists in raft group, member={}", + boost::uuids::to_string(member_out.id)); + return false; } - } + return true; + }, + timeout)) { + RD_LOGD(trace_id, + "Step5. Replace member, wait for old member removed timed out, cancel the request, timeout: {}", + timeout); + // If the member_out is down, leader will force remove it after + // leave_timeout=leave_limit_(default=5)*heart_beat_interval_, it's better for client to retry it. + return make_async_error<>(ReplServiceError::CANCELLED); + } + RD_LOGD(trace_id, "Step5. Replace member, old member is removed, member={}", + boost::uuids::to_string(member_out.id)); + + // Step 2. Append log entry to complete replace member + RD_LOGI(trace_id, "Step6. 
Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + start_replace_members_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_COMPLETE_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed , err={}", + err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } - RD_LOGI(trace_id, "Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), - group_id_str()); - - // Step 3. Append log entry to mark the old member is out and new member is added. - auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - replace_members_ctx members; - members.replica_out = member_out; - members.replica_in = member_in; - - sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); - auto status = init_req_ctx(rreq, - repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = trace_id}, - journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); - - if (status != ReplServiceError::OK) { - // Failed to initialize the repl_req_ctx for replace member. 
- RD_LOGE(trace_id, "Failed to initialize repl_req_ctx for replace member, error={}", status); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(std::move(status)); - } + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + RD_LOGI(trace_id, "Complete replace member done, group_id={}, member_out={} member_in={}", group_id_str(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + return make_async_success<>(); +} - status = m_state_machine->propose_to_raft(std::move(rreq)); - if (status != ReplServiceError::OK) { - RD_LOGE(trace_id, "Replace member propose to raft failed {}", status); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(std::move(status)); - } +ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, uint64_t trace_id) { + if (m_my_repl_id != get_leader_id()) { + RD_LOGI(trace_id, "Member to add failed, not leader"); + return ReplServiceError::BAD_REQUEST; + } + auto ret = retry_when_config_change( + [&] { + auto rem_ret = m_msg_mgr.add_member(m_group_id, member.id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { + return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; + }); + return rem_ret.value(); + }, + trace_id); + if (ret == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { + RD_LOGW(trace_id, "Ignoring error returned from nuraft add_member, member={}, err={}", + boost::uuids::to_string(member.id), ret); + } else if (ret != nuraft::cmd_result_code::OK) { + // Its ok to retry this request as the request + // of replace member is idempotent. 
+ RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret); + return ReplServiceError::RETRY_REQUEST; + } + RD_LOGI(trace_id, "Proposed to raft to add member, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::OK; +} - RD_LOGI(trace_id, "Replace member proposed to raft group_id={}", group_id_str()); - - // Step 4. Remove the old member. Even if the old member is temporarily - // down and recovers, nuraft mesg see member remove from cluster log - // entry and call exit_group() and leave(). - return m_msg_mgr.rem_member(m_group_id, member_out.id) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> { - if (e.hasError()) { - // Ignore the server not found as server removed from the cluster - // as requests are idempotent and can be resend. - if (e.error() == nuraft::cmd_result_code::SERVER_NOT_FOUND) { - RD_LOGW(trace_id, "Remove member not found in group error, ignoring"); - } else { - // Its ok to retry this request as the request - // of replace member is idempotent. - RD_LOGE(trace_id, "Replace member failed to remove member : {}", e.error()); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(ReplServiceError::RETRY_REQUEST); - } - } else { - RD_LOGI(trace_id, "Replace member removed member={} from group_id={}", - boost::uuids::to_string(member_out.id), group_id_str()); - } +ReplServiceError RaftReplDev::do_remove_member(const replica_member_info& member, uint64_t trace_id) { + // The member should not be the leader. + if (m_my_repl_id == member.id && m_my_repl_id == get_leader_id()) { + // If leader is the member requested to move out, then give up leadership and return error. + // Client will retry start_replace_member request to the new leader. 
+ raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); + RD_LOGI(trace_id, "Member to remove is the leader so yield leadership"); + return ReplServiceError::NOT_LEADER; + } + auto ret = retry_when_config_change( + [&] { + auto rem_ret = m_msg_mgr.rem_member(m_group_id, member.id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { + return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; + }); + return rem_ret.value(); + }, + trace_id); + if (ret == nuraft::cmd_result_code::SERVER_NOT_FOUND) { + RD_LOGW(trace_id, "Remove member not found in group error, ignoring, member={}", + boost::uuids::to_string(member.id)); + } else if (ret != nuraft::cmd_result_code::OK) { + // Its ok to retry this request as the request + // of replace member is idempotent. + RD_LOGE(trace_id, "Replace member failed to remove member, member={}, err={}", + boost::uuids::to_string(member.id), ret); + return ReplServiceError::RETRY_REQUEST; + } + RD_LOGI(trace_id, "Proposed to raft to remove member, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::OK; +} - // Revert the quorum size back to 0. - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_success<>(); - }); - }); +AsyncReplResult<> RaftReplDev::flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) { + RD_LOGI(trace_id, "Flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id)); + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. 
+ reset_quorum_size(commit_quorum, trace_id); + } + auto ret = do_flip_learner(member, target, wait_and_verify, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Flip learner flag failed {}, member={}", ret, boost::uuids::to_string(member.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Learner flag has been set to {}, member={}", target, boost::uuids::to_string(member.id)); + return make_async_success<>(); +} + +ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify, + uint64_t trace_id) { + // 1. Prerequisite check + if (m_my_repl_id != get_leader_id()) { + RD_LOGI(trace_id, "flip learner flag failed, not leader"); + return ReplServiceError::NOT_LEADER; + } + if (!target && member.priority == 0) { + RD_LOGI(trace_id, "clear learner flag failed, priority is 0, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::BAD_REQUEST; + } + + // 2. 
Flip learner + RD_LOGI(trace_id, "flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id)); + auto srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member.id)); + if (!srv_cfg) { + RD_LOGE(trace_id, "invalid parameter, member is not found, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::SERVER_NOT_FOUND; + } + if (srv_cfg->is_learner() != target) { + auto ret = retry_when_config_change( + [&] { + auto learner_ret = raft_server()->flip_learner_flag(nuraft_mesg::to_server_id(member.id), target); + return learner_ret->get_result_code(); + }, + trace_id); + if (ret != nuraft::cmd_result_code::OK) { + RD_LOGE(trace_id, "Propose to raft to flip learner failed, err: {}", ret); + return ReplServiceError::RETRY_REQUEST; + } + } else { + RD_LOGD(trace_id, "learner flag has already been set to {}, skip, member={}", target, + boost::uuids::to_string(member.id)); + } + + // 3. Set priority + // Based on the current nuraft implementation, learner could be elected as leader, so we set priority to 0 to avoid + // it. And in turn, we need to revert prioiry change if the member is going to become a normal member. + auto priority = target ? 0 : member.priority; + RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id)); + if (srv_cfg->get_priority() != priority) { + auto priority_ret = set_priority(member, priority); + if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; } + } else { + RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority, + boost::uuids::to_string(member.id)); + } + + // 4. 
Verification + if (wait_and_verify) { + auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); + if (!wait_and_check( + [&]() { + auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member.id)); + return srv_conf->is_learner() && srv_conf->get_priority() == 0; + }, + timeout)) { + RD_LOGD(trace_id, "Wait for learner and priority config change timed out, cancel the request, timeout: {}", + timeout); + return ReplServiceError::CANCELLED; + } + } + + return ReplServiceError::OK; +} + +nuraft::cmd_result_code RaftReplDev::retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func, + uint64_t trace_id) { + auto ret = nuraft::cmd_result_code::OK; + int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries); + for (auto i = 0; i < retries; i++) { + ret = func(); + if (ret == nuraft::cmd_result_code::CONFIG_CHANGING) { + RD_LOGW(trace_id, "Propose to raft failed due to config_changing, attempt: {}", i); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + continue; + } + break; + } + return ret; +} + +bool RaftReplDev::wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms) { + auto times = timeout_ms / interval_ms; + if (times == 0) { times = 1; } + for (auto i = 0; i < static_cast< int32_t >(times); i++) { + if (check_func()) { return true; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + return false; +} + +ReplServiceError RaftReplDev::set_priority(const replica_member_info& member_out, int32_t priority, uint64_t trace_id) { + auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member_out.id), priority); + // Set_priority should be handled by leader, but if the intent is to set the leader's priority to 0, it returns + // BROADCAST. In this case return NOT_LEADER to let client retry new leader. 
+ // If there is an uncommited_config, nuraft set_priority will honor this uncommited config and generate new + // config based on it and won't have config_changing error. + if (priority_ret != nuraft::raft_server::PrioritySetResult::SET) { + RD_LOGE(trace_id, "Propose to raft to set priority failed, result: {}", + priority_ret == nuraft::raft_server::PrioritySetResult::BROADCAST ? "BROADCAST" : "IGNORED"); + return ReplServiceError::NOT_LEADER; + } + return ReplServiceError::OK; } void RaftReplDev::reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id) { @@ -1011,8 +1282,10 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { RD_LOGD(rreq->traceID(), "Raft channel: Commit rreq=[{}]", rreq->to_compact_string()); if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { leave(); - } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { - replace_member(rreq); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_START_REPLACE) { + start_replace_member(rreq); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + complete_replace_member(rreq); } else { m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); } @@ -1080,8 +1353,11 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) }); } } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY || - rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { - if (rreq->is_proposer()) { m_destroy_promise.setValue(err); } + rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + if (rreq->is_proposer()) { + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + m_destroy_promise.setValue(err); + } } // TODO: Validate if this is a correct assert or not. 
Is it possible that the log is already flushed and we receive @@ -1096,13 +1372,22 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) rreq->clear(); } -void RaftReplDev::replace_member(repl_req_ptr_t rreq) { - auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes()); +void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes()); + + RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); +} + +void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes()); - RD_LOGI(rreq->traceID(), "Raft repl replace_member commit member_out={} member_in={}", + RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}", boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); - m_listener->on_replace_member(members->replica_out, members->replica_in); + m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID()); } static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { @@ -1154,12 +1439,14 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { std::vector< peer_info > pi; auto rep_status = m_repl_svc_ctx->get_raft_status(); for (auto const& pinfo : rep_status) { - pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), - .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_, - .priority_ = pinfo.priority_, - .is_learner_ = pinfo.is_learner_, - .is_new_joiner_ = pinfo.is_new_joiner_}); + auto peer = peer_info{.id_ = 
boost::lexical_cast< replica_id_t >(pinfo.id_), + .replication_idx_ = pinfo.last_log_idx_, + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_}; + peer.role_ = pinfo.is_learner_ ? PeerRole::LEARNER : PeerRole::FOLLOWER; + if (peer.id_ == get_leader_id()) { peer.role_ = PeerRole::LEADER; } + peer.is_self_ = (peer.id_ == m_my_repl_id); + pi.emplace_back(peer); } return pi; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 42d100ebb..6a790c017 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -36,7 +36,7 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); -struct replace_members_ctx { +struct start_replace_members_ctx { replica_member_info replica_out; replica_member_info replica_in; }; @@ -224,8 +224,22 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> replace_member(const replica_member_info& member_out, const replica_member_info& member_in, - uint32_t commit_quorum, uint64_t trace_id = 0); + AsyncReplResult<> start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + uint32_t commit_quorum = 0, uint64_t trace_id = 0); + AsyncReplResult<> complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0); + AsyncReplResult<> flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0); + ReplServiceError do_add_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_remove_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_flip_learner(const replica_member_info& member, 
bool target, bool wait_and_verify, + uint64_t trace_id = 0); + ReplServiceError set_priority(const replica_member_info& member, int32_t priority, uint64_t trace_id = 0); + nuraft::cmd_result_code retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func, + uint64_t trace_id = 0); + bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); + folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// @@ -419,7 +433,8 @@ class RaftReplDev : public ReplDev, void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); - void replace_member(repl_req_ptr_t rreq); + void start_replace_member(repl_req_ptr_t rreq); + void complete_replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id); void create_snp_resync_data(raft_buf_ptr_t& data_out); bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index a690c4bc0..25b0a5d8f 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -61,12 +62,8 @@ class SoloReplDev : public ReplDev { bool is_leader() const override { return true; } replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { - return std::vector< peer_info >{peer_info{.id_ = m_group_id, - .replication_idx_ = 0, - .last_succ_resp_us_ = 0, - .priority_ = 1, - .is_learner_ = false, - .is_new_joiner_ = false}}; + return std::vector< peer_info >{ + peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1, .is_self_ = true}}; } bool 
is_ready_for_traffic() const override { return true; } void purge() override {} diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 05945d3a7..4aae580e9 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -195,9 +195,20 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { +AsyncReplResult<> SoloReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} + +AsyncReplResult<> SoloReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} + +AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 8fc33064c..50b8353d8 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,9 +73,15 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) 
override; - AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index e434f716b..6901043bf 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -123,7 +123,7 @@ void RaftReplService::start() { // new_joiner_type fully disabled log pack behavior. // There is no callback available for handling and localizing the log entries within the pack, which could // result in data corruption. 
- r_params.use_new_joiner_type_ = true; + r_params.use_new_joiner_type_ = false; r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); @@ -457,16 +457,61 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { +AsyncReplResult<> RaftReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->replace_member(member_out, member_in, commit_quorum, trace_id) + ->start_replace_member(member_out, member_in, commit_quorum, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { + decr_pending_request_num(); + return make_async_error<>(e.error()); + } + decr_pending_request_num(); + return make_async_success<>(); + }); +} + +AsyncReplResult<> RaftReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { + decr_pending_request_num(); + return 
make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->complete_replace_member(member_out, member_in, commit_quorum, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { + decr_pending_request_num(); + return make_async_error<>(e.error()); + } + decr_pending_request_num(); + return make_async_success<>(); + }); +} + +AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) const { + if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->flip_learner_flag(member, target, commit_quorum, wait_and_verify, trace_id) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { return make_async_error<>(e.error()); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 27bad10f0..bb8fa3604 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -77,9 +77,16 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + AsyncReplResult<> 
start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id = 0) const override; + + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 0dbd539e3..2d4519b94 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -48,7 +48,7 @@ using namespace homestore; SISL_LOGGING_DEF(test_raft_repl_dev) -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg, nuraft) SISL_OPTION_GROUP(test_raft_repl_dev, (block_size, "", "block_size", "block size to io", @@ -344,8 +344,13 @@ class TestReplicatedDB : public homestore::ReplDevListener { } return blk_alloc_hints{}; } - void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override { - LOGINFO("[Replica={}] replace member out {} in {}", g_helper->replica_num(), + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + } + + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] 
complete replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } @@ -737,19 +742,39 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + void start_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { - LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), + LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + + replica_member_info out{member_out, ""}; + replica_member_info in{member_in, ""}; + auto result = hs()->repl_service().start_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + if (error == ReplServiceError::OK) { + ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); + } else { + ASSERT_EQ(result.hasError(), true) << "Error in replacing member, err="<< result.error(); + ASSERT_EQ(result.error(), error); + } + }); + } + + void complete_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, + replica_id_t member_in, uint32_t commit_quorum = 0, + ReplServiceError error = ReplServiceError::OK) { + this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { + LOGINFO("Complete replace member out={} in={}", boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; - auto result = 
hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + auto result = + hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); if (error == ReplServiceError::OK) { - ASSERT_EQ(result.hasError(), false) << "Error in replacing member"; + ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); } else { - ASSERT_EQ(result.hasError(), true) << "Error in replacing member"; + ASSERT_EQ(result.hasError(), true) << "Error in replacing member, err=" << result.error(); ASSERT_EQ(result.error(), error); } }); diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index 5a6095959..0897a5201 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -38,11 +38,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_test_start(num_members); if (g_helper->replica_num() < num_replicas) { - // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -55,7 +55,15 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { // Skip the member which is going to be replaced. Validate data on all other replica's. 
LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); - } else if (g_helper->replica_num() == member_out) { + } + + g_helper->sync_for_test_start(num_members); + LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); + complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); while (repl_dev && !repl_dev->is_destroyed()) { @@ -106,7 +114,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { // Replace down replica 2 with spare replica 3 with commit quorum 1 // so that leader can go ahead with replacing member. LOGINFO("Replace member started"); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); this->write_on_leader(num_io_entries, true /* wait_for_commit */); LOGINFO("Leader completed num_io={}", num_io_entries); } @@ -154,16 +162,18 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { g_helper->sync_for_test_start(num_members); - this->shutdown_replica(2); - LOGINFO("Shutdown replica 2"); - std::this_thread::sleep_for(std::chrono::seconds(3)); if (g_helper->replica_num() == 0) { - // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. 
LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); + } + //shut down before replace member + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + if (g_helper->replica_num() == 0) { + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -178,14 +188,46 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { this->validate_data(); } - g_helper->sync_for_cleanup_start(num_members); + //shutdown after becoming learner + // this->shutdown_replica(2); + // LOGINFO("Shutdown replica 2"); + // std::this_thread::sleep_for(std::chrono::seconds(2)); + + g_helper->sync_for_test_start(num_members); + LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); + this->run_on_leader(db, [this, db, member_out, member_in]() { + replica_member_info out{g_helper->replica_id(member_out), ""}; + replica_member_info in{g_helper->replica_id(member_in), ""}; + auto result = hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in).get(); + if (result.hasError()) { + ASSERT_EQ(result.error(), ReplServiceError::CANCELLED) + << "Unexpected error in replacing member, err=" << result.error(); + LOGWARN("Error in completing replace member, err={}, will retry after 2s", result.error()); + std::this_thread::sleep_for(std::chrono::seconds(2)); + complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + } + }); + LOGINFO("Replace member old leader done"); + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); if (g_helper->replica_num() == 2) 
{ LOGINFO("Start replica 2"); - db->set_zombie(); this->start_replica(2); + // The out member will have the repl dev destroyed. + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + db->set_zombie(); } + g_helper->sync_for_cleanup_start(num_members); LOGINFO("OneMemberDown test done replica={}", g_helper->replica_num()); } @@ -209,18 +251,18 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { if (g_helper->replica_num() != member_in) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); - // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. this->write_on_leader(num_io_entries, true /* wait_for_commit */); // Leader will return error NOT_LEADER and yield leadership, sleep and connect again // to the new leader. 
LOGINFO("Replace old leader"); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::NOT_LEADER); LOGINFO("Replace member leader yield done"); std::this_thread::sleep_for(std::chrono::seconds(3)); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); LOGINFO("Replace member old leader done"); } @@ -236,7 +278,24 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { this->validate_data(); } - if (g_helper->replica_num() == member_out) { db->set_zombie(); } + g_helper->sync_for_test_start(num_members); + LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); + complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. 
+ auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + db->set_zombie(); + } g_helper->sync_for_cleanup_start(num_members); LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); @@ -264,11 +323,11 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { } if (g_helper->replica_num() == 0) { - // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -283,6 +342,24 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { this->validate_data(); } + g_helper->sync_for_test_start(num_members); + LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); + complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. 
+ auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + } + g_helper->sync_for_cleanup_start(num_members); LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); } diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 4310d81de..57247dad7 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -130,7 +130,8 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {} + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} void on_destroy(const group_id_t& group_id) override {} void notify_committed_lsn(int64_t lsn) override {} void on_config_rollback(int64_t lsn) override {} From 156e0ab40b385a226ff61b534a842477ea207297 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Wed, 28 May 2025 11:14:45 +0800 Subject: [PATCH 121/130] Add a reaper thread to check and complete replace member --- .../homestore/replication/repl_decls.h | 15 +- src/include/homestore/replication_service.hpp | 12 +- src/lib/common/homestore_config.fbs | 3 + src/lib/replication/repl_dev/common.cpp | 2 +- src/lib/replication/repl_dev/common.h | 9 +- 
.../replication/repl_dev/raft_repl_dev.cpp | 197 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 16 +- src/lib/replication/repl_dev/solo_repl_dev.h | 2 +- .../replication/service/generic_repl_svc.cpp | 8 +- .../replication/service/generic_repl_svc.h | 5 +- .../replication/service/raft_repl_service.cpp | 50 +++-- .../replication/service/raft_repl_service.h | 7 +- src/tests/CMakeLists.txt | 2 +- src/tests/test_common/hs_repl_test_common.hpp | 3 +- src/tests/test_common/raft_repl_test_base.hpp | 28 +-- src/tests/test_raft_repl_dev_dynamic.cpp | 134 +++++++----- 16 files changed, 308 insertions(+), 185 deletions(-) diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 16c9f2a14..88a928aa3 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -31,14 +31,9 @@ VENUM(ReplServiceError, int32_t, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, DATA_DUPLICATED = -20002, - QUIENCE_STATE = -20003, + QUIENCE_STATE = -20003, + QUORUM_NOT_MET = -20004, FAILED = -32768); - -VENUM(PeerRole, uint8_t, - UNKNOWN = 0, - LEADER = 1, - FOLLOWER = 2, - LEARNER = 3); // clang-format on template < typename V, typename E > @@ -82,10 +77,8 @@ struct peer_info { uint64_t last_succ_resp_us_ = 0; // The priority for leader election uint32_t priority_ = 0; - // The peer role in replication group - PeerRole role_ = PeerRole::UNKNOWN; - // If this peer is myself - bool is_self_ = false; + // Whether the peer can vote. If a peer is learner, this will be false. Hide the raft details. 
+ bool can_vote = true; }; struct replica_member_info { diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 56154226b..f28704546 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,14 +41,16 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + /// @brief Replace one of the members with a new one. + /// @param group_id Group where the replace member happens + /// @param member_out The member which is going to be replaced + /// @param member_in The member which is going to be added in place of member_out + /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum. 
+ /// @return A Future on replace the member accepted or Future ReplServiceError upon error + virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const = 0; - virtual AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const = 0; - virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 61aba97dd..4a7f9bd8b 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -317,6 +317,9 @@ table Consensus { // The time to wait for config change to be applied in ms wait_for_config_change_ms: uint32 = 500; + + // The interval in ms to check if the new member in replace_member is fully synced and ready to take over + replace_member_sync_check_interval_ms: uint64 = 60000; } table HomeStoreSettings { diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 3b44600ca..6b8ce122b 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -266,7 +266,7 @@ std::string repl_req_ctx::to_string() const { } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_START_REPLACE) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == 
journal_type_t::HS_CTRL_COMPLETE_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index 43bbb7cbf..c3433083f 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -15,7 +15,7 @@ #pragma once #include - +#include #include #include #include @@ -95,4 +95,11 @@ auto make_async_success() { return folly::makeSemiFuture< ReplResult< folly::Unit > >(folly::Unit{}); } +inline uint64_t generateRandomTraceId() { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution< uint64_t > dis; + return dis(gen); +} + } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 08062b5df..0b6cf835b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -137,6 +137,7 @@ bool RaftReplDev::join_group() { return true; } +// All the steps in the implementation should be idempotent and retryable. AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) { @@ -157,31 +158,59 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m // Step1, validate request auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id)); if (!out_srv_cfg) { + auto in_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_in.id)); + if (in_srv_cfg) { + RD_LOGI( + trace_id, + "Step1. 
Replace member, the intent has already been fulfilled, ignore it, member_out={} member_in={}", + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_success<>(); + } RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found"); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } + if (m_my_repl_id != get_leader_id()) { + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::NOT_LEADER); + } // Check if leader itself is requested to move out. - if (m_my_repl_id == member_out.id && m_my_repl_id == get_leader_id()) { - // If leader is the member requested to move out, then set priority to 0(or it will be elected as leader again) - // and give up leadership and return error. Client will retry start_replace_member request to the new leader. - RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out, member_out={}", - boost::uuids::to_string(member_out.id)); - if (out_srv_cfg->get_priority() != 0) { - auto ret = set_priority(member_out, 0, trace_id); - if (ret != ReplServiceError::OK) { - // Actually this is the expected path, because nuraft will BROADCAST error if we are trying to set - // leader's priority=0 - RD_LOGE(trace_id, "Step1. Replace member, set leader's priority to 0, failed {}", ret); - } - } - raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); + if (m_my_repl_id == member_out.id) { + // immediate=false successor=-1, nuraft will choose an alive peer with highest priority as successor, and wait + // until the successor finishes the catch-up of the latest log, and then resign. Return NOT_LEADER and let + // client retry. + raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); RD_LOGI(trace_id, "Step1. 
Replace member, leader is the member_out so yield leadership"); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } + // quorum safety check. TODO currently only consider lsn, need to check last response time. + auto active_peers = get_active_peers(); + // active_peers doesn't include leader itself. + auto quorum = active_peers.size() + 1; + for (const auto& p : active_peers) { + quorum = p == member_out.id ? quorum - 1 : quorum; + quorum = p == member_in.id ? quorum - 1 : quorum; + } + RD_LOGD(trace_id, + "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, " + "commit_quorum={}", + active_peers.size(), quorum, commit_quorum); + // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be + // greater than 1. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow + // replace_member(S3, S4) if S2 is down or laggy. Needs to recover S2 first or retry with commit_quorum=1. + if (quorum <= 1 && commit_quorum == 0) { + RD_LOGE(trace_id, "Step1. Replace member, quorum safety check failed, active_peers={}, active_peers_exclude_out/in_member={}, commit_quorum={}", + active_peers.size(), quorum, commit_quorum); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::QUORUM_NOT_MET); + } // Step 2: Handle out member. #ifdef _PRERELEASE @@ -190,25 +219,25 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m return make_async_error(ReplServiceError::FAILED); } #endif - RD_LOGI(trace_id, "Step2. Replace member flip member to learner"); + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner"); auto learner_ret = do_flip_learner(member_out, true, true, trace_id); if (learner_ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step2. 
Replace member set learner failed {}", learner_ret); + RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}", learner_ret); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error(std::move(learner_ret)); } - RD_LOGI(trace_id, "Step2. Replace member flip out member to learner and set priority to 0"); + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0"); // Step 3. Append log entry to mark the old member is out and new member is added. - RD_LOGI(trace_id, "Step3. Replace member propose to raft for HS_CTRL_START_REPLACE req, group_id={}", + RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}", group_id_str()); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - start_replace_members_ctx members; + replace_member_ctx members; members.replica_out = member_out; members.replica_in = member_in; - sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx)); + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), @@ -217,7 +246,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step3. Replace member propose to raft for HS_CTRL_START_REPLACE req failed {}", err); + RD_LOGE(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed {}", err); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(err)); @@ -230,7 +259,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m return make_async_error(ReplServiceError::FAILED); } #endif - RD_LOGI(trace_id, "Step4. 
Replace member propose to raft to add new member, group_id={}", group_id_str()); + RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}", group_id_str()); auto ret = do_add_member(member_in, trace_id); if (ret != ReplServiceError::OK) { RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret); @@ -238,8 +267,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m decr_pending_request_num(); return make_async_error<>(std::move(ret)); } - RD_LOGI(trace_id, "Step4. Proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id)); - + RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id)); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_success<>(); @@ -307,11 +335,11 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info RD_LOGI(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}", group_id_str()); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - start_replace_members_ctx members; + replace_member_ctx members; members.replica_out = member_out; members.replica_in = member_in; - sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx)); + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), @@ -339,7 +367,7 @@ ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, u RD_LOGI(trace_id, "Member to add failed, not leader"); return ReplServiceError::BAD_REQUEST; } - auto ret = retry_when_config_change( + auto ret = retry_when_config_changing( [&] { auto rem_ret = m_msg_mgr.add_member(m_group_id, member.id) .via(&folly::InlineExecutor::instance()) @@ -366,12 +394,12 @@ ReplServiceError RaftReplDev::do_remove_member(const 
replica_member_info& member // The member should not be the leader. if (m_my_repl_id == member.id && m_my_repl_id == get_leader_id()) { // If leader is the member requested to move out, then give up leadership and return error. - // Client will retry start_replace_member request to the new leader. - raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); + // Client will retry replace_member request to the new leader. + raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); RD_LOGI(trace_id, "Member to remove is the leader so yield leadership"); return ReplServiceError::NOT_LEADER; } - auto ret = retry_when_config_change( + auto ret = retry_when_config_changing( [&] { auto rem_ret = m_msg_mgr.rem_member(m_group_id, member.id) .via(&folly::InlineExecutor::instance()) @@ -427,6 +455,9 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, return ReplServiceError::NOT_LEADER; } if (!target && member.priority == 0) { + // If the intent is to take the learner back to normal member, then priority should not be 0(never has chance to + // become leader). Client need to trace the peers' priority, and give a meaningful value, currently default + // priorities of the quorum: leader=100, follower=66. RD_LOGI(trace_id, "clear learner flag failed, priority is 0, member={}", boost::uuids::to_string(member.id)); return ReplServiceError::BAD_REQUEST; } @@ -439,7 +470,7 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, return ReplServiceError::SERVER_NOT_FOUND; } if (srv_cfg->is_learner() != target) { - auto ret = retry_when_config_change( + auto ret = retry_when_config_changing( [&] { auto learner_ret = raft_server()->flip_learner_flag(nuraft_mesg::to_server_id(member.id), target); return learner_ret->get_result_code(); @@ -457,10 +488,11 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, // 3. 
Set priority // Based on the current nuraft implementation, learner could be elected as leader, so we set priority to 0 to avoid // it. And in turn, we need to revert prioiry change if the member is going to become a normal member. + // FIXME after nuraft fixes the bug, we can remove this logic. auto priority = target ? 0 : member.priority; RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id)); if (srv_cfg->get_priority() != priority) { - auto priority_ret = set_priority(member, priority); + auto priority_ret = set_priority(member.id, priority); if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; } } else { RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority, @@ -485,7 +517,7 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, return ReplServiceError::OK; } -nuraft::cmd_result_code RaftReplDev::retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func, +nuraft::cmd_result_code RaftReplDev::retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, uint64_t trace_id) { auto ret = nuraft::cmd_result_code::OK; int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries); @@ -511,8 +543,8 @@ bool RaftReplDev::wait_and_check(const std::function< bool() >& check_func, uint return false; } -ReplServiceError RaftReplDev::set_priority(const replica_member_info& member_out, int32_t priority, uint64_t trace_id) { - auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member_out.id), priority); +ReplServiceError RaftReplDev::set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id) { + auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member), priority); // Set_priority should be handled by leader, but if the intent is to set the leader's priority to 0, it returns // BROADCAST. 
In this case return NOT_LEADER to let client retry new leader. // If there is an uncommited_config, nuraft set_priority will honor this uncommited config and generate new @@ -1373,21 +1405,32 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) } void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { - auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes()); + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}", boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + // record the replace_member intent + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->replace_member_ctx.replica_in = members->replica_out.id; + m_rd_sb->replace_member_ctx.replica_out = members->replica_in.id; + m_rd_sb.write(); } void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { - auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes()); + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}", boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + + // clear the replace_member intent + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->replace_member_ctx = replace_member_ctx_superblk{}; + m_rd_sb.write(); + RD_LOGI(rreq->traceID(), "Raft repl replace_member_ctx has been cleared."); } static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { @@ -1439,14 +1482,13 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { std::vector< peer_info > pi; auto rep_status = 
m_repl_svc_ctx->get_raft_status(); for (auto const& pinfo : rep_status) { - auto peer = peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), - .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_, - .priority_ = pinfo.priority_}; - peer.role_ = pinfo.is_learner_ ? PeerRole::LEARNER : PeerRole::FOLLOWER; - if (peer.id_ == get_leader_id()) { peer.role_ = PeerRole::LEADER; } - peer.is_self_ = (peer.id_ == m_my_repl_id); - pi.emplace_back(peer); + for (auto const& pinfo : rep_status) { + pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), + .replication_idx_ = pinfo.last_log_idx_, + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_, + .can_vote = !pinfo.is_learner_}); + } } return pi; } @@ -1455,6 +1497,7 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { auto repl_status = get_replication_status(); std::set< replica_id_t > res; auto my_committed_idx = m_commit_upto_lsn.load(); + auto laggy=HS_DYNAMIC_CONFIG(consensus.laggy_threshold); uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) ? 
my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) : 0; @@ -1465,6 +1508,10 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { if (p.id_ == m_my_repl_id) { continue; } if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); + RD_LOGW(NO_TRACE_ID, + "Found active peer {}, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}, laggy={}", p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx, + laggy); } else { RD_LOGW(NO_TRACE_ID, "Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", @@ -1755,6 +1802,66 @@ void RaftReplDev::flush_durable_commit_lsn() { m_rd_sb.write(); } +void RaftReplDev::check_replace_member_status() { + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore check replace member status"); + return; + } + if (!m_repl_svc_ctx || !is_leader()) { return; } + if (m_rd_sb->replace_member_ctx.replica_in == boost::uuids::nil_uuid() || + m_rd_sb->replace_member_ctx.replica_out == boost::uuids::nil_uuid()) { + RD_LOGT(NO_TRACE_ID, "No replace member in progress, return"); + return; + } + + auto peers = get_replication_status(); + repl_lsn_t in_lsn = 0; + repl_lsn_t out_lsn = 0; + repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + + for (auto& peer : peers) { + if (peer.id_ == m_rd_sb->replace_member_ctx.replica_out) { + out_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); + } else if (peer.id_ == m_rd_sb->replace_member_ctx.replica_in) { + in_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); + } + } + // TODO optimize the condition + bool catch_up = in_lsn + laggy >= out_lsn; + + if (!catch_up) { + RD_LOGD(NO_TRACE_ID, "Checking replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", + 
boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + return; + } + + RD_LOGD( + NO_TRACE_ID, + "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with lsn={}", + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + + trace_id_t trace_id = generateRandomTraceId(); + + RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)); + + replica_member_info out{m_rd_sb->replace_member_ctx.replica_in, ""}; + replica_member_info in{m_rd_sb->replace_member_ctx.replica_out, ""}; + auto ret = complete_replace_member(out, in, 0, trace_id).get(); + if (ret.hasError()) { + RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); + return; + } + RD_LOGI(trace_id, "Complete replace member, next time will retry it, replica_in={}, replica_out={}", + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)) +} + /////////////////////////////////// Private metohds //////////////////////////////////// void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { if (is_destroyed()) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 6a790c017..abede36bf 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -15,6 +15,10 @@ #include "replication/log_store/repl_log_store.h" namespace homestore { +struct replace_member_ctx_superblk { + replica_id_t replica_out; + replica_id_t replica_in; +}; #pragma pack(1) struct raft_repl_dev_superblk : public 
repl_dev_superblk { @@ -26,6 +30,7 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { uint64_t last_applied_dsn; // Last applied data sequence number uint8_t destroy_pending; // Flag to indicate whether the group is in destroy pending state repl_lsn_t last_snapshot_lsn; // Last snapshot LSN follower received from leader + replace_member_ctx_superblk replace_member_ctx; // Replace members context, used to track the replace member status uint32_t get_raft_sb_version() const { return raft_sb_version; } }; @@ -36,7 +41,7 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); -struct start_replace_members_ctx { +struct replace_member_ctx { replica_member_info replica_out; replica_member_info replica_in; }; @@ -235,8 +240,8 @@ class RaftReplDev : public ReplDev, ReplServiceError do_remove_member(const replica_member_info& member, uint64_t trace_id = 0); ReplServiceError do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify, uint64_t trace_id = 0); - ReplServiceError set_priority(const replica_member_info& member, int32_t priority, uint64_t trace_id = 0); - nuraft::cmd_result_code retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func, + ReplServiceError set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id = 0); + nuraft::cmd_result_code retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, uint64_t trace_id = 0); bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); @@ -366,6 +371,11 @@ class RaftReplDev : public ReplDev, */ void flush_durable_commit_lsn(); + /** + * Check the replace_member status, if the new member is fully synced up and ready to take over, remove the old member. 
+ */ + void check_replace_member_status(); + /** * \brief This method is called during restart to notify the upper layer */ diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 25b0a5d8f..9cf41dcce 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -63,7 +63,7 @@ class SoloReplDev : public ReplDev { replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { return std::vector< peer_info >{ - peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1, .is_self_ = true}}; + peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1}}; } bool is_ready_for_traffic() const override { return true; } void purge() override {} diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 4aae580e9..7d226c016 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -195,18 +195,12 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out, +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -AsyncReplResult<> SoloReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { - return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); -} - AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const 
replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 50b8353d8..cd63a8866 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,12 +73,9 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; - AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 6901043bf..3cb0ad910 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -457,9 +457,15 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - 
uint64_t trace_id) const { +// replace_member actually has two phases: +// 1. start_replace_member: flip member_out to learner and add member_in. +// 2. complete_replace_member: remove member_out. +// In this function, it only invokes replDev start_replace_member. There is +// a background reaper thread helps periodically check the member_in replication status, after in_member has caught up, +// will trigger replDev complete_replace_member. +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); @@ -478,29 +484,6 @@ AsyncReplResult<> RaftReplService::start_replace_member(group_id_t group_id, con }); } -AsyncReplResult<> RaftReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { - if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); - incr_pending_request_num(); - auto rdev_result = get_repl_dev(group_id); - if (!rdev_result) { - decr_pending_request_num(); - return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); - } - return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->complete_replace_member(member_out, member_in, commit_quorum, trace_id) - .via(&folly::InlineExecutor::instance()) - .thenValue([this](auto&& e) mutable { - if (e.hasError()) { - decr_pending_request_num(); - return make_async_error<>(e.error()); - } - decr_pending_request_num(); - return make_async_success<>(); - }); -} - AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { if 
(is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); @@ -550,12 +533,19 @@ void RaftReplService::start_reaper_thread() { HS_DYNAMIC_CONFIG(consensus.flush_durable_commit_interval_ms) * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { flush_durable_commit_lsn(); }); + // Check replace_member sync status to see a new member is fully synced up and ready to remove the old member + m_replace_member_sync_check_timer_hdl = iomanager.schedule_thread_timer( + HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */, + nullptr, [this](void*) { check_replace_member_status(); }); + + p.setValue(); } else { // Cancel all recurring timers started iomanager.cancel_timer(m_rdev_gc_timer_hdl, true /* wait */); iomanager.cancel_timer(m_rdev_fetch_timer_hdl, true /* wait */); iomanager.cancel_timer(m_flush_durable_commit_timer_hdl, true /* wait */); + iomanager.cancel_timer(m_replace_member_sync_check_timer_hdl, true /* wait */); } }); std::move(f).get(); @@ -644,6 +634,14 @@ void RaftReplService::flush_durable_commit_lsn() { } } +void RaftReplService::check_replace_member_status() { + std::unique_lock lg(m_rd_map_mtx); + for (auto& rdev_parent : m_rd_map) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); + rdev->check_replace_member_status(); + } +} + ///////////////////// RaftReplService CP Callbacks ///////////////////////////// int ReplSvcCPContext::add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx) { m_cp_ctx_map.emplace(dev, dev_ctx); diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index bb8fa3604..aa9550c4f 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -52,6 +52,7 @@ class RaftReplService : public GenericReplService, iomgr::timer_handle_t m_rdev_fetch_timer_hdl; iomgr::timer_handle_t m_rdev_gc_timer_hdl; iomgr::timer_handle_t 
m_flush_durable_commit_timer_hdl; + iomgr::timer_handle_t m_replace_member_sync_check_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; std::mutex raft_restart_mutex; @@ -77,12 +78,9 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; - AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, @@ -96,6 +94,7 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); + void check_replace_member_status(); void monitor_cert_changes(); void restart_raft_svc(const std::string filepath, const bool deleted); bool wait_for_cert(const std::string& filepath); diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 95bb695ad..3fbf5c544 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -126,7 +126,7 @@ if (${io_tests}) add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) + add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config 
homestore_config.consensus.replace_member_sync_check_interval_ms=1000) add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 92ff45a69..c00788127 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -301,7 +301,8 @@ class HSReplTestHelper : public HSTestHelper { auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); ASSERT_EQ(v.hasValue(), true) - << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str() + << ", err=" << v.error(); auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); auto follower_priority = raftService.compute_raft_follower_priority(); auto repl_dev = v.value(); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 2d4519b94..80eeb1573 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -742,7 +742,7 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - void start_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), @@ -750,32 +750,12 @@ class RaftReplDevTestBase : public testing::Test { replica_member_info out{member_out, ""}; 
replica_member_info in{member_in, ""}; - auto result = hs()->repl_service().start_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); if (error == ReplServiceError::OK) { ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); } else { - ASSERT_EQ(result.hasError(), true) << "Error in replacing member, err="<< result.error(); - ASSERT_EQ(result.error(), error); - } - }); - } - - void complete_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, - replica_id_t member_in, uint32_t commit_quorum = 0, - ReplServiceError error = ReplServiceError::OK) { - this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { - LOGINFO("Complete replace member out={} in={}", boost::uuids::to_string(member_out), - boost::uuids::to_string(member_in)); - - replica_member_info out{member_out, ""}; - replica_member_info in{member_in, ""}; - auto result = - hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); - if (error == ReplServiceError::OK) { - ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); - } else { - ASSERT_EQ(result.hasError(), true) << "Error in replacing member, err=" << result.error(); - ASSERT_EQ(result.error(), error); + ASSERT_EQ(result.hasError(), true); + ASSERT_EQ(result.error(), error) << "Error in replacing member, err=" << result.error(); } }); } diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index 0897a5201..4ae56a9c3 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -13,6 +13,8 @@ * *********************************************************************************/ #include "test_common/raft_repl_test_base.hpp" +#include +#include 
"common/homestore_config.hpp" // Dynamic tests spawn spare replica's also which can be used to add and remove from a repl dev. class ReplDevDynamicTest : public RaftReplDevTestBase { @@ -38,11 +40,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_test_start(num_members); if (g_helper->replica_num() < num_replicas) { - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -56,13 +58,10 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); } - - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + + //wait for background reaper thread to trigger complete_replace_member if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -114,7 +113,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { // Replace down replica 2 with spare replica 3 with commit quorum 1 // so that leader can go ahead with replacing member. LOGINFO("Replace member started"); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); this->write_on_leader(num_io_entries, true /* wait_for_commit */); LOGINFO("Leader completed num_io={}", num_io_entries); } @@ -145,11 +144,11 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num()); } -TEST_F(ReplDevDynamicTest, OneMemberDown) { +TEST_F(ReplDevDynamicTest, OutMemberDown) { // replica0(leader) and replica1 up, replica2 is down. Replace replica2 with replica3. // replica0 should be able to baseline resync to replica4(new member). // Write some IO's, replace a member, validate all members data except which is out. - LOGINFO("OneMemberDown test started replica={}", g_helper->replica_num()); + LOGINFO("OutMemberDown test started replica={}", g_helper->replica_num()); auto db = dbs_.back(); auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); @@ -164,7 +163,7 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { std::this_thread::sleep_for(std::chrono::seconds(3)); if (g_helper->replica_num() == 0) { - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); } @@ -173,7 +172,7 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { LOGINFO("Shutdown replica 2"); if (g_helper->replica_num() == 0) { - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -188,30 +187,16 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { this->validate_data(); } - //shutdown after becoming learner + // shutdown after becoming learner, in this case, the member_out won't remove replDev after restart. // this->shutdown_replica(2); // LOGINFO("Shutdown replica 2"); // std::this_thread::sleep_for(std::chrono::seconds(2)); - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - this->run_on_leader(db, [this, db, member_out, member_in]() { - replica_member_info out{g_helper->replica_id(member_out), ""}; - replica_member_info in{g_helper->replica_id(member_in), ""}; - auto result = hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in).get(); - if (result.hasError()) { - ASSERT_EQ(result.error(), ReplServiceError::CANCELLED) - << "Unexpected error in replacing member, err=" << result.error(); - LOGWARN("Error in completing replace member, err={}, will retry after 2s", result.error()); - std::this_thread::sleep_for(std::chrono::seconds(2)); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - } - }); - - LOGINFO("Replace member old leader done"); - + // data synced, waiting for removing learner + LOGINFO("data synced, sync for completing replace member, 
replica={}", g_helper->replica_num()); g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + // Since the out_member stopped, it cannot response to remove_srv req, as a result the first time will get CANCELLED + // error, so waiting time is longer than other tests. if (g_helper->replica_num() == 2) { LOGINFO("Start replica 2"); this->start_replica(2); @@ -249,20 +234,21 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { g_helper->sync_for_test_start(num_members); - if (g_helper->replica_num() != member_in) { + if (g_helper->replica_num() == member_out) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. this->write_on_leader(num_io_entries, true /* wait_for_commit */); // Leader will return error NOT_LEADER and yield leadership, sleep and connect again // to the new leader. 
LOGINFO("Replace old leader"); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::NOT_LEADER); LOGINFO("Replace member leader yield done"); - - std::this_thread::sleep_for(std::chrono::seconds(3)); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + } + std::this_thread::sleep_for(std::chrono::seconds(3)); + if (g_helper->replica_num() != member_in) { + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); LOGINFO("Replace member old leader done"); } @@ -278,12 +264,8 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { this->validate_data(); } - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -318,16 +300,16 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { g_helper->sync_for_test_start(num_members); if (g_helper->replica_num() == 1) { - LOGINFO("Restart replica 1"); + LOGINFO("Restart replica 1, "); this->restart_replica(15); } if (g_helper->replica_num() == 0) { - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -342,12 +324,8 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { this->validate_data(); } - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -364,6 +342,60 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); } +TEST_F(ReplDevDynamicTest, ValidateRequest) { + LOGINFO("ValidateRequest test started replica={}", g_helper->replica_num()); + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.consensus.laggy_threshold = 0; + LOGINFO("setup consensus.laggy_threshold to {}", 0); + HS_SETTINGS_FACTORY().save(); + }); + + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + //shut down before replace member + this->shutdown_replica(1); + LOGINFO("Shutdown replica 1"); + + //wait for shutdown + std::this_thread::sleep_for(std::chrono::seconds(3)); + g_helper->sync_for_verify_start(num_members); + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
+ LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + } + g_helper->sync_for_verify_start(num_members); + if (g_helper->replica_num() == 0) { + // generate uuid + replica_id_t fake_member_out = boost::uuids::random_generator()(); + replica_id_t fake_member_in = boost::uuids::random_generator()(); + LOGINFO("test SERVER_NOT_FOUND"); + replace_member(db, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND); + LOGINFO("test replace_member already complete"); + replace_member(db, fake_member_out, g_helper->replica_id(0)); + LOGINFO("test QUORUM_NOT_MET", num_io_entries, g_helper->replica_num()); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + ReplServiceError::QUORUM_NOT_MET); + } + + if (g_helper->replica_num() == 1) { + LOGINFO("Start replica 1"); + this->start_replica(1); + } + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("ValidateRequest test done replica={}", g_helper->replica_num()); +} + int main(int argc, char* argv[]) { int parsed_argc = argc; char** orig_argv = argv; From be10be4204cd4b2fb3cb7f5fe908ea95cd850c68 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 29 May 2025 15:23:33 +0800 Subject: [PATCH 122/130] Fix replace_member --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 36 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/conanfile.py b/conanfile.py index 213c86544..453140526 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.0" + version = "6.15.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 0b6cf835b..e8a0ef576 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ 
b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1413,8 +1413,8 @@ void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); // record the replace_member intent std::unique_lock lg{m_sb_mtx}; - m_rd_sb->replace_member_ctx.replica_in = members->replica_out.id; - m_rd_sb->replace_member_ctx.replica_out = members->replica_in.id; + m_rd_sb->replace_member_ctx.replica_in = members->replica_in.id; + m_rd_sb->replace_member_ctx.replica_out = members->replica_out.id; m_rd_sb.write(); } @@ -1508,7 +1508,7 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { if (p.id_ == m_my_repl_id) { continue; } if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); - RD_LOGW(NO_TRACE_ID, + RD_LOGT(NO_TRACE_ID, "Found active peer {}, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}, laggy={}", p.id_, my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx, laggy); @@ -1815,15 +1815,17 @@ void RaftReplDev::check_replace_member_status() { } auto peers = get_replication_status(); + auto replica_in = m_rd_sb->replace_member_ctx.replica_in; + auto replica_out = m_rd_sb->replace_member_ctx.replica_out; repl_lsn_t in_lsn = 0; repl_lsn_t out_lsn = 0; repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); for (auto& peer : peers) { - if (peer.id_ == m_rd_sb->replace_member_ctx.replica_out) { + if (peer.id_ == replica_out) { out_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); - } else if (peer.id_ == m_rd_sb->replace_member_ctx.replica_in) { + } else if (peer.id_ == replica_in) { in_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); } @@ -1833,33 +1835,29 @@ void RaftReplDev::check_replace_member_status() { if (!catch_up) { RD_LOGD(NO_TRACE_ID, "Checking 
replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); return; } - RD_LOGD( - NO_TRACE_ID, - "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with lsn={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + RD_LOGD(NO_TRACE_ID, + "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with " + "lsn={}", + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); trace_id_t trace_id = generateRandomTraceId(); RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)); + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); - replica_member_info out{m_rd_sb->replace_member_ctx.replica_in, ""}; - replica_member_info in{m_rd_sb->replace_member_ctx.replica_out, ""}; + replica_member_info out{replica_out, ""}; + replica_member_info in{replica_in, ""}; auto ret = complete_replace_member(out, in, 0, trace_id).get(); if (ret.hasError()) { RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); return; } - RD_LOGI(trace_id, "Complete replace member, next time will retry it, replica_in={}, replica_out={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)) + RD_LOGI(trace_id, "Complete replace member, replica_in={}, replica_out={}", + 
boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)) } /////////////////////////////////// Private metohds //////////////////////////////////// From a05b089cb26c96bc4f202748488abfcc1f5fa989 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Fri, 30 May 2025 19:55:20 +0800 Subject: [PATCH 123/130] fix bug in get_replication_status --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/conanfile.py b/conanfile.py index 453140526..dc3bef22b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.1" + version = "6.15.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index e8a0ef576..2303fda68 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1482,13 +1482,11 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { std::vector< peer_info > pi; auto rep_status = m_repl_svc_ctx->get_raft_status(); for (auto const& pinfo : rep_status) { - for (auto const& pinfo : rep_status) { - pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), - .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_, - .priority_ = pinfo.priority_, - .can_vote = !pinfo.is_learner_}); - } + pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), + .replication_idx_ = pinfo.last_log_idx_, + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_, + .can_vote = !pinfo.is_learner_}); } return pi; } From f3003480227747513062e954dc5c446bdd5f67f6 Mon Sep 17 00:00:00 2001 From: Brian Szmyd Date: Mon, 2 Jun 2025 09:11:24 -0600 Subject: [PATCH 124/130] The usage of EVP_DigestInit_ex2 in 
meta_blk test requires openssl3.x (#732) --- conanfile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conanfile.py b/conanfile.py index dc3bef22b..4c96b1854 100644 --- a/conanfile.py +++ b/conanfile.py @@ -60,6 +60,9 @@ def requirements(self): if self.settings.arch in ['x86', 'x86_64']: self.requires("isa-l/2.30.0", transitive_headers=True) + # Tests require OpenSSL 3.x + self.requires("openssl/[^3.1]", override=True) + def imports(self): self.copy(root_package="sisl", pattern="*", dst="bin/scripts/python/flip/", src="bindings/flip/python/", keep_path=False) From aa2855451a9956df36856e85829472f61e557bd4 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 2 Jun 2025 18:23:56 -0700 Subject: [PATCH 125/130] Fix prefix merge and enable long running (#729) --- .jenkins/jenkinsfile_nightly | 8 +- conanfile.py | 2 +- .../btree/detail/btree_remove_impl.ipp | 10 ++ .../btree/node_variant/prefix_node.hpp | 94 ++++++++++++++----- src/tests/btree_helpers/btree_test_helper.hpp | 2 +- src/tests/test_btree_long_running | 8 +- src/tests/test_btree_node.cpp | 33 +++++++ .../test_common/homestore_test_common.hpp | 2 +- src/tests/test_mem_btree.cpp | 4 + src/tests/test_scripts/CMakeLists.txt | 19 +--- src/tests/test_scripts/index_test.py | 4 + 11 files changed, 141 insertions(+), 45 deletions(-) diff --git a/.jenkins/jenkinsfile_nightly b/.jenkins/jenkinsfile_nightly index 7100a0230..8083c816b 100644 --- a/.jenkins/jenkinsfile_nightly +++ b/.jenkins/jenkinsfile_nightly @@ -55,10 +55,10 @@ pipeline { find /home/jenkins -type f -wholename '*/test_data_service' -exec cp {} .jenkins/test_data_service \\; find /home/jenkins -type f -wholename '*/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\; find /home/jenkins -type f -wholename '*/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\; - find /home/jenkins -type f -wholename '*/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py 
\\; - find /home/jenkins -type f -wholename '*/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; - find /home/jenkins -type f -wholename '*/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; - find /home/jenkins -type f -wholename '*/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; ''' } post { diff --git a/conanfile.py b/conanfile.py index 4c96b1854..1add3e2ab 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.2" + version = "6.15.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 67acd7d5a..04e483377 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -469,6 +469,9 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const BT_NODE_LOG_ASSERT_EQ(child->is_node_deleted(), false, child); old_nodes.push_back(child); + // Todo: need a more precise calculation considering compacted size for prefix nodes because when merge happens + // compaction will occur for both leftmost and new nodes. This calculation makes available size not be balanced + // for the leftmost node and new nodes. 
total_size += child->occupied_size(); } @@ -506,6 +509,13 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const auto const nentries = old_nodes[i]->num_entries_by_size(0, available_size); if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in available_size -= old_nodes[i]->occupied_size(); + // For prefix nodes, compaction will make the size smaller, so we can compact saving to available size; + // hence it cannot get negative. + if (old_nodes[i]->get_node_type() == btree_node_type::PREFIX) { + auto cur_node = static_cast< FixedPrefixNode< K, V >* >(old_nodes[i].get()); + available_size += cur_node->compact_saving(); + } + BT_NODE_DBG_ASSERT_EQ(available_size >= 0, true, leftmost_node, "negative available size"); if (i >= old_nodes.size() - 1) { src_cursor.ith_node = i + 1; src_cursor.nth_entry = std::numeric_limits< uint32_t >::max(); diff --git a/src/include/homestore/btree/node_variant/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp index 795a1a78e..e63e2c1d8 100644 --- a/src/include/homestore/btree/node_variant/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -307,7 +307,6 @@ class FixedPrefixNode : public VariantNode< K, V > { } } if (num_removed) { this->inc_gen(); } - #ifndef NDEBUG validate_sanity(); #endif @@ -340,10 +339,18 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + uint16_t get_nth_suffix_slot_num(uint32_t idx) const { return get_suffix_entry_c(idx)->prefix_slot; } + + uint16_t get_nth_prefix_ref_count(uint32_t idx) const { + return get_prefix_entry_c(get_suffix_entry_c(idx)->prefix_slot)->ref_count; + } + + uint32_t compact_saving() const { return num_prefix_holes() * prefix_entry::size(); } + uint32_t available_size() const override { auto num_holes = num_prefix_holes(); if (num_holes > prefix_node_header::min_holes_to_compact) { - return available_size_without_compaction() + (num_holes * prefix_entry::size()); + return 
available_size_with_compaction(); } else { return available_size_without_compaction(); } @@ -432,7 +439,6 @@ class FixedPrefixNode : public VariantNode< K, V > { // part of Step 1, except generation count this->inc_gen(); dst_node.inc_gen(); - auto new_phdr = dst_node.prefix_header(); if (!this->is_leaf() && (dst_node.total_entries() != 0)) { // Incase this node is an edge node, move the stick to the right hand side node @@ -668,10 +674,11 @@ class FixedPrefixNode : public VariantNode< K, V > { } std::string to_string(bool print_friendly = false) const override { - auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ", - (print_friendly ? "------------------------------------------------------------\n" : ""), - this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode()); + auto str = + fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} occupied_size={} ", + (print_friendly ? "------------------------------------------------------------\n" : ""), + this->node_id(), this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), + this->next_bnode(), this->available_size(), this->occupied_size()); if (!this->is_leaf() && (this->has_valid_edge())) { fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -682,9 +689,10 @@ class FixedPrefixNode : public VariantNode< K, V > { prefix_bitset_.to_string(cbitset_blob())); for (uint32_t i{0}; i < this->total_entries(); ++i) { - fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? "\n\t" : " "), i + 1, - BtreeNode::get_nth_key< K >(i, false).to_string(), - this->get_nth_value(i, false).to_string()); + fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={} slot#={} ref_count={}]", + (print_friendly ? 
"\n\t" : " "), i + 1, BtreeNode::get_nth_key< K >(i, false).to_string(), + this->get_nth_value(i, false).to_string(), this->get_nth_suffix_slot_num(i), + this->get_nth_prefix_ref_count(i)); } return str; } @@ -713,7 +721,9 @@ class FixedPrefixNode : public VariantNode< K, V > { auto phdr = prefix_header(); ++phdr->used_slots; - if (slot_num > phdr->tail_slot) { phdr->tail_slot = slot_num; } + if (slot_num + 1u > phdr->tail_slot) { phdr->tail_slot = slot_num + 1u; } + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", + slot_num, phdr->tail_slot); return slot_num; } @@ -746,17 +756,16 @@ class FixedPrefixNode : public VariantNode< K, V > { uint8_t const* suffix = r_cast< uint8_t const* >(get_suffix_entry_c(this->total_entries())); uint8_t const* prefix = r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)); - if (suffix <= prefix) { - return prefix - suffix; + if (suffix <= prefix + prefix_entry::size()) { + return prefix - suffix + prefix_entry::size(); } else { - DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area"); + DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area {}", + int64_t(suffix - prefix)); return 0; } } - uint32_t available_size_with_compaction() const { - return available_size_without_compaction() + (num_prefix_holes() * prefix_entry::size()); - } + uint32_t available_size_with_compaction() const { return available_size_without_compaction() + compact_saving(); } bool has_room(uint16_t for_nentries) const { return (available_size_without_compaction() >= (prefix_entry::size() + (for_nentries * suffix_entry::size()))); @@ -768,7 +777,9 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t num_prefix_holes() const { auto phdr = cprefix_header(); - return (phdr->tail_slot + 1 - phdr->used_slots); + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot 
number {}", + phdr->used_slots, phdr->tail_slot); + return (phdr->tail_slot - phdr->used_slots); } bool is_compaction_suggested() const { return (num_prefix_holes() > prefix_node_header::min_holes_to_compact); } @@ -811,6 +822,9 @@ class FixedPrefixNode : public VariantNode< K, V > { // Finally adjust the tail offset to the compacted area. auto phdr = prefix_header(); phdr->tail_slot = phdr->used_slots; + DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(0u), + "Tail slot is not equal to the next reset bit, not expected"); + DEBUG_ASSERT_EQ(this->num_prefix_holes(), 0, "Shouldn't be any hole after compression, not expected"); } #ifndef NDEBUG @@ -851,13 +865,15 @@ class FixedPrefixNode : public VariantNode< K, V > { uint8_t const* csuffix_kv_area() const { return cbitset_area() + bitset_size(); } prefix_entry* get_prefix_entry(uint16_t slot_num) { - return r_cast< prefix_entry* >(this->node_data_area() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry* >( + this->node_data_area() + + (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } prefix_entry const* get_prefix_entry_c(uint16_t slot_num) const { - return r_cast< prefix_entry const* >(this->node_data_area_const() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry const* >( + this->node_data_area_const() + + (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } suffix_entry* get_suffix_entry(uint16_t idx) { @@ -869,5 +885,39 @@ class FixedPrefixNode : public VariantNode< K, V > { static constexpr uint32_t get_key_size() { return prefix_entry::key_size() + suffix_entry::key_size(); } static constexpr uint32_t get_value_size() { return prefix_entry::value_size() + suffix_entry::value_size(); } + + std::string compact_bitset() const { + auto x = prefix_bitset_.to_string(); + std::ostringstream result; + std::vector< 
size_t > indices; + for (size_t i = 0; i < x.size(); ++i) { + if (x[i] == '1') { indices.push_back(i); } + } + + if (indices.empty()) { return result.str(); } + + size_t start = indices[0]; + size_t end = start; + result << "size = " << indices.size() << " : "; + for (size_t i = 1; i < indices.size(); ++i) { + if (indices[i] == end + 1) { + end = indices[i]; + } else { + if (start == end) { + result << start << ", "; + } else { + result << start << "-" << end << ", "; + } + start = end = indices[i]; + } + } + if (start == end) { + result << start; + } else { + result << start << "-" << end; + } + + return result.str(); + } }; } // namespace homestore diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index a7e14df41..9b2b07c52 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -276,7 +276,7 @@ struct BtreeTestHelper { } void range_remove_existing_random() { - static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5}; + static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 50}; auto const [start_k, end_k] = m_shadow_map.pick_random_existing_keys(s_rand_range_generator(m_re)); do_range_remove(start_k, end_k, true /* only_existing */); diff --git a/src/tests/test_btree_long_running b/src/tests/test_btree_long_running index 380a906ab..3c9ff5ffa 100644 --- a/src/tests/test_btree_long_running +++ b/src/tests/test_btree_long_running @@ -275,7 +275,7 @@ TYPED_TEST(BtreeTest, TriggerCacheEviction) { s.resource_limits.cache_size_percent = 1u; HS_SETTINGS_FACTORY().save(); }); - + this->restart_homestore(); LOGINFO("TriggerCacheEviction test start"); @@ -661,6 +661,8 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin this->m_bt->count_keys(this->m_bt->root_node_id())); BtreeTestHelper< TestType >::TearDown(); m_helper.shutdown_homestore(false); + this->m_bt.reset(); + 
log_obj_life_counter(); } private: @@ -691,6 +693,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 1634984f3..d97511961 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -341,6 +341,39 @@ TYPED_TEST(NodeTest, SequentialInsert) { this->validate_get_any(98, 102); } +TYPED_TEST(NodeTest, SimpleInsert) { + auto oc = this->m_node1->occupied_size(); + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(2); + this->remove(1); + this->remove(3); + auto oc2 = this->m_node1->occupied_size(); + ASSERT_EQ(oc, oc2) << "Occupied size cannot be more than original size"; + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(3); + this->remove(2); + this->remove(1); + ASSERT_EQ(oc, oc2) << "Occupied size must be the same as original size"; + + this->put(2, btree_put_type::INSERT); + this->put(1, btree_put_type::INSERT); + this->put(4, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + for (uint32_t i = 5; i <= 50; ++i) { + this->put(i, btree_put_type::INSERT); + } + LOGDEBUG("Creating a hole with size of 11 for prefix compaction usecase"); + for (uint32_t i = 10; i <= 20; ++i) { + this->remove(i); + } + this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, 20); + this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, 0, std::numeric_limits< uint32_t >::max()); +} + TYPED_TEST(NodeTest, ReverseInsert) { for (uint32_t i{100}; (i > 0 && 
this->has_room()); --i) { this->put(i - 1, btree_put_type::INSERT); diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 56013c9ec..5391e1685 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -349,7 +349,7 @@ class HSTestHelper { auto fut = homestore::hs()->cp_mgr().trigger_cp_flush(true /* force */); auto on_complete = [&](auto success) { HS_REL_ASSERT_EQ(success, true, "CP Flush failed"); - LOGINFO("CP Flush completed"); + LOGDEBUG("CP Flush completed"); }; if (wait) { diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 5c6a15b59..83330422d 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -332,6 +332,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. 
Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_scripts/CMakeLists.txt b/src/tests/test_scripts/CMakeLists.txt index e1b5ff78c..4bb54bad5 100644 --- a/src/tests/test_scripts/CMakeLists.txt +++ b/src/tests/test_scripts/CMakeLists.txt @@ -1,15 +1,4 @@ -file(COPY vol_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_flip.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY index_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY log_meta_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY data_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY long_running.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) - -#add_test(NAME TestVolRecovery COMMAND ${CMAKE_BINARY_DIR}/bin/scripts/vol_test.py --test_suits=recovery --dirpath=${CMAKE_BINARY_DIR}/bin/) -#SET_TESTS_PROPERTIES(TestVolRecovery PROPERTIES DEPENDS TestVol) - -#add_test(NAME PerfTestVol COMMAND perf_test_volume) -#add_test(NAME RecoveryVol COMMAND python vol_test.py) -#add_test(NAME CheckBtree COMMAND check_btree) - +file(COPY index_test.py DESTINATION ../test_scripts) +file(COPY log_meta_test.py DESTINATION ../test_scripts) +file(COPY data_test.py DESTINATION ../test_scripts) +file(COPY long_running.py DESTINATION ../test_scripts) diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 9ea87432b..b9e55a15e 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -144,6 +144,10 @@ def main(): def long_running(*args): options = parse_arguments() + long_runnig_index(options, 0) + long_running_clean_shutdown(options, 0) + long_runnig_index(options, 1) + long_running_clean_shutdown(options, 1) for i in range(20): print(f"Iteration {i + 1}") long_running_crash_put_remove(options) From 
f0b515946ae508d5b3efe087f6642605e6022133 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 2 Jun 2025 23:47:57 -0700 Subject: [PATCH 126/130] Fix overlapping range and enable index crash recovery for prefix (#735) --- conanfile.py | 2 +- .../btree/node_variant/prefix_node.hpp | 32 +++++++++- src/tests/btree_helpers/btree_test_kvs.hpp | 23 +++++-- src/tests/test_btree_node.cpp | 8 ++- src/tests/test_index_crash_recovery.cpp | 63 ++++++++++--------- 5 files changed, 92 insertions(+), 36 deletions(-) diff --git a/conanfile.py b/conanfile.py index 1add3e2ab..11f1691dc 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.3" + version = "6.15.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/node_variant/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp index e63e2c1d8..21b1830a4 100644 --- a/src/include/homestore/btree/node_variant/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -88,6 +88,21 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + int compare(BtreeKey const& key, BtreeValue const& val) const { + if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { + sisl::blob const kblob = s_cast< K const& >(key).serialize_prefix(); + sisl::blob const vblob = s_cast< V const& >(val).serialize_prefix(); + DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Prefix key size mismatch with serialized prefix size"); + DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Prefix value size mismatch with serialized prefix size"); + uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(prefix_entry); + int cmp = std::memcmp(cur_ptr, kblob.cbytes(), kblob.size()); + if (cmp) { return cmp; } + cmp = std::memcmp(cur_ptr + kblob.size(), vblob.cbytes(), 
vblob.size()); + return cmp; + } + return 0; + } + sisl::blob key_buf() const { return sisl::blob{r_cast< uint8_t const* >(this) + sizeof(prefix_entry), key_size()}; } @@ -318,6 +333,7 @@ class FixedPrefixNode : public VariantNode< K, V > { ///////////////////////////// All overrides of BtreeNode /////////////////////////////////// void get_nth_key_internal(uint32_t idx, BtreeKey& out_key, bool) const override { + DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string()); suffix_entry const* sentry = get_suffix_entry_c(idx); prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot); DEBUG_ASSERT(prefix_bitset_.is_bit_set(cbitset_blob(), sentry->prefix_slot), @@ -361,7 +377,13 @@ class FixedPrefixNode : public VariantNode< K, V > { this->available_size()); } - bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { return has_room(1u); } + bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { +#ifdef _PRERELEASE + auto max_keys = this->max_keys_in_node(); + if (max_keys && this->total_entries() > max_keys) { return false; } +#endif + return has_room(1u); + } uint32_t get_nth_key_size(uint32_t) const override { return dummy_key< K >.serialized_size(); } @@ -606,6 +628,14 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t copy_internal(BtreeNode const& o, uint32_t start_idx, bool by_size, uint32_t limit) { FixedPrefixNode const& src_node = s_cast< FixedPrefixNode const& >(o); +#ifdef _PRERELEASE + if (by_size) { + const uint32_t max_keys = this->max_keys_in_node(); + if (max_keys) { + if (this->total_entries() + limit > max_keys) { limit = max_keys - this->total_entries(); } + } + } +#endif // Adjust the size_to_move to cover the new node's reqd header space. 
uint32_t copied_size{0}; diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index cac6bc4dc..c1baa8f38 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -60,6 +60,17 @@ static std::string gen_random_string(size_t len, uint32_t preamble = std::numeri } return str; } +template < typename T > +static bool willAdditionOverflow(T a, int b) { + static_assert(std::is_integral< T >::value, "Template parameter must be an integral type."); + + if (b > 0) { + return a > std::numeric_limits< T >::max() - b; + } else if (b < 0) { + return a < std::numeric_limits< T >::min() - b; + } + return false; +} using namespace homestore; @@ -310,7 +321,7 @@ class TestIntervalKey : public BtreeIntervalKey { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base, m_offset); } + std::string to_string() const override { return fmt::format("{}", key()); } static uint32_t get_max_size() { return sizeof(TestIntervalKey); } @@ -323,9 +334,10 @@ class TestIntervalKey : public BtreeIntervalKey { int distance(BtreeKey const& f) const override { TestIntervalKey const& from = s_cast< TestIntervalKey const& >(f); - DEBUG_ASSERT_EQ(m_base, from.m_base, "Invalid from key for distance"); - DEBUG_ASSERT_GE(m_offset, from.m_offset, "Invalid from key for distance"); - return m_offset - from.m_offset; + uint64_t this_val = (uint64_cast(m_base) << 32) | m_offset; + uint64_t from_val = (uint64_cast(from.m_base) << 32) | from.m_offset; + DEBUG_ASSERT_GE(this_val, from_val, "Invalid from key for distance"); + return static_cast< int >(this_val - from_val); } bool is_interval_key() const override { return true; } @@ -519,7 +531,8 @@ class TestIntervalValue : public BtreeIntervalValue { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base_val, m_offset); } + std::string to_string() const override { 
return fmt::format("{}", value()); } + uint64_t value() const { return (uint64_cast(m_base_val) << 16) | m_offset; } friend std::ostream& operator<<(std::ostream& os, const TestIntervalValue& v) { os << v.to_string(); diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index d97511961..9d883fd50 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -104,7 +104,7 @@ struct NodeTest : public testing::Test { } } - void put_range(uint32_t k, uint32_t count) { + void put_range(uint64_t k, uint32_t count) { btree_put_type put_type; if constexpr (!std::is_same_v< V, TestIntervalValue >) { // For non-interval values we support only update, so we need to first put the value @@ -374,6 +374,12 @@ TYPED_TEST(NodeTest, SimpleInsert) { this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, 0, std::numeric_limits< uint32_t >::max()); } +TYPED_TEST(NodeTest, RangeChangeInsert) { + if (this->m_node1->get_node_type() != btree_node_type::PREFIX) {return;} + this->put_range(0xFFFFFFFF - 10,20); + this->print(); +} + TYPED_TEST(NodeTest, ReverseInsert) { for (uint32_t i{100}; (i > 0 && this->has_room()); --i) { this->put(i - 1, btree_put_type::INSERT); diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 719698be8..35b44eeaf 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -109,15 +109,16 @@ class SequenceGenerator { OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - if(putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { - LOGDEBUG("All keys are in use, skipping operation generation. 
end_range_ {} start_range_ {} in_use_key_cnt_ {}, numOperations {}", + if (putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("All keys are in use, skipping operation generation. end_range_ {} start_range_ {} " + "in_use_key_cnt_ {}, numOperations {}", end_range_, start_range_, in_use_key_cnt_.load(), numOperations); - return operations; + return operations; } - if(removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { + if (removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { LOGDEBUG("Not enough keys are in use, skipping operation generation. in_use_key_cnt_ {} numOperations {}", in_use_key_cnt_.load(), numOperations); - return operations; + return operations; } while (operations.size() < numOperations) { @@ -524,7 +525,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void long_running_crash(long_running_crash_options const& crash_test_options) { // set putFreq 100 for the initial load - SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, crash_test_options.num_entries - 1 /*end_range*/); + SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, + crash_test_options.num_entries - 1 /*end_range*/); std::vector< std::string > flips; OperationList operations; @@ -549,9 +551,11 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); } else { operations = generator.generateOperations(crash_test_options.preload_size, true /* reset */); - if (crash_test_options.save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } + if (crash_test_options.save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); + } } - + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); 
uint32_t num_keys{0}; @@ -571,8 +575,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT // this->print_keys("reapply: after preload"); this->visualize_keys("tree_after_preload.dot"); - for (uint32_t round = 1; - round <= crash_test_options.rounds && !time_to_stop(); round++) { + for (uint32_t round = 1; round <= crash_test_options.rounds && !time_to_stop(); round++) { LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); bool print_time = false; elapsed_time = get_elapsed_time_sec(m_start_time); @@ -580,12 +583,13 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT if (crash_test_options.load_mode) { operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); } else { - operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, renew_btree_after_crash /* reset */); + operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, + renew_btree_after_crash /* reset */); if (crash_test_options.save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); } } - if(operations.empty()) { + if (operations.empty()) { LOGDEBUG("No operations generated, skipping round {}", round); continue; } @@ -612,7 +616,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT flips.emplace_back(flip); } auto log_str = fmt::format("Step 1-{}: Set flag", round); - for(auto const& f : flips) { + for (auto const& f : flips) { log_str += fmt::format(" {}", f); this->set_basic_flip(f, 1, 100); } @@ -622,14 +626,16 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT file.close(); } else { if (dis(g_re) <= flip_percentage) { - if(!crash_test_options.put_flips.empty()) { - flips.emplace_back(crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); + if 
(!crash_test_options.put_flips.empty()) { + flips.emplace_back( + crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); } - if(!crash_test_options.remove_flips.empty()) { - flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % crash_test_options.remove_flips.size()]); + if (!crash_test_options.remove_flips.empty()) { + flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % + crash_test_options.remove_flips.size()]); } auto log_str = fmt::format("Step 1-{}: Set flag", round); - for(auto const& f : flips) { + for (auto const& f : flips) { log_str += fmt::format(" {}", f); this->set_basic_flip(f, 1, 100); } @@ -655,12 +661,12 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT file.close(); } } - + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); - + for (auto [k, op] : operations) { if (op == OperationType::Remove) { - if(num_keys < 1) { + if (num_keys < 1) { // remove flips and continue for (auto const& flip : flips) { this->remove_flip(flip); @@ -707,11 +713,12 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT print_time = true; } if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " - "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", - round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, elapsed_time, this->m_run_time, - elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), crash_test_options.num_entries, - this->tree_key_count() * 100.0 / crash_test_options.num_entries); + LOGINFO( + "\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, 
elapsed_time, + this->m_run_time, elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), + crash_test_options.num_entries, this->tree_key_count() * 100.0 / crash_test_options.num_entries); } // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; @@ -725,7 +732,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT }; // Crash recovery can test one simple btree, since focus is not on btree test itself, but index recovery -using BtreeTypes = testing::Types< FixedLenBtree >; +using BtreeTypes = testing::Types< FixedLenBtree, PrefixIntervalBtree >; TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes); TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { @@ -844,7 +851,7 @@ TYPED_TEST(IndexCrashTest, long_running_put_remove_crash) { long_running_crash_options crash_test_options{ .put_freq = 50, .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", - "crash_flush_on_split_at_right_child"}, + "crash_flush_on_split_at_right_child"}, .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" /*, "crash_flush_on_freed_child"*/}, }; From 79cf8b99bfd0f21c87ffc97903a418188723dff6 Mon Sep 17 00:00:00 2001 From: Harihara Kadayam Date: Fri, 4 Apr 2025 13:02:42 -0700 Subject: [PATCH 127/130] COWBtree and Multi Index support. (#1) This PR has following big changes 1. Introduce multiple index support, so that homestore can actually have different types of Index stores. 2. Introduce a new Btree called CopyOnWrite Btree, instead of inplace btree where the btree pages are not written in place, but on different location, but maintain a map. 3. Make the public interfaces to be very concise (having a BtreeBase and put that in the implementation) 4. Simplified the btree apis 5. Used latest sisl 13.x with REGISTER_LOG_MODS 6. 
Added cow btree crash test, updated other tests to ensure pass --- conanfile.py | 2 +- src/CMakeLists.txt | 16 +- src/include/homestore/btree/btree.hpp | 222 +++++++ src/include/homestore/btree/btree_base.hpp | 10 +- src/include/homestore/btree/btree_store.h | 2 - .../homestore/btree/detail/btree_node.hpp | 6 + .../index/inplace_btree/inplace_btree_store.h | 557 ++++++++++++++++-- 7 files changed, 746 insertions(+), 69 deletions(-) diff --git a/conanfile.py b/conanfile.py index 11f1691dc..75cca63cb 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.4" + version = "5.1.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c032ed95d..486314736 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -58,21 +58,17 @@ set(HOMESTORE_OBJECTS $ $ $ + $ + $ $ $ lib/homestore.cpp lib/crc.cpp ) #target_link_libraries(homestore_objs ${COMMON_DEPS}) -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - add_library(homestore STATIC - ${HOMESTORE_OBJECTS} - ) -else() - add_library(homestore STATIC - ${HOMESTORE_OBJECTS} - ) -endif() +add_library(homestore STATIC + ${HOMESTORE_OBJECTS} +) target_compile_definitions (homestore PRIVATE LOG_MODS_V2_SUPPORT) -target_link_libraries(homestore PRIVATE ${COMMON_DEPS}) +target_link_libraries(homestore ${COMMON_DEPS}) diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index ee65a8d0d..f1ab347d1 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -40,15 +41,28 @@ using PutPaginateCookie = unique< BtreeRangePutRequest< K > >; template < typename K > using RemovePaginateCookie = unique< BtreeRangeRemoveRequest< K > >; +template < typename K > +using QueryPaginateCookie = unique< BtreeQueryRequest< K > >; +class 
BtreeStore; + +template < typename K > +using PutPaginateCookie = unique< BtreeRangePutRequest< K > >; + +template < typename K > +using RemovePaginateCookie = unique< BtreeRangeRemoveRequest< K > >; + template < typename K > using QueryPaginateCookie = unique< BtreeQueryRequest< K > >; template < typename K, typename V > class Btree : public BtreeBase { +class Btree : public BtreeBase { public: /////////////////////////////////////// All External APIs ///////////////////////////// Btree(BtreeConfig const& cfg, uuid_t uuid = uuid_t{}, uuid_t parent_uuid = uuid_t{}, uint32_t user_sb_size = 0); Btree(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb); + Btree(BtreeConfig const& cfg, uuid_t uuid = uuid_t{}, uuid_t parent_uuid = uuid_t{}, uint32_t user_sb_size = 0); + Btree(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb); virtual ~Btree(); // Destroy the entire btree from persistent and from memory. It is to be noted that all blocks are not destroyed at @@ -146,6 +160,108 @@ class Btree : public BtreeBase { // expected to call put_range_next() again. Failing to do so will result in memory leak. btree_status_t put_range_next(PutPaginateCookie< K >& cookie); + // @brief Gets the value associated with the specified key from the B-tree. + // + // @param key The key to search for. + // @param out_val A pointer to store the value associated with the key. (Should be non-nullptr) + // + // @return The status of the get operation. + btree_status_t get_one(BtreeKey const& key, BtreeValue* out_val); + // Destroy the entire btree from persistent and from memory. It is to be noted that all blocks are not destroyed at + // one go. For persistent btree, it might be a staged operation on multiple checkpoints. + folly::Future< folly::Unit > destroy() override; + + // @brief Inserts or updates a key-value pair in the B-tree. + // + // This function inserts a new key-value pair or updates an existing key-value pair in the B-tree + // based on the specified put type. 
Optionally, it can return the existing value and apply a filter + // callback before insertion. + // + // @param key The key to be inserted or updated. + // @param value The value to be associated with the key. + // @param put_type The type of put operation (e.g., insert, update, upsert). + // @param existing_val Optional pointer to store the existing value prior to update if the key already exists. + // @param filter_cb Optional callback function to apply a filter before insertion. If provided, before putting, if + // an existing key-value pair is found, the filter callback is called with the existing key, value and the new + // value. The callback could return "replace" in that case the existing value is replaced with the new value or it + // could return "keep" in that case key is not modified. + // + // @return The status of the put operation. + // + btree_status_t put_one(BtreeKey const& key, BtreeValue const& value, btree_put_type put_type, + BtreeValue* existing_val = nullptr, put_filter_cb_t filter_cb = nullptr); + + // @brief Inserts or updates a range of key-value pairs in the B-tree. + // + // This function inserts a new range of key-value pairs or updates existing key-value pairs in the B-tree + // based on the specified put type. Optionally, it can return the existing value and apply a filter + // callback before insertion. + // + // This is an unique function which can be used for multiple purpose based on the key type. + // + // Interval Key Behavior: + // If the key is an interval key (which means can next_key be obtained by doing prev_key + 1), for example an + // integer keys. If the input range is provided for an interval key, example [1, 50), then it will behave the + // following way + // 1. If the put_type is INSERT and if a specific key in the interval range is not present in the btree, then it + // will insert it. + // + // 2. 
If the put_type is UPSERT, then it will insert the keys within the range for which there is no entry in the + // btree. However for keys that exist, it will call the filter_cb(key, current_value, new_value) if provided and + // expects the callback to return the decision. The decision could be + // a. replace - replace the existing value with the new value. Note that the new_value will also be added the + // same offset as the key. So if key range is [1. 50) and if the key is 10, then the value will be added at 10th + // of the original value provided (of course the shifting of 10 can be avoided by the caller by supplying a + // BtreeValue override which simply doesn't add) + // + // b. remove - remove the key from the btree and don't add the new value. This feature is useful when we use the + // btree to maintain multiple versions of the key and when we write the new version of the key, we need to + // remove the older versions of the key along with this write operation. + // + // c. keep - keep the existing value as is and don't add the new value. + // + // 3. If the put_type is UPDATE, then it will only act on keys which already exist and the behavior is identical to + // upsert case above when the key is present. + // + // Non-Interval Key Behavior: + // If the key is not an interval key, then only put_type = UPDATE is supported. It will walk through the keys within + // the range and then do a filter_cb(key, current_value, new_value) if provided and expects the callback to return + // the decision. The decision could be + // a. replace - replace the existing value with the new value for that key. + // b. remove - remove the key from the btree and don't update the new value. + // c. keep - keep the existing value as is and don't modify the key to new value. + // In this non-interval key case, the range of keys are all updated with the same value. + // + // About batch size: + // The batch size is the number of keys that will be processed in one go. 
It will return with btree_status::has_more + // and the caller is expected to call put_range_next() method with the cookie passed to resume the next batch until + // it returns btree_status::success. It is to be noted that, the batch size is a best effort from the btree and at + // any iteration it could put between 1 to batch_size keys (it will at least put one_key and at most batch_size keys + // per iteration). + // + // @param inp_range The range of keys to insert, upsert or update + // @param put_type The type of put operation (e.g., insert, update, upsert). + // @param value The value to be associated with the key. Behavior is different for interval and non-interval keys + // (see above) + // @param batch_size The number of keys to process in one go. Default is to attempt to process all keys in one go. + // Please see the note above about the batch size. + // @param filter_cb Optional callback function to apply a filter before insertion. (See above for details) + // + // @return The status of the put operation and a cookie, if it returns btree_status::has_more, then the caller is + // expected to call put_range_next() + std::pair< btree_status_t, PutPaginateCookie< K > > + put_range(BtreeKeyRange< K >&& inp_range, btree_put_type put_type, BtreeValue const& value, + uint32_t batch_size = std::numeric_limits< uint32_t >::max(), put_filter_cb_t filter_cb = nullptr); + + // @brief Continuation of the put_range call for the next batch of keys. Calling this method without calling + // put_range first returns error. + // + // @param cookie The cookie returned by the put_range call + // + // @return The status of the put operation and a cookie, if it returns btree_status::has_more, then the caller is + // expected to call put_range_next() again. Failing to do so will result in memory leak. + btree_status_t put_range_next(PutPaginateCookie< K >& cookie); + // @brief Gets the value associated with the specified key from the B-tree. 
// // @param key The key to search for. @@ -198,6 +314,50 @@ class Btree : public BtreeBase { btree_status_t query_next(QueryPaginateCookie< K >& cookie, std::vector< std::pair< K, V > >& out_kvs); + // @brief Gets any one value associated with the given key range. If the key range matches multiple keys, then btree + // will randomly pick one key and return the value associated with it. + // + // @param inp_range The range of keys to search for. + // @param out_key A pointer to store the picked key of the entry found. (Should be non-nullptr) + // @param out_val A pointer to store the value associated with the picked key. (Should be non-nullptr) + // + // @return The status of the get_any operation. + btree_status_t get_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val); + + // @brief Removes the key-value pair associated with the specified key from the B-tree. + // + // @param key The key to remove. + // @param out_val An optional pointer to store the value associated with the key before removal. + // + // @return The status of the remove operation. + btree_status_t remove_one(BtreeKey const& key, BtreeValue* out_val); + + // @brief Removes any one key-value pair associated with the given key range. If the key range matches multiple + // keys, then btree will randomly pick one key and remove the key-value pair associated with it. + // + // @param inp_range The range of keys to search for. + // @param out_key A pointer to store the picked key within the range. (Should be non-nullptr). Valid only if return + // status is btree_status_t::success. + // @param out_val A pointer to store the value associated with the picked key. (Should be non-nullptr) Valid only if + // return status is btree_status_t::success. + // + // @return The status of the remove_any operation. 
+ btree_status_t remove_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val); + + std::pair< btree_status_t, RemovePaginateCookie< K > > + remove_range(BtreeKeyRange< K >&& inp_range, uint32_t batch_size = std::numeric_limits< uint32_t >::max(), + remove_filter_cb_t filter_cb = nullptr); + + btree_status_t remove_range_next(RemovePaginateCookie< K >& cookie); + + std::pair< btree_status_t, QueryPaginateCookie< K > > + query(BtreeKeyRange< K >&& inp_range, std::vector< std::pair< K, V > >& out_kvs, + uint32_t batch_size = std::numeric_limits< uint32_t >::max(), + BtreeQueryType query_type = BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, + get_filter_cb_t filter_cb = nullptr); + + btree_status_t query_next(QueryPaginateCookie< K >& cookie, std::vector< std::pair< K, V > >& out_kvs); + nlohmann::json get_status(int log_level) const; nlohmann::json get_metrics_in_json(bool updated); @@ -208,6 +368,17 @@ class Btree : public BtreeBase { std::string to_digraph_visualize_format() const; + void dump(const std::string& file, std::string format = "string", + BtreeNode::ToStringCallback< K, V > cb = nullptr) const; + + nlohmann::json get_metrics_in_json(bool updated); + + std::string to_string() const; + + std::string to_custom_string(BtreeNode::ToStringCallback< K, V > cb) const; + + std::string to_digraph_visualize_format() const; + void dump(const std::string& file, std::string format = "string", BtreeNode::ToStringCallback< K, V > cb = nullptr) const; @@ -215,6 +386,12 @@ class Btree : public BtreeBase { uint64_t count_keys(bnodeid_t start_bnodeid = empty_bnodeid) const; +private: + /////////////////////////////////// Mutate Impl methods ///////////////////////// + template < typename ReqT > + btree_status_t put(ReqT& put_req); + uint64_t count_keys(bnodeid_t start_bnodeid = empty_bnodeid) const; + private: /////////////////////////////////// Mutate Impl methods ///////////////////////// template < typename ReqT > @@ -240,12 +417,22 @@ class 
Btree : public BtreeBase { template < typename ReqT > btree_status_t get(ReqT& get_req); + template < typename ReqT > + btree_status_t do_get(const BtreeNodePtr& my_node, ReqT& greq); + K* out_split_key, CPContext* context); + + ///////////////////////////////// Get Impl Methods ///////////////////////////////// + template < typename ReqT > + btree_status_t get(ReqT& get_req); + template < typename ReqT > btree_status_t do_get(const BtreeNodePtr& my_node, ReqT& greq); + ///////////////////////////////// Remove Impl Methods ///////////////////////////////// ///////////////////////////////// Remove Impl Methods ///////////////////////////////// template < typename ReqT > btree_status_t remove(ReqT& rreq); + btree_status_t remove(ReqT& rreq); template < typename ReqT > btree_status_t do_remove(const BtreeNodePtr& my_node, locktype_t curlock, ReqT& rreq); @@ -253,16 +440,25 @@ class Btree : public BtreeBase { template < typename ReqT > btree_status_t check_collapse_root(ReqT& rreq); + template < typename ReqT > + btree_status_t check_collapse_root(ReqT& rreq); + btree_status_t merge_nodes(const BtreeNodePtr& parent_node, const BtreeNodePtr& leftmost_node, uint32_t start_indx, uint32_t end_indx, CPContext* context); + uint32_t end_indx, CPContext* context); + + ///////////////////////////////// Query Impl Methods ///////////////////////////////// + btree_status_t query(BtreeQueryRequest< K >& query_req, std::vector< std::pair< K, V > >& out_values); ///////////////////////////////// Query Impl Methods ///////////////////////////////// btree_status_t query(BtreeQueryRequest< K >& query_req, std::vector< std::pair< K, V > >& out_values); btree_status_t do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); + std::vector< std::pair< K, V > >& out_values); btree_status_t do_traversal_query(const BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); + std::vector< 
std::pair< K, V > >& out_values); #ifdef SERIALIZABLE_QUERY_IMPLEMENTATION btree_status_t do_serialzable_query(const BtreeNodePtr& my_node, BtreeSerializableQueryRequest& qreq, std::vector< std::pair< K, V > >& out_values); @@ -271,6 +467,32 @@ class Btree : public BtreeBase { std::vector< std::pair< K, V > >& out_values); #endif +private: + /////////////////////////////// Internal Node Management Methods //////////////////////////////////// + BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, + uint32_t ctx_size) const override; + + /////////////////////////////////// Helper Methods /////////////////////////////////////// + btree_status_t post_order_traversal(locktype_t acq_lock, const auto& cb); + btree_status_t post_order_traversal(const BtreeNodePtr& node, locktype_t acq_lock, const auto& cb); + void get_all_kvs(std::vector< std::pair< K, V > >& kvs) const; + uint64_t get_btree_node_cnt() const; + uint64_t get_child_node_cnt(bnodeid_t bnodeid) const; + void to_string_internal(bnodeid_t bnodeid, std::string& buf) const; + void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, + BtreeNode::ToStringCallback< K, V > const& cb) const; + void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, + std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; + void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; + void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; + void print_node(const bnodeid_t& bnodeid) const; + + void append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event, uint32_t start_idx = 0, + uint32_t end_idx = 0) const; + +protected: + mutable iomgr::FiberManagerLib::shared_mutex m_btree_lock; + std::atomic< bool > m_destroyed{false}; private: /////////////////////////////// Internal Node Management Methods //////////////////////////////////// // BtreeNode* 
init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, diff --git a/src/include/homestore/btree/btree_base.hpp b/src/include/homestore/btree/btree_base.hpp index 937a5ed46..6fa04733b 100644 --- a/src/include/homestore/btree/btree_base.hpp +++ b/src/include/homestore/btree/btree_base.hpp @@ -27,7 +27,7 @@ class UnderlyingBtree { // Btree based implementations superblock area struct BtreeSuperBlock { static constexpr size_t underlying_btree_sb_size = - IndexSuperBlock::index_impl_sb_size - sizeof(bnodeid_t) - sizeof(uint64_t) - sizeof(uint32_t); + IndexSuperBlock::index_impl_sb_size - sizeof(bnodeid_t) - sizeof(uint32_t); bnodeid_t root_node_id{empty_bnodeid}; // Btree Root Node ID uint64_t root_link_version{0}; @@ -80,11 +80,8 @@ class BtreeBase : public Index { return const_cast< BtreeSuperBlock& >(s_cast< const BtreeBase* >(this)->bt_super_blk()); } - virtual BtreeNodePtr new_node(bnodeid_t id, bool is_leaf, BtreeNode::Allocator::Token token) const = 0; - virtual BtreeNodePtr load_node(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) const = 0; - - // virtual BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, - // BtreeNode::Allocator::Token token) const = 0; + virtual BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, + uint32_t ctx_size) const = 0; uint64_t space_occupied() const override; uint32_t ordinal() const override; @@ -104,7 +101,6 @@ class BtreeBase : public Index { protected: virtual btree_status_t create_root_node(); - virtual BtreeNodePtr clone_temp_node(BtreeNode const& node); virtual btree_status_t read_and_lock_node(bnodeid_t id, BtreeNodePtr& node_ptr, locktype_t int_lock_type, locktype_t leaf_lock_type, CPContext* context) const; virtual btree_status_t get_child_and_lock_node(const BtreeNodePtr& node, uint32_t index, BtreeLinkInfo& child_info, diff --git a/src/include/homestore/btree/btree_store.h b/src/include/homestore/btree/btree_store.h index 
1c1b349c3..9864e8d78 100644 --- a/src/include/homestore/btree/btree_store.h +++ b/src/include/homestore/btree/btree_store.h @@ -19,11 +19,9 @@ class BtreeStore : public IndexStore { virtual unique< UnderlyingBtree > create_underlying_btree(BtreeBase& btree, bool load_existing) = 0; virtual folly::Future< folly::Unit > destroy_underlying_btree(BtreeBase& btree) = 0; -#if 0 // Called whenever a particular btree node has been freed. The underlying implementation could use this oppurtunity // to free any contexts stored for this node. virtual void on_node_freed(BtreeNode* node) = 0; -#endif // When a particular btree is to be destroyed, some stores can support fast destroy mechanism, where all the btree // nodes can be freed in one go (in a single Checkpoint) without merging the tree and collapsing the tree. This diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 8bf83966c..9fb0a8f2c 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -26,6 +26,12 @@ #include #include +#ifndef TEST_BNODE_ONLY +#include +#include +#include +#endif + namespace homestore { ENUM(locktype_t, uint8_t, NONE, READ, WRITE) diff --git a/src/lib/index/inplace_btree/inplace_btree_store.h b/src/lib/index/inplace_btree/inplace_btree_store.h index 2fe2e7c7f..b7ced1b2f 100644 --- a/src/lib/index/inplace_btree/inplace_btree_store.h +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -59,7 +59,28 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { superblk< index_table_sb > m_sb; shared< MetaIndexBuffer > m_sb_buffer; + // graceful shutdown +private: + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void 
incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } + public: + void stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{"index"} { // Create a superblk for the index table and create MetaIndexBuffer corresponding to that @@ -100,10 +121,20 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - void destroy() override { + void audit_tree() override { + cp_mgr().cp_guard(); + Btree< K, V >::sanity_sub_tree(); + } + + btree_status_t destroy() override { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto cpg = cp_mgr().cp_guard(); Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); + m_sb_buffer->m_valid = false; + decr_pending_request_num(); + return btree_status_t::success; } uuid_t uuid() const override { return m_sb->uuid; } @@ -115,6 +146,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { template < typename ReqT > btree_status_t put(ReqT& put_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -122,11 +155,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::put(put_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); return ret; } template < typename ReqT > btree_status_t remove(ReqT& remove_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -134,6 +170,16 @@ class 
IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::remove(remove_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); + return ret; + } + + template < typename ReqT > + btree_status_t get(ReqT& greq) const { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); + auto ret = Btree< K, V >::get(greq); + decr_pending_request_num(); return ret; } @@ -164,9 +210,22 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } + void delete_stale_children(IndexBufferPtr const& idx_buf) override { + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto cpg = cp_mgr().cp_guard(); + idx_buf->m_dirtied_cp_id = cpg->id(); + BtreeNodePtr bn = BtreeNodePtr{n}; + + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "delete_stale_links cp={} buf={}", cpg->id(), idx_buf->to_string()); + delete_stale_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } + } + } + void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { - // We cannot repair the meta buf on its own, we need to repair the root node which modifies the // meta_buf. It is ok to ignore this call, because repair will be done from root before meta_buf is // attempted to repair, which would have updated the meta_buf already. 
LOGTRACEMOD(wbcache, "Ignoring repair on meta buf {} root id {} ", idx_buf->to_string(), @@ -213,6 +272,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { node->set_checksum(); auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY); + idx_node->m_idx_buf->m_node_level = node->level(); if (prev_state == index_buf_state_t::CLEAN) { // It was clean before, dirtying it first time, add it to the wb_cache list to flush if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) { @@ -226,9 +286,9 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { (int)prev_state, (int)index_buf_state_t::FLUSHING, "Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}", cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id); + BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), } return btree_status_t::success; - } btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes, const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, @@ -277,14 +337,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void free_node_impl(const BtreeNodePtr& node, void* context) override { auto n = static_cast< IndexBtreeNode* >(node.get()); + n->m_idx_buf->m_node_level = node->level(); wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ // return btree_status_t::success;} - LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, - new_root->node_id()); + LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); @@ -298,11 +358,111 @@ class IndexTable : public 
IndexTableBase, public Btree< K, V > { return btree_status_t::success; } + btree_status_t delete_stale_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "deleting stale links for parent node [{}]", parent_node->to_string()); + BtreeNodeList free_nodes; + auto nentries = parent_node->total_entries(); + uint32_t deleted = 0; + for (uint32_t i = nentries; i-- > 0;) { + BtreeLinkInfo cur_child_info; + BtreeNodePtr child_node; + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), child_node); ret == btree_status_t::success) { + if (child_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale child node [{}] for parent node [{}]", child_node->to_string(), + parent_node->to_string()); + child_node->set_node_deleted(); + free_node_impl(child_node, cp_ctx); + + if (i > 0) { + BtreeLinkInfo pre_child_info; + parent_node->get_nth_value(i - 1, &pre_child_info, false /* copy */); + // auto ckey = parent_node->get_nth_key< K >(i-1, true); + // parent_node->set_nth_key(i-1, ckey); + parent_node->update(i, pre_child_info); + parent_node->remove(i - 1); + } else { + parent_node->remove(i); + } + + LOGTRACEMOD(wbcache, "so far parent node [{}]", parent_node->to_string()); + // free_nodes.push_back(child_node); + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + cur_child_info.bnode_id(), parent_node->to_string(), ret); + } + } + if (parent_node->has_valid_edge()) { + auto edge_info = parent_node->get_edge_value(); + BtreeNodePtr edge_node; + if (auto ret = read_node_impl(edge_info.bnode_id(), edge_node); ret == btree_status_t::success) { + if (edge_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale edge node [{}] for parent node [{}]", edge_node->to_string(), + parent_node->to_string()); + edge_node->set_node_deleted(); + free_node_impl(edge_node, cp_ctx); + if (parent_node->total_entries() == 
0) { + parent_node->invalidate_edge(); + } else { + BtreeLinkInfo last_child_info; + parent_node->get_nth_value(parent_node->total_entries() - 1, &last_child_info, + false /* copy */); + parent_node->set_edge_value(last_child_info); + parent_node->remove(parent_node->total_entries() - 1); + LOGTRACEMOD(wbcache, "Replacing edge with previous child node [{}] for parent node [{}]", + last_child_info.bnode_id(), parent_node->to_string()); + } + + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read edge node {} for parent node [{}] reason {}", + edge_node->to_string(), parent_node->to_string(), ret); + } + } + if (deleted /*free_nodes.size()*/) { + btree_status_t ret = btree_status_t::success; + + if ((parent_node->total_entries() == 0) && !parent_node->has_valid_edge()) { + parent_node->set_node_deleted(); + LOGTRACEMOD(wbcache, + "Freeing parent node=[{}] because it is empty and not an edge node but had stale children", + parent_node->to_string()); + ret = write_node_impl(parent_node, cp_ctx); + free_node_impl(parent_node, cp_ctx); + LOGTRACEMOD(wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } else { + ret = write_node_impl(parent_node, cp_ctx); + if (ret != btree_status_t::success) { + LOGTRACEMOD(wbcache, "Failed to write parent node [{}] after deleting stale links", + parent_node->to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } + } + // auto ret = transact_nodes({}, free_nodes, parent_node, nullptr, cp_ctx); + return ret; + } else { + LOGTRACEMOD(wbcache, "Accomplishing deleting stale links. 
No stale links found for parent node [{}]", + parent_node->to_string()); + } + return btree_status_t::success; + } + + // btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string()); - // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs - // to be handled. Get the last key in the node - auto const last_parent_key = parent_node->get_last_key< K >(); + LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this + // needs to be handled. Get the last key in the node + + auto last_parent_key = parent_node->get_last_key< K >(); auto const is_parent_edge_node = parent_node->has_valid_edge(); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", @@ -311,18 +471,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Get all original child ids as a support to check if we are beyond the last child node - std::set< bnodeid_t > orig_child_ids; + std::unordered_map< bnodeid_t, K > orig_child_infos; for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { BtreeLinkInfo link_info; parent_node->get_nth_value(i, &link_info, true); - orig_child_ids.insert(link_info.bnode_id()); + orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); } - BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), - last_parent_key.to_string()); + LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), + last_parent_key.to_string()); // Get the first child node and its link info BtreeLinkInfo child_info; BtreeNodePtr child_node; + BtreeNodePtr pre_child_node; auto ret = 
this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); if (ret != btree_status_t::success) { @@ -331,12 +492,125 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return ret; } + // update the last key of parent for issue + // start from first child and store the last key of the child node, then traverse to next sibling + // 2-1- if this is greater than parent last key, traverse for sibling of parent until reaches to + // siblings which has keys more than Y or end of list (name this parent sibling node F), + // 2-2- Put last key of F to last key of P + // 2-3 - set F as Next of A + BtreeNodeList siblings; + BtreeNodePtr next_cur_child; + BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), + "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); + if (parent_node->total_entries() > 0) { + auto updated_last_key = last_parent_key; + K last_child_last_key; + K last_child_neighbor_key; + BtreeNodePtr cur_child; + BtreeLinkInfo cur_child_info; + + bool found_child = false; + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; + + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted() && cur_child->total_entries()) { + last_child_last_key = cur_child->get_last_key< K >(); + if (cur_child->next_bnode() != empty_bnodeid && + read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { + LOGTRACEMOD( + wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", + last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); + found_child = true; + break; + } + found_child = true; + break; + } + LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", + cur_child->to_string()); + } + } + + if (found_child) { + LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", + last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); + if (last_child_last_key.compare(last_parent_key) > 0) { + if (next_cur_child) { + last_child_neighbor_key = next_cur_child->get_last_key< K >(); + LOGTRACEMOD(wbcache, + "Voila !! 
last child_key of child [{}] is greater than its parents [{}] and its " + "next neighbor key is {}", + cur_child->to_string(), parent_node->to_string(), + last_child_neighbor_key.to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Last child_key of child [{}] is greater than its parents [{}] and it has no next neighbor", + cur_child->to_string(), parent_node->to_string()); + } + + // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 + // or end + // of list (put all siblings in a list, here is F) , + BtreeNodePtr sibling; + BtreeNodePtr true_sibling; + BtreeLinkInfo sibling_info; + + auto sibling_node_id = parent_node->next_bnode(); + while (sibling_node_id != empty_bnodeid) { + if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { + if (sibling->is_node_deleted()) { + // Do we need to free the sibling node here? + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", + sibling->to_string()); + continue; + } + auto sibling_last_key = sibling->get_last_key< K >(); + if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + } else { + true_sibling = sibling; + break; + } + } + } + if (true_sibling) { + LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), + parent_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); + } + if (sibling_node_id != empty_bnodeid) { + last_parent_key = last_child_last_key; + parent_node->set_next_bnode(true_sibling->node_id()); + for (auto sibling : siblings) { + LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + } + LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); + BtreeLinkInfo first_child_info; + 
parent_node->get_nth_value(0, &first_child_info, false); + } + } else { + LOGTRACEMOD(wbcache, + "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", + parent_node->to_string()); + next_cur_child = nullptr; + } + } + } + // Keep a copy of the node buffer, in case we need to revert back uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; - std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); // Remove all the entries in parent_node and let walk across child_nodes rebuild this node - parent_node->remove_all(); + parent_node->remove_all(this->m_bt_cfg); // Walk across all child nodes until it gets the last_parent_key and keep fixing them. auto cur_parent = parent_node; @@ -350,35 +624,101 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); cur_parent->remove(cur_parent->total_entries() - 1); cur_parent->set_edge_value(prev_val); - BT_LOG(INFO, "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", - cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + LOGTRACEMOD(wbcache, + "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); } else { - BT_LOG(INFO, "Found an empty interior node {} with maybe all childs deleted", - cur_parent->node_id()); + LOGTRACEMOD(wbcache, "Found an empty interior node {} with maybe all childs deleted", + cur_parent->node_id()); } } else { // Update edge and finish - BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(), - child_node->to_string()); - child_node->set_next_bnode(empty_bnodeid); - write_node_impl(child_node, cp_ctx); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (is_parent_edge_node) { + 
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + auto tsib_id = find_true_sibling(cur_parent); + if (tsib_id != empty_bnodeid) { + cur_parent->set_next_bnode(tsib_id); + LOGTRACEMOD(wbcache, + "True sibling [{}] for parent_node [{}], So don't add child [{}] here ", + tsib_id, cur_parent->to_string(), child_node->to_string()); + } else { + cur_parent->set_next_bnode(empty_bnodeid); + // if this child node previously belonged to this parent node, we need to add it but as edge + // o.w, not this node + if (orig_child_infos.contains(child_node->node_id())) { + cur_parent->set_edge_value( + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + LOGTRACEMOD(wbcache, + "Child node [{}] is an edge node and previously belong to this parent, so " + "we need to add it as edge", + child_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + cur_parent->to_string()); + } + BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), + "Parent node [{}] cannot be empty", cur_parent->to_string()); + } + } + + // + // } + break; } break; } - auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), - child_node->to_string(), child_last_key.to_string()); + auto child_last_key = child_node->get_last_key< K >(); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), + child_node->to_string(), child_last_key.to_string()); // Check if we are beyond the last child node. // - // There can be cases where the child level merge is successfully persisted but the parent level is not. - // In this case, you may have your rightmost child node with last key greater than the last_parent_key. - // That's why here we have to check if the child node is one of the original child nodes first. 
- if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) { - if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) { + // There can be cases where the child level merge is successfully persisted but the parent level is + // not. In this case, you may have your rightmost child node with last key greater than the + // last_parent_key. That's why here we have to check if the child node is one of the original child + // nodes first. + if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { + if (child_last_key.compare(last_parent_key) > 0) { // We have reached a child beyond this parent, we can stop now + // TODO this case if child last key is less than last parent key to update the parent node. + // this case can potentially break the btree for put and remove op. + break; + } + if (child_node->total_entries() == 0) { + // this child has no entries, but maybe in the middle of the parent node, we need to update the key + // of parent as previous one and go on + LOGTRACEMOD(wbcache, + "Reach to an empty child node {}, and this child doesn't belong to this parent; Hence " + "loop ends", + child_node->to_string()); + // now update the next of parent node by skipping all deleted siblings of this parent node + auto valid_sibling = cur_parent->next_bnode(); + while (valid_sibling != empty_bnodeid) { + BtreeNodePtr sibling; + if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) { + if (sibling->is_node_deleted()) { + valid_sibling = sibling->next_bnode(); + continue; + } + // cur_parent->set_next_bnode(sibling->node_id()); + break; + } + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + valid_sibling, cur_parent->to_string(), ret); + } + if (valid_sibling != empty_bnodeid) { + cur_parent->set_next_bnode(valid_sibling); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), 
child_node->to_string()); + + } else { + cur_parent->set_next_bnode(empty_bnodeid); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + } + break; } } @@ -403,26 +743,61 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // Insert the last key of the child node into parent node if (!child_node->is_node_deleted()) { - cur_parent->insert(cur_parent->total_entries(), - child_node->total_entries() > 0 ? child_last_key : last_parent_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); if (child_node->total_entries() == 0) { - // There should be at most one empty child node per parent - if we find one, we should stop here - BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(), - child_node->to_string()); - break; + if (orig_child_infos.contains(child_node->node_id())) { + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}], but not the end of the parent node, so we need " + "to update the key of parent node as original one {}", + child_node->to_string(), child_last_key.to_string()); + } else { + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}] but not belonging to this parent (probably next " + "parent sibling); Hence end loop", + child_node->to_string()); + break; + } } + cur_parent->insert(cur_parent->total_entries(), child_last_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); } else { // Node deleted indicates it's freed & no longer used during recovery - BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert", - cur_parent->node_id(), child_node->to_string()); + LOGTRACEMOD(wbcache, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + if (pre_child_node) { + // We need to update the next of the previous child 
node to this child node + + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] is deleted, set next of previous child node [{}] " + "to this child node [{}]", + cur_parent->node_id(), child_node->to_string(), pre_child_node->to_string(), + child_node->next_bnode()); + pre_child_node->set_next_bnode(child_node->next_bnode()); + // repairing the next of previous child node + // We need to set the state of the previous child node to clean, so that it can be flushed + IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get()); + idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + write_node_impl(pre_child_node, cp_ctx); + // update the key of last entry of the parent with the last key of deleted child + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(), + child_last_key.to_string()); + // update it here to go to the next child node and unlock this node + LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node"); + child_node->set_next_bnode(child_node->next_bnode()); + } } - BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string()); + LOGTRACEMOD(wbcache, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), + cur_parent->to_string()); // Move to the next child node auto const next_node_id = child_node->next_bnode(); this->unlock_node(child_node, locktype_t::READ); + if (!child_node->is_node_deleted()) { + // We need to free the child node + pre_child_node = child_node; + } if (next_node_id == empty_bnodeid) { // This can be a deleted edge node - only check if it is still valid if (!child_node->is_node_deleted()) { @@ -435,7 +810,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { child_node = nullptr; break; } - + if (next_cur_child && next_node_id == next_cur_child->node_id()) { + // We are at the last child node, we can stop 
now + LOGTRACEMOD( + wbcache, + "REACH Repairing node={}, child_node=[{}] is the true child of sibling parent; Hence, end loop", + child_node->node_id(), next_cur_child->to_string()); + child_node = nullptr; + break; + } ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); if (ret != btree_status_t::success) { BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", @@ -443,18 +826,51 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { child_node = nullptr; break; } + } while (true); if (child_node) { this->unlock_node(child_node, locktype_t::READ); } - - if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { - // We shouldn't have an empty interior node in the tree, let's delete it. - // The buf will be released by the caller - BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id()); - parent_node->set_node_deleted(); + // if last parent has the key less than the last child key, then we need to update the parent node with + // the last child key if it doesn't have edge. 
+ auto last_parent = parent_node; + if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } + if (last_parent->total_entries() && !last_parent->has_valid_edge()) { + if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { + BtreeLinkInfo child_info; + last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); + parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); + LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", + parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); + } + // if last key of children is less than the last key of parent, then we need to update the last key of non + // interior child + if (last_parent->level() > 1 && !last_parent->has_valid_edge()) { + // read last child + BtreeNodePtr last_child; + BtreeLinkInfo child_info; + auto total_entries = last_parent->total_entries(); + last_parent->get_nth_value(total_entries - 1, &child_info, false /* copy */); + if (ret = read_node_impl(child_info.bnode_id(), last_child); ret == btree_status_t::success) { + // get last key of cur child + auto last_child_key = last_child->get_last_key< K >(); + BtreeLinkInfo last_child_info; + last_child->get_nth_value(last_child->total_entries() - 1, &last_child_info, false /* copy*/); + if (last_parent->compare_nth_key(last_child_key, total_entries - 1) > 0) { + auto cur_child_st = last_child->to_string(); + last_child->update(last_child->total_entries() - 1, last_parent_key, last_child_info); + LOGTRACEMOD(wbcache, + "Updating interior child node={} with last_parent_key={} and child_info={}", + cur_child_st, last_parent_key.to_string(), last_child_info.to_string()); + write_node_impl(last_child, cp_ctx); + } + } + } } if (ret == btree_status_t::success) { + // Make write_buf happy for the parent node in case of multiple write (stale repair and link repair) + 
IndexBtreeNode* p_node = static_cast< IndexBtreeNode* >(parent_node.get()); + p_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); } @@ -467,6 +883,49 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { delete[] tmp_buffer; return ret; } + + bnodeid_t find_true_sibling(BtreeNodePtr const& node) { + if (node == nullptr) return empty_bnodeid; + bnodeid_t sibling_id = empty_bnodeid; + if (node->has_valid_edge()) { + sibling_id = node->get_edge_value().bnode_id(); + } else { + sibling_id = node->next_bnode(); + } + if (sibling_id == empty_bnodeid) { + return empty_bnodeid; + } else { + BtreeNodePtr sibling_node; + if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; } + + if (sibling_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", + sibling_node->to_string(), node->to_string()); + return find_true_sibling(sibling_node); + } else { + return sibling_id; + } + } + return sibling_id; + } + + K get_last_true_child_key(BtreeNodePtr const& parent_node) { + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; + BtreeLinkInfo cur_child_info; + BtreeNodePtr cur_child; + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted()) { + if (cur_child->total_entries()) { + return cur_child->get_last_key< K >(); + } else { + LOGTRACEMOD(wbcache, "Last valid child {} has no entries", cur_child->to_string()); + } + } + } + } + } }; } // namespace homestore From baf4ef2d07c49633024e65497150a6b62b1eb73e Mon Sep 17 00:00:00 2001 From: Harihara Kadayam Date: Tue, 27 May 2025 08:44:53 -0700 Subject: [PATCH 128/130] Improvements2 (#4) This PR has following big changes * COWBtree recovery test cases with variable cps and fixes * Added cow btree crash test, updated other tests to ensure pass * Btree Node allocators and variants * Multiple Btreenode fixes --- .github/workflows/build_commit.yml | 3 +++ .github/workflows/build_dependencies.yml | 19 ++++++++++--------- conanfile.py | 2 +- src/include/homestore/btree/btree.hpp | 6 ++++-- src/include/homestore/btree/btree_base.hpp | 8 ++++++-- src/include/homestore/btree/btree_store.h | 2 ++ .../homestore/btree/detail/btree_node.hpp | 6 ------ .../index/inplace_btree/inplace_btree_store.h | 2 +- 8 files changed, 27 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build_commit.yml b/.github/workflows/build_commit.yml index 8f959775b..9de4ca838 100644 --- a/.github/workflows/build_commit.yml +++ b/.github/workflows/build_commit.yml @@ -18,6 +18,9 @@ on: tooling: required: true type: string + build-level: + required: true + type: string jobs: HomestorePRBuild: diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 4c92419ec..5638d641c 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -23,10 +23,10 @@ on: required: false type: string 
default: 'None' - testing: + build-level: required: false type: string - default: 'False' + default: 'Packaging' workflow_dispatch: inputs: platform: @@ -71,14 +71,15 @@ on: - 'Coverage' - 'None' default: 'None' - testing: - description: 'Build and Run' + build-level: + description: 'Level to build' required: true type: choice options: - - 'True' - - 'False' - default: 'True' + - 'Dependencies' + - 'Packaging' + - 'Testing' + default: 'Testing' jobs: BuildHomestore: @@ -97,14 +98,14 @@ jobs: uses: actions/checkout@main with: ref: ${{ inputs.branch }} - if: ${{ inputs.testing == 'True' }} + if: ${{ inputs.build-level == 'Testing' }} - name: Retrieve Recipe uses: actions/checkout@main with: repository: hkadayam/Homestore ref: ${{ inputs.branch }} - if: ${{ inputs.testing == 'False' }} + if: ${{ inputs.build-level != 'Testing' }} - name: Restore Dependency Cache id: restore-cache diff --git a/conanfile.py b/conanfile.py index 75cca63cb..457731905 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "5.1.1" + version = "5.2.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index f1ab347d1..f10754a2a 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -469,8 +469,10 @@ class Btree : public BtreeBase { private: /////////////////////////////// Internal Node Management Methods //////////////////////////////////// - BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, - uint32_t ctx_size) const override; + // BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, + // BtreeNode::Allocator::Token token) const override; + virtual BtreeNodePtr new_node(bnodeid_t id, bool is_leaf, BtreeNode::Allocator::Token token) const override; + virtual BtreeNodePtr load_node(uint8_t* 
node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) const override; /////////////////////////////////// Helper Methods /////////////////////////////////////// btree_status_t post_order_traversal(locktype_t acq_lock, const auto& cb); diff --git a/src/include/homestore/btree/btree_base.hpp b/src/include/homestore/btree/btree_base.hpp index 6fa04733b..95b0ee043 100644 --- a/src/include/homestore/btree/btree_base.hpp +++ b/src/include/homestore/btree/btree_base.hpp @@ -80,8 +80,11 @@ class BtreeBase : public Index { return const_cast< BtreeSuperBlock& >(s_cast< const BtreeBase* >(this)->bt_super_blk()); } - virtual BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, - uint32_t ctx_size) const = 0; + virtual BtreeNodePtr new_node(bnodeid_t id, bool is_leaf, BtreeNode::Allocator::Token token) const = 0; + virtual BtreeNodePtr load_node(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) const = 0; + + // virtual BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, + // BtreeNode::Allocator::Token token) const = 0; uint64_t space_occupied() const override; uint32_t ordinal() const override; @@ -101,6 +104,7 @@ class BtreeBase : public Index { protected: virtual btree_status_t create_root_node(); + virtual BtreeNodePtr clone_temp_node(BtreeNode const& node); virtual btree_status_t read_and_lock_node(bnodeid_t id, BtreeNodePtr& node_ptr, locktype_t int_lock_type, locktype_t leaf_lock_type, CPContext* context) const; virtual btree_status_t get_child_and_lock_node(const BtreeNodePtr& node, uint32_t index, BtreeLinkInfo& child_info, diff --git a/src/include/homestore/btree/btree_store.h b/src/include/homestore/btree/btree_store.h index 9864e8d78..1c1b349c3 100644 --- a/src/include/homestore/btree/btree_store.h +++ b/src/include/homestore/btree/btree_store.h @@ -19,9 +19,11 @@ class BtreeStore : public IndexStore { virtual unique< UnderlyingBtree > create_underlying_btree(BtreeBase& btree, bool 
load_existing) = 0; virtual folly::Future< folly::Unit > destroy_underlying_btree(BtreeBase& btree) = 0; +#if 0 // Called whenever a particular btree node has been freed. The underlying implementation could use this oppurtunity // to free any contexts stored for this node. virtual void on_node_freed(BtreeNode* node) = 0; +#endif // When a particular btree is to be destroyed, some stores can support fast destroy mechanism, where all the btree // nodes can be freed in one go (in a single Checkpoint) without merging the tree and collapsing the tree. This diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 9fb0a8f2c..8bf83966c 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -26,12 +26,6 @@ #include #include -#ifndef TEST_BNODE_ONLY -#include -#include -#include -#endif - namespace homestore { ENUM(locktype_t, uint8_t, NONE, READ, WRITE) diff --git a/src/lib/index/inplace_btree/inplace_btree_store.h b/src/lib/index/inplace_btree/inplace_btree_store.h index b7ced1b2f..63e141bda 100644 --- a/src/lib/index/inplace_btree/inplace_btree_store.h +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -610,7 +610,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; // Remove all the entries in parent_node and let walk across child_nodes rebuild this node - parent_node->remove_all(this->m_bt_cfg); + parent_node->remove_all(); // Walk across all child nodes until it gets the last_parent_key and keep fixing them. 
auto cur_parent = parent_node; From 8f1649c2709d742abe7c03ce644432c0dfa5ff67 Mon Sep 17 00:00:00 2001 From: Harihara Kadayam Date: Tue, 27 May 2025 09:14:32 -0700 Subject: [PATCH 129/130] Update build_commit.yml to do merge build --- .github/workflows/build_commit.yml | 3 --- .github/workflows/build_dependencies.yml | 21 +++++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build_commit.yml b/.github/workflows/build_commit.yml index 9de4ca838..8f959775b 100644 --- a/.github/workflows/build_commit.yml +++ b/.github/workflows/build_commit.yml @@ -18,9 +18,6 @@ on: tooling: required: true type: string - build-level: - required: true - type: string jobs: HomestorePRBuild: diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 5638d641c..12d2093de 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -23,10 +23,10 @@ on: required: false type: string default: 'None' - build-level: + testing: required: false type: string - default: 'Packaging' + default: 'False' workflow_dispatch: inputs: platform: @@ -71,15 +71,14 @@ on: - 'Coverage' - 'None' default: 'None' - build-level: - description: 'Level to build' + testing: + description: 'Build and Run' required: true type: choice options: - - 'Dependencies' - - 'Packaging' - - 'Testing' - default: 'Testing' + - 'True' + - 'False' + default: 'True' jobs: BuildHomestore: @@ -98,14 +97,14 @@ jobs: uses: actions/checkout@main with: ref: ${{ inputs.branch }} - if: ${{ inputs.build-level == 'Testing' }} + if: ${{ inputs.testing == 'True' }} - name: Retrieve Recipe uses: actions/checkout@main with: repository: hkadayam/Homestore ref: ${{ inputs.branch }} - if: ${{ inputs.build-level != 'Testing' }} + if: ${{ inputs.testing == 'False' }} - name: Restore Dependency Cache id: restore-cache @@ -212,6 +211,8 @@ jobs: - name: Code Coverage Run run: | + du -sh ~/.conan2/p/* + df -h conan build \ 
-o "sisl/*:prerelease=${{ inputs.prerelease }}" \ -o "sisl/*:malloc_impl=${{ inputs.malloc-impl }}" \ From 14156c0660071a8723f417401f066bd2a3307794 Mon Sep 17 00:00:00 2001 From: Hari Kadayam Date: Thu, 7 Aug 2025 15:43:32 -0700 Subject: [PATCH 130/130] Merge with main fork and also made replication as an optional support --- CMakeLists.txt | 12 + cmake/test_mode.cmake | 6 + conanfile.py | 11 +- src/CMakeLists.txt | 35 ++- src/include/homestore/blk.h | 14 +- src/include/homestore/blkdata_service.hpp | 9 - src/include/homestore/btree/btree.hpp | 226 +----------------- src/include/homestore/btree/btree_base.hpp | 2 +- .../btree/node_variant/prefix_node.hpp | 121 ++-------- .../btree/node_variant/variant_node.hpp | 1 - src/include/homestore/homestore.hpp | 6 + src/include/homestore/replication/repl_dev.h | 2 + src/lib/blkdata_svc/blkdata_service.cpp | 24 +- src/lib/common/resource_mgr.cpp | 6 +- src/lib/device/journal_vdev.cpp | 2 - src/lib/homestore.cpp | 10 + src/lib/logstore/log_dev.cpp | 2 + src/lib/logstore/log_store_service.cpp | 32 ++- .../repl_dev/raft_state_machine.cpp | 2 +- .../replication/repl_dev/solo_repl_dev.cpp | 40 ++-- .../replication/service/generic_repl_svc.cpp | 11 +- .../replication/service/raft_repl_service.cpp | 24 +- src/tests/CMakeLists.txt | 55 +++-- src/tests/test_btree_node.cpp | 12 +- .../test_common/homestore_test_common.hpp | 2 + 25 files changed, 214 insertions(+), 453 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e90a498b..728a2bdbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,18 @@ endif () add_flags("-DPACKAGE_NAME=\\\"${PROJECT_NAME}\\\"") add_flags("-DPACKAGE_VERSION=\\\"${PACKAGE_REVISION}\\\"") +# add replication flag +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + add_flags("-DREPLICATION_SUPPORT") + message(STATUS "Building with REPLICATION enabled") + else() + message(STATUS "Building with REPLICATION disabled") + endif() +else() + message(STATUS "Building with REPLICATION 
disabled") +endif() + if(UNIX) # enable proper pread/pwrite and large file add_flags("-D_POSIX_C_SOURCE=200809L -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE") diff --git a/cmake/test_mode.cmake b/cmake/test_mode.cmake index 486186bd5..4195a68b1 100644 --- a/cmake/test_mode.cmake +++ b/cmake/test_mode.cmake @@ -39,6 +39,9 @@ if (DEFINED TEST_TARGET) set(${ret} true) endif() endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() else() macro(can_build_io_tests ret) set(${ret} false) @@ -55,4 +58,7 @@ else() macro(can_build_epoll_io_tests ret) set(${ret} false) endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() endif() diff --git a/conanfile.py b/conanfile.py index 457731905..fab1039da 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "5.2.1" + version = "5.3.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -25,6 +25,7 @@ class HomestoreConan(ConanFile): "coverage": ['True', 'False'], "sanitize": ['True', 'False'], "testing" : ['full', 'min', 'off', 'epoll_mode', 'spdk_mode'], + "replication" : ['off', 'on'], } default_options = { 'shared': False, @@ -32,6 +33,7 @@ class HomestoreConan(ConanFile): 'coverage': False, 'sanitize': False, 'testing': 'epoll_mode', + 'replication': 'off', } exports_sources = "cmake/*", "src/*", "CMakeLists.txt", "test_wrap.sh", "LICENSE" @@ -54,7 +56,8 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^12.1]@oss/master", transitive_headers=True) self.requires("sisl/[^13.3]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^4.1]@oss/main", transitive_headers=True) + if str(self.options.replication) == "on": + self.requires("nuraft_mesg/[^4.1]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: @@ -104,6 +107,10 @@ def generate(self): 
tc.variables['MEMORY_SANITIZER_ON'] = 'ON' tc.variables["CONAN_PACKAGE_NAME"] = self.name tc.variables["CONAN_PACKAGE_VERSION"] = self.version + if str(self.options.replication) == "on": + tc.variables["REPLICATION"] = "ON" + else: + tc.variables["REPLICATION"] = "OFF" tc.generate() # This generates "boost-config.cmake" and "grpc-config.cmake" etc in self.generators_folder diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 486314736..7b33a68e8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,15 +8,27 @@ find_package(isa-l QUIET) find_package(iomgr QUIET REQUIRED) find_package(farmhash QUIET REQUIRED) find_package(GTest QUIET REQUIRED) -find_package(NuraftMesg QUIET REQUIRED) +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + find_package(NuraftMesg QUIET REQUIRED) + endif() +endif() list(APPEND COMMON_DEPS iomgr::iomgr farmhash::farmhash - nuraft_mesg::proto - nuraft::nuraft sisl::sisl ) + +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + list(APPEND COMMON_DEPS + nuraft_mesg::proto + nuraft::nuraft + ) + endif() +endif() + if (${isa-l_FOUND}) list(APPEND COMMON_DEPS isa-l::isa-l) else () @@ -42,7 +54,11 @@ add_subdirectory(lib/logstore) add_subdirectory(lib/meta) add_subdirectory(lib/index) add_subdirectory(lib/blkdata_svc/) -add_subdirectory(lib/replication/) +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + add_subdirectory(lib/replication/) + endif() +endif() if(NOT DEFINED BUILD_TESTING OR BUILD_TESTING) add_subdirectory(tests) @@ -58,17 +74,20 @@ set(HOMESTORE_OBJECTS $ $ $ - $ - $ $ - $ lib/homestore.cpp lib/crc.cpp ) + +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + list(APPEND HOMESTORE_OBJECTS $) + endif() +endif() #target_link_libraries(homestore_objs ${COMMON_DEPS}) add_library(homestore STATIC ${HOMESTORE_OBJECTS} ) target_compile_definitions (homestore PRIVATE LOG_MODS_V2_SUPPORT) -target_link_libraries(homestore ${COMMON_DEPS}) +target_link_libraries(homestore PRIVATE 
${COMMON_DEPS}) diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index 96cec5272..a3e0a7768 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -251,12 +251,14 @@ VENUM(BlkAllocStatus, uint32_t, struct blk_alloc_hints { blk_temp_t desired_temp{0}; // Temperature hint for the device - std::optional< uint32_t > reserved_blks; // Reserved blks in a chunk - std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care - std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation - std::optional committed_blk_id; // blk id indicates the blk was already allocated and committed, don't allocate and commit again - std::optional< stream_id_t > stream_id_hint; // any specific stream to pick - std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk + std::optional< uint32_t > reserved_blks{std::nullopt}; // Reserved blks in a chunk + std::optional< uint32_t > pdev_id_hint{std::nullopt}; // which physical device to pick (hint if any) + std::optional< chunk_num_t > chunk_id_hint{std::nullopt}; // any specific chunk id to pick for this allocation + std::optional< MultiBlkId > committed_blk_id{ + std::nullopt}; // blk id indicates the blk was already allocated and committed, don't allocate and commit again + std::optional< stream_id_t > stream_id_hint{std::nullopt}; // any specific stream to pick + std::optional< uint64_t > application_hint{ + std::nullopt}; // hints in uint64 what will be passed opaque to select_chunk bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device bool is_contiguous{true}; // Should the entire allocation be one contiguous block bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? 
Mutually exclusive with is_contiguous diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index 1147b169f..69b2f2ee4 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -254,15 +254,6 @@ class BlkDataService { */ HSDevType get_dev_type() const; - /** - * @brief Gets the drive type of the data service. - * - * Data Service doesn't support mixed drive types. - * - * @return The drive type of the data service, HDD or NVME. - */ - HSDevType get_dev_type() const; - void stop(); private: diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index f10754a2a..c159dc2f3 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -41,28 +40,15 @@ using PutPaginateCookie = unique< BtreeRangePutRequest< K > >; template < typename K > using RemovePaginateCookie = unique< BtreeRangeRemoveRequest< K > >; -template < typename K > -using QueryPaginateCookie = unique< BtreeQueryRequest< K > >; -class BtreeStore; - -template < typename K > -using PutPaginateCookie = unique< BtreeRangePutRequest< K > >; - -template < typename K > -using RemovePaginateCookie = unique< BtreeRangeRemoveRequest< K > >; - template < typename K > using QueryPaginateCookie = unique< BtreeQueryRequest< K > >; template < typename K, typename V > class Btree : public BtreeBase { -class Btree : public BtreeBase { public: /////////////////////////////////////// All External APIs ///////////////////////////// Btree(BtreeConfig const& cfg, uuid_t uuid = uuid_t{}, uuid_t parent_uuid = uuid_t{}, uint32_t user_sb_size = 0); Btree(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb); - Btree(BtreeConfig const& cfg, uuid_t uuid = uuid_t{}, uuid_t parent_uuid = uuid_t{}, uint32_t user_sb_size = 0); - Btree(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb); virtual 
~Btree(); // Destroy the entire btree from persistent and from memory. It is to be noted that all blocks are not destroyed at @@ -160,108 +146,6 @@ class Btree : public BtreeBase { // expected to call put_range_next() again. Failing to do so will result in memory leak. btree_status_t put_range_next(PutPaginateCookie< K >& cookie); - // @brief Gets the value associated with the specified key from the B-tree. - // - // @param key The key to search for. - // @param out_val A pointer to store the value associated with the key. (Should be non-nullptr) - // - // @return The status of the get operation. - btree_status_t get_one(BtreeKey const& key, BtreeValue* out_val); - // Destroy the entire btree from persistent and from memory. It is to be noted that all blocks are not destroyed at - // one go. For persistent btree, it might be a staged operation on multiple checkpoints. - folly::Future< folly::Unit > destroy() override; - - // @brief Inserts or updates a key-value pair in the B-tree. - // - // This function inserts a new key-value pair or updates an existing key-value pair in the B-tree - // based on the specified put type. Optionally, it can return the existing value and apply a filter - // callback before insertion. - // - // @param key The key to be inserted or updated. - // @param value The value to be associated with the key. - // @param put_type The type of put operation (e.g., insert, update, upsert). - // @param existing_val Optional pointer to store the existing value prior to update if the key already exists. - // @param filter_cb Optional callback function to apply a filter before insertion. If provided, before putting, if - // an existing key-value pair is found, the filter callback is called with the existing key, value and the new - // value. The callback could return "replace" in that case the existing value is replaced with the new value or it - // could return "keep" in that case key is not modified. - // - // @return The status of the put operation. 
- // - btree_status_t put_one(BtreeKey const& key, BtreeValue const& value, btree_put_type put_type, - BtreeValue* existing_val = nullptr, put_filter_cb_t filter_cb = nullptr); - - // @brief Inserts or updates a range of key-value pairs in the B-tree. - // - // This function inserts a new range of key-value pairs or updates existing key-value pairs in the B-tree - // based on the specified put type. Optionally, it can return the existing value and apply a filter - // callback before insertion. - // - // This is an unique function which can be used for multiple purpose based on the key type. - // - // Interval Key Behavior: - // If the key is an interval key (which means can next_key be obtained by doing prev_key + 1), for example an - // integer keys. If the input range is provided for an interval key, example [1, 50), then it will behave the - // following way - // 1. If the put_type is INSERT and if a specific key in the interval range is not present in the btree, then it - // will insert it. - // - // 2. If the put_type is UPSERT, then it will insert the keys within the range for which there is no entry in the - // btree. However for keys that exist, it will call the filter_cb(key, current_value, new_value) if provided and - // expects the callback to return the decision. The decision could be - // a. replace - replace the existing value with the new value. Note that the new_value will also be added the - // same offset as the key. So if key range is [1. 50) and if the key is 10, then the value will be added at 10th - // of the original value provided (of course the shifting of 10 can be avoided by the caller by supplying a - // BtreeValue override which simply doesn't add) - // - // b. remove - remove the key from the btree and don't add the new value. 
This feature is useful when we use the - // btree to maintain multiple versions of the key and when we write the new version of the key, we need to - // remove the older versions of the key along with this write operation. - // - // c. keep - keep the existing value as is and don't add the new value. - // - // 3. If the put_type is UPDATE, then it will only act on keys which already exist and the behavior is identical to - // upsert case above when the key is present. - // - // Non-Interval Key Behavior: - // If the key is not an interval key, then only put_type = UPDATE is supported. It will walk through the keys within - // the range and then do a filter_cb(key, current_value, new_value) if provided and expects the callback to return - // the decision. The decision could be - // a. replace - replace the existing value with the new value for that key. - // b. remove - remove the key from the btree and don't update the new value. - // c. keep - keep the existing value as is and don't modify the key to new value. - // In this non-interval key case, the range of keys are all updated with the same value. - // - // About batch size: - // The batch size is the number of keys that will be processed in one go. It will return with btree_status::has_more - // and the caller is expected to call put_range_next() method with the cookie passed to resume the next batch until - // it returns btree_status::success. It is to be noted that, the batch size is a best effort from the btree and at - // any iteration it could put between 1 to batch_size keys (it will at least put one_key and at most batch_size keys - // per iteration). - // - // @param inp_range The range of keys to insert, upsert or update - // @param put_type The type of put operation (e.g., insert, update, upsert). - // @param value The value to be associated with the key. Behavior is different for interval and non-interval keys - // (see above) - // @param batch_size The number of keys to process in one go. 
Default is to attempt to process all keys in one go. - // Please see the note above about the batch size. - // @param filter_cb Optional callback function to apply a filter before insertion. (See above for details) - // - // @return The status of the put operation and a cookie, if it returns btree_status::has_more, then the caller is - // expected to call put_range_next() - std::pair< btree_status_t, PutPaginateCookie< K > > - put_range(BtreeKeyRange< K >&& inp_range, btree_put_type put_type, BtreeValue const& value, - uint32_t batch_size = std::numeric_limits< uint32_t >::max(), put_filter_cb_t filter_cb = nullptr); - - // @brief Continuation of the put_range call for the next batch of keys. Calling this method without calling - // put_range first returns error. - // - // @param cookie The cookie returned by the put_range call - // - // @return The status of the put operation and a cookie, if it returns btree_status::has_more, then the caller is - // expected to call put_range_next() again. Failing to do so will result in memory leak. - btree_status_t put_range_next(PutPaginateCookie< K >& cookie); - // @brief Gets the value associated with the specified key from the B-tree. // // @param key The key to search for. @@ -314,50 +198,6 @@ class Btree : public BtreeBase { btree_status_t query_next(QueryPaginateCookie< K >& cookie, std::vector< std::pair< K, V > >& out_kvs); - // @brief Gets any one value associated with the given key range. If the key range matches multiple keys, then btree - // will randomly pick one key and return the value associated with it. - // - // @param inp_range The range of keys to search for. - // @param out_key A pointer to store the picked key of the entry found. (Should be non-nullptr) - // @param out_val A pointer to store the value associated with the picked key. (Should be non-nullptr) - // - // @return The status of the get_any operation. 
- btree_status_t get_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val); - - // @brief Removes the key-value pair associated with the specified key from the B-tree. - // - // @param key The key to remove. - // @param out_val An optional pointer to store the value associated with the key before removal. - // - // @return The status of the remove operation. - btree_status_t remove_one(BtreeKey const& key, BtreeValue* out_val); - - // @brief Removes any one key-value pair associated with the given key range. If the key range matches multiple - // keys, then btree will randomly pick one key and remove the key-value pair associated with it. - // - // @param inp_range The range of keys to search for. - // @param out_key A pointer to store the picked key within the range. (Should be non-nullptr). Valid only if return - // status is btree_status_t::success. - // @param out_val A pointer to store the value associated with the picked key. (Should be non-nullptr) Valid only if - // return status is btree_status_t::success. - // - // @return The status of the remove_any operation. 
- btree_status_t remove_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val); - - std::pair< btree_status_t, RemovePaginateCookie< K > > - remove_range(BtreeKeyRange< K >&& inp_range, uint32_t batch_size = std::numeric_limits< uint32_t >::max(), - remove_filter_cb_t filter_cb = nullptr); - - btree_status_t remove_range_next(RemovePaginateCookie< K >& cookie); - - std::pair< btree_status_t, QueryPaginateCookie< K > > - query(BtreeKeyRange< K >&& inp_range, std::vector< std::pair< K, V > >& out_kvs, - uint32_t batch_size = std::numeric_limits< uint32_t >::max(), - BtreeQueryType query_type = BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, - get_filter_cb_t filter_cb = nullptr); - - btree_status_t query_next(QueryPaginateCookie< K >& cookie, std::vector< std::pair< K, V > >& out_kvs); - nlohmann::json get_status(int log_level) const; nlohmann::json get_metrics_in_json(bool updated); @@ -368,17 +208,6 @@ class Btree : public BtreeBase { std::string to_digraph_visualize_format() const; - void dump(const std::string& file, std::string format = "string", - BtreeNode::ToStringCallback< K, V > cb = nullptr) const; - - nlohmann::json get_metrics_in_json(bool updated); - - std::string to_string() const; - - std::string to_custom_string(BtreeNode::ToStringCallback< K, V > cb) const; - - std::string to_digraph_visualize_format() const; - void dump(const std::string& file, std::string format = "string", BtreeNode::ToStringCallback< K, V > cb = nullptr) const; @@ -386,12 +215,6 @@ class Btree : public BtreeBase { uint64_t count_keys(bnodeid_t start_bnodeid = empty_bnodeid) const; -private: - /////////////////////////////////// Mutate Impl methods ///////////////////////// - template < typename ReqT > - btree_status_t put(ReqT& put_req); - uint64_t count_keys(bnodeid_t start_bnodeid = empty_bnodeid) const; - private: /////////////////////////////////// Mutate Impl methods ///////////////////////// template < typename ReqT > @@ -417,22 +240,12 @@ class 
Btree : public BtreeBase { template < typename ReqT > btree_status_t get(ReqT& get_req); - template < typename ReqT > - btree_status_t do_get(const BtreeNodePtr& my_node, ReqT& greq); - K* out_split_key, CPContext* context); - - ///////////////////////////////// Get Impl Methods ///////////////////////////////// - template < typename ReqT > - btree_status_t get(ReqT& get_req); - template < typename ReqT > btree_status_t do_get(const BtreeNodePtr& my_node, ReqT& greq); - ///////////////////////////////// Remove Impl Methods ///////////////////////////////// ///////////////////////////////// Remove Impl Methods ///////////////////////////////// template < typename ReqT > btree_status_t remove(ReqT& rreq); - btree_status_t remove(ReqT& rreq); template < typename ReqT > btree_status_t do_remove(const BtreeNodePtr& my_node, locktype_t curlock, ReqT& rreq); @@ -440,25 +253,18 @@ class Btree : public BtreeBase { template < typename ReqT > btree_status_t check_collapse_root(ReqT& rreq); - template < typename ReqT > - btree_status_t check_collapse_root(ReqT& rreq); - btree_status_t merge_nodes(const BtreeNodePtr& parent_node, const BtreeNodePtr& leftmost_node, uint32_t start_indx, uint32_t end_indx, CPContext* context); - uint32_t end_indx, CPContext* context); - - ///////////////////////////////// Query Impl Methods ///////////////////////////////// - btree_status_t query(BtreeQueryRequest< K >& query_req, std::vector< std::pair< K, V > >& out_values); ///////////////////////////////// Query Impl Methods ///////////////////////////////// btree_status_t query(BtreeQueryRequest< K >& query_req, std::vector< std::pair< K, V > >& out_values); btree_status_t do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); - std::vector< std::pair< K, V > >& out_values); + btree_status_t do_traversal_query(const BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); - std::vector< 
std::pair< K, V > >& out_values); + #ifdef SERIALIZABLE_QUERY_IMPLEMENTATION btree_status_t do_serialzable_query(const BtreeNodePtr& my_node, BtreeSerializableQueryRequest& qreq, std::vector< std::pair< K, V > >& out_values); @@ -492,34 +298,6 @@ class Btree : public BtreeBase { void append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event, uint32_t start_idx = 0, uint32_t end_idx = 0) const; -protected: - mutable iomgr::FiberManagerLib::shared_mutex m_btree_lock; - std::atomic< bool > m_destroyed{false}; -private: - /////////////////////////////// Internal Node Management Methods //////////////////////////////////// - // BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, - // BtreeNode::Allocator::Token token) const override; - virtual BtreeNodePtr new_node(bnodeid_t id, bool is_leaf, BtreeNode::Allocator::Token token) const override; - virtual BtreeNodePtr load_node(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) const override; - - /////////////////////////////////// Helper Methods /////////////////////////////////////// - btree_status_t post_order_traversal(locktype_t acq_lock, const auto& cb); - btree_status_t post_order_traversal(const BtreeNodePtr& node, locktype_t acq_lock, const auto& cb); - void get_all_kvs(std::vector< std::pair< K, V > >& kvs) const; - uint64_t get_btree_node_cnt() const; - uint64_t get_child_node_cnt(bnodeid_t bnodeid) const; - void to_string_internal(bnodeid_t bnodeid, std::string& buf) const; - void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, - BtreeNode::ToStringCallback< K, V > const& cb) const; - void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, - std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; - void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; - void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; 
- void print_node(const bnodeid_t& bnodeid) const; - - void append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event, uint32_t start_idx = 0, - uint32_t end_idx = 0) const; - protected: mutable iomgr::FiberManagerLib::shared_mutex m_btree_lock; std::atomic< bool > m_destroyed{false}; diff --git a/src/include/homestore/btree/btree_base.hpp b/src/include/homestore/btree/btree_base.hpp index 95b0ee043..937a5ed46 100644 --- a/src/include/homestore/btree/btree_base.hpp +++ b/src/include/homestore/btree/btree_base.hpp @@ -27,7 +27,7 @@ class UnderlyingBtree { // Btree based implementations superblock area struct BtreeSuperBlock { static constexpr size_t underlying_btree_sb_size = - IndexSuperBlock::index_impl_sb_size - sizeof(bnodeid_t) - sizeof(uint32_t); + IndexSuperBlock::index_impl_sb_size - sizeof(bnodeid_t) - sizeof(uint64_t) - sizeof(uint32_t); bnodeid_t root_node_id{empty_bnodeid}; // Btree Root Node ID uint64_t root_link_version{0}; diff --git a/src/include/homestore/btree/node_variant/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp index 21b1830a4..a9890cff8 100644 --- a/src/include/homestore/btree/node_variant/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -88,21 +88,6 @@ class FixedPrefixNode : public VariantNode< K, V > { } } - int compare(BtreeKey const& key, BtreeValue const& val) const { - if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { - sisl::blob const kblob = s_cast< K const& >(key).serialize_prefix(); - sisl::blob const vblob = s_cast< V const& >(val).serialize_prefix(); - DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Prefix key size mismatch with serialized prefix size"); - DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Prefix value size mismatch with serialized prefix size"); - uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(prefix_entry); - int cmp = std::memcmp(cur_ptr, kblob.cbytes(), 
kblob.size()); - if (cmp) { return cmp; } - cmp = std::memcmp(cur_ptr + kblob.size(), vblob.cbytes(), vblob.size()); - return cmp; - } - return 0; - } - sisl::blob key_buf() const { return sisl::blob{r_cast< uint8_t const* >(this) + sizeof(prefix_entry), key_size()}; } @@ -179,10 +164,7 @@ class FixedPrefixNode : public VariantNode< K, V > { } virtual ~FixedPrefixNode() = default; - virtual void on_update_phys_buf() override { - // Update the prefix bitset with the new buffer - prefix_bitset_ = sisl::CompactBitSet{sisl::blob{bitset_area(), prefix_bitset_.size() / 8}, false}; - } + ///////////////////////////// All overrides of BtreeIntervalNode /////////////////////////////////// /// @brief Upserts a batch of entries into a prefix node. /// @@ -322,6 +304,7 @@ class FixedPrefixNode : public VariantNode< K, V > { } } if (num_removed) { this->inc_gen(); } + #ifndef NDEBUG validate_sanity(); #endif @@ -361,8 +344,6 @@ class FixedPrefixNode : public VariantNode< K, V > { return get_prefix_entry_c(get_suffix_entry_c(idx)->prefix_slot)->ref_count; } - uint32_t compact_saving() const { return num_prefix_holes() * prefix_entry::size(); } - uint32_t available_size() const override { auto num_holes = num_prefix_holes(); if (num_holes > prefix_node_header::min_holes_to_compact) { @@ -372,18 +353,7 @@ class FixedPrefixNode : public VariantNode< K, V > { } } - uint32_t occupied_size() const override { - return (this->node_data_size() - sizeof(prefix_node_header) - (prefix_bitset_.size() / 8) - - this->available_size()); - } - - bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { -#ifdef _PRERELEASE - auto max_keys = this->max_keys_in_node(); - if (max_keys && this->total_entries() > max_keys) { return false; } -#endif - return has_room(1u); - } + bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { return has_room(1u); } uint32_t get_nth_key_size(uint32_t) const override { return dummy_key< K >.serialized_size(); } @@ -585,17 +555,16 
@@ class FixedPrefixNode : public VariantNode< K, V > { } return num_entries; } - - uint32_t copy_by_size(BtreeNode const& o, uint32_t start_idx, uint32_t size) { - return copy_internal(o, start_idx, true /* by_size*/, size); - } - uint32_t copy_by_entries(BtreeNode const& o, uint32_t start_idx, uint32_t nentries) { return copy_internal(o, start_idx, false /* by_size*/, nentries); } #endif + uint32_t copy_by_size(BtreeNode const& o, uint32_t start_idx, uint32_t size) { + return copy_internal(o, start_idx, true /* by_size*/, size); + } + uint32_t get_entries_size(uint32_t start_idx, uint32_t end_idx) const override { return (prefix_entry::size() + suffix_entry::size()) * (end_idx - start_idx); } @@ -628,14 +597,6 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t copy_internal(BtreeNode const& o, uint32_t start_idx, bool by_size, uint32_t limit) { FixedPrefixNode const& src_node = s_cast< FixedPrefixNode const& >(o); -#ifdef _PRERELEASE - if (by_size) { - const uint32_t max_keys = this->max_keys_in_node(); - if (max_keys) { - if (this->total_entries() + limit > max_keys) { limit = max_keys - this->total_entries(); } - } - } -#endif // Adjust the size_to_move to cover the new node's reqd header space. uint32_t copied_size{0}; @@ -704,11 +665,10 @@ class FixedPrefixNode : public VariantNode< K, V > { } std::string to_string(bool print_friendly = false) const override { - auto str = - fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} occupied_size={} ", - (print_friendly ? "------------------------------------------------------------\n" : ""), - this->node_id(), this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), - this->next_bnode(), this->available_size(), this->occupied_size()); + auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} ", + (print_friendly ? 
"------------------------------------------------------------\n" : ""), + this->node_id(), this->level(), this->total_entries(), + (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode(), this->available_size()); if (!this->is_leaf() && (this->has_valid_edge())) { fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -751,7 +711,8 @@ class FixedPrefixNode : public VariantNode< K, V > { auto phdr = prefix_header(); ++phdr->used_slots; - if (slot_num + 1u > phdr->tail_slot) { phdr->tail_slot = slot_num + 1u; } + if (s_cast< uint16_t >(slot_num) >= phdr->tail_slot) { phdr->tail_slot = slot_num + 1; } + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", slot_num, phdr->tail_slot); return slot_num; @@ -768,9 +729,9 @@ class FixedPrefixNode : public VariantNode< K, V > { if (--pentry->ref_count == 0) { --phdr->used_slots; prefix_bitset_.reset_bit(sisl::blob{bitset_area(), uint32_cast(bitset_size())}, slot_num); - if ((slot_num != 0) && (slot_num == phdr->tail_slot)) { + if ((slot_num == phdr->tail_slot - 1)) { uint16_t prev_slot = prefix_bitset_.get_prev_set_bit(cbitset_blob(), slot_num); - if (prev_slot != std::numeric_limits< uint16_t >::max()) { phdr->tail_slot = prev_slot; } + phdr->tail_slot = prev_slot + 1u; } } } @@ -784,10 +745,11 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t available_size_without_compaction() const { uint8_t const* suffix = r_cast< uint8_t const* >(get_suffix_entry_c(this->total_entries())); - uint8_t const* prefix = r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)); + uint8_t const* prefix = + r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)) + prefix_entry::size(); - if (suffix <= prefix + prefix_entry::size()) { - return prefix - suffix + prefix_entry::size(); + if (suffix <= prefix) { + return prefix - suffix; } else { DEBUG_ASSERT(false, 
"Node data is corrupted, suffix area is overlapping prefix area {}", int64_t(suffix - prefix)); @@ -795,7 +757,9 @@ class FixedPrefixNode : public VariantNode< K, V > { } } - uint32_t available_size_with_compaction() const { return available_size_without_compaction() + compact_saving(); } + uint32_t available_size_with_compaction() const { + return available_size_without_compaction() + (num_prefix_holes() * prefix_entry::size()); + } bool has_room(uint16_t for_nentries) const { return (available_size_without_compaction() >= (prefix_entry::size() + (for_nentries * suffix_entry::size()))); @@ -807,8 +771,7 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t num_prefix_holes() const { auto phdr = cprefix_header(); - DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", - phdr->used_slots, phdr->tail_slot); + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number is not less than tail slot number"); return (phdr->tail_slot - phdr->used_slots); } @@ -852,7 +815,7 @@ class FixedPrefixNode : public VariantNode< K, V > { // Finally adjust the tail offset to the compacted area. 
auto phdr = prefix_header(); phdr->tail_slot = phdr->used_slots; - DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(0u), + DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(cbitset_blob(), 0u), "Tail slot is not equal to the next reset bit, not expected"); DEBUG_ASSERT_EQ(this->num_prefix_holes(), 0, "Shouldn't be any hole after compression, not expected"); } @@ -897,7 +860,7 @@ class FixedPrefixNode : public VariantNode< K, V > { prefix_entry* get_prefix_entry(uint16_t slot_num) { return r_cast< prefix_entry* >( this->node_data_area() + - (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); + (this->node_data_size() - (s_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } prefix_entry const* get_prefix_entry_c(uint16_t slot_num) const { @@ -915,39 +878,5 @@ class FixedPrefixNode : public VariantNode< K, V > { static constexpr uint32_t get_key_size() { return prefix_entry::key_size() + suffix_entry::key_size(); } static constexpr uint32_t get_value_size() { return prefix_entry::value_size() + suffix_entry::value_size(); } - - std::string compact_bitset() const { - auto x = prefix_bitset_.to_string(); - std::ostringstream result; - std::vector< size_t > indices; - for (size_t i = 0; i < x.size(); ++i) { - if (x[i] == '1') { indices.push_back(i); } - } - - if (indices.empty()) { return result.str(); } - - size_t start = indices[0]; - size_t end = start; - result << "size = " << indices.size() << " : "; - for (size_t i = 1; i < indices.size(); ++i) { - if (indices[i] == end + 1) { - end = indices[i]; - } else { - if (start == end) { - result << start << ", "; - } else { - result << start << "-" << end << ", "; - } - start = end = indices[i]; - } - } - if (start == end) { - result << start; - } else { - result << start << "-" << end; - } - - return result.str(); - } }; } // namespace homestore diff --git a/src/include/homestore/btree/node_variant/variant_node.hpp 
b/src/include/homestore/btree/node_variant/variant_node.hpp index 77ae054bc..332402b5a 100644 --- a/src/include/homestore/btree/node_variant/variant_node.hpp +++ b/src/include/homestore/btree/node_variant/variant_node.hpp @@ -311,6 +311,5 @@ class VariantNode : public BtreeNode { } return ret; } - virtual void on_update_phys_buf() override {}; }; } // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 099cef8ac..541fdf80d 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -131,7 +131,9 @@ class HomeStore { std::unique_ptr< MetaBlkService > m_meta_service; std::unique_ptr< LogStoreService > m_log_service; std::unique_ptr< IndexService > m_index_service; +#ifdef REPLICATION_SUPPORT std::shared_ptr< ReplicationService > m_repl_service; +#endif std::unique_ptr< DeviceManager > m_dev_mgr; shared< sisl::logging::logger_t > m_periodic_logger; @@ -163,8 +165,10 @@ class HomeStore { HomeStore& with_log_service(); HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, std::vector< ServiceSubType > sub_types); +#ifdef REPLICATION_SUPPORT HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); +#endif bool start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb = nullptr); void format_and_start(std::map< ServiceId, hs_format_params >&& format_opts); @@ -189,7 +193,9 @@ class HomeStore { if (!m_index_service) { throw std::runtime_error("index_service is nullptr"); } return *m_index_service; } +#ifdef REPLICATION_SUPPORT ReplicationService& repl_service() { return *m_repl_service; } +#endif DeviceManager* device_mgr() { return m_dev_mgr.get(); } ResourceMgr& resource_mgr() { return *m_resource_mgr.get(); } CPManager& cp_mgr() { return *m_cp_mgr.get(); } diff --git a/src/include/homestore/replication/repl_dev.h 
b/src/include/homestore/replication/repl_dev.h index ea7c156a9..45e2488c6 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -559,6 +559,7 @@ class ReplDev { // we have no shutdown for repl_dev, since shutdown repl_dev is done by repl_service void stop() { +#if 0 start_stopping(); while (true) { auto pending_request_num = get_pending_request_num(); @@ -566,6 +567,7 @@ class ReplDev { std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } +#endif } // complete all the requests that are in progress and start refusing new reqs diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 579930a63..b17fc0a61 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -192,34 +192,25 @@ folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& blkids, bool part_of_batch) { - if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); for (const auto& blkid : blkids) { s_futs.emplace_back(async_write(sgs, blkid, part_of_batch)); } - decr_pending_request_num(); return collect_all_futures(s_futs); } void BlkDataService::submit_io_batch() { m_vdev->submit_batch(); } BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { - if (is_stopping()) return BlkAllocStatus::FAILED; - incr_pending_request_num(); HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); - auto ret = m_vdev->alloc_blks(nblks, hints, out_blkids); - 
decr_pending_request_num(); - return ret; + return m_vdev->alloc_blks(nblks, hints, out_blkids); } BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkids) { - if (is_stopping()) return BlkAllocStatus::FAILED; - incr_pending_request_num(); HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); @@ -265,18 +256,7 @@ void BlkDataService::start() { std::move(std::make_unique< DataSvcCPCallbacks >(m_vdev))); } -void BlkDataService::stop() { - start_stopping(); - // we have no way to track the completion of each async io in detail which should be done in iomanager level, so - // we just wait for 3 seconds, and we expect each io will be completed within this time. - - // TODO: find a better solution to track the completion of these aysnc calls - std::this_thread::sleep_for(std::chrono::milliseconds(3000)); - while (true) { - if (!get_pending_request_num()) break; - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } -} +void BlkDataService::stop() {} uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp index 173b8e0a1..8440d6f68 100644 --- a/src/lib/common/resource_mgr.cpp +++ b/src/lib/common/resource_mgr.cpp @@ -15,13 +15,15 @@ *********************************************************************************/ #include #include -#include #include #include #include "resource_mgr.hpp" #include "homestore_assert.hpp" -#include "replication/repl_dev/raft_repl_dev.h" +#ifdef REPLICATION_SUPPORT +#include +#include "replication/repl_dev/raft_repl_dev.h" +#endif namespace homestore { ResourceMgr& resource_mgr() { return hs()->resource_mgr(); } diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index 5c3e5b34f..6ca2678fc 100644 --- 
a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -24,8 +24,6 @@ #include #include #include -#include -#include "replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" #include "device/device.h" #include "device/physical_dev.hpp" diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 403bad2f4..f7e4f9019 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -40,7 +40,9 @@ #include "device/virtual_dev.hpp" #include "common/resource_mgr.hpp" #include "meta/meta_sb.hpp" +#ifdef REPLICATION_SUPPORT #include "replication/service/generic_repl_svc.h" +#endif #include "common/crash_simulator.hpp" /* @@ -94,6 +96,7 @@ HomeStore& HomeStore::with_log_service() { return *this; } +#ifdef REPLICATION_SUPPORT HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector) { m_services[uint32_cast(ServiceType::REPLICATION)] = std::vector< ServiceSubType >{1u, ServiceSubType::DEFAULT}; @@ -103,6 +106,7 @@ HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_ap s_custom_chunk_selector = std::move(custom_chunk_selector); return *this; } +#endif #ifdef _PRERELEASE HomeStore& HomeStore::with_crash_simulator(std::function< void(void) > cb) { @@ -192,7 +196,9 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ if (has_repl_data_service()) { m_log_service = std::make_unique< LogStoreService >(); m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); +#ifdef REPLICATION_SUPPORT m_repl_service = GenericReplService::create(std::move(s_repl_app)); +#endif } else { if (has_log_service()) { m_log_service = std::make_unique< LogStoreService >(); } if (has_data_service()) { @@ -300,7 +306,9 @@ void HomeStore::do_start() { if (has_index_service()) { m_index_service->start(); } if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT s_cast< GenericReplService* 
>(m_repl_service.get())->start(); // Replservice starts logstore & data service +#endif } else { if (has_data_service()) { m_data_service->start(); } if (has_log_service() && inp_params.auto_recovery) { @@ -338,11 +346,13 @@ void HomeStore::shutdown() { m_resource_mgr->stop(); if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT // Log and Data services are stopped by repl service s_cast< GenericReplService* >(m_repl_service.get())->stop(); m_log_service.reset(); m_data_service.reset(); m_repl_service.reset(); +#endif } else { if (has_log_service()) { m_log_service->stop(); diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 1fa262dc7..2b3f88c30 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -146,6 +146,7 @@ void LogDev::stop() { m_hs.reset(); } +#if 0 void LogDev::stop() { start_stopping(); while (true) { @@ -175,6 +176,7 @@ void LogDev::stop() { std::move(f).get(); } } +#endif void LogDev::destroy() { THIS_LOGDEV_LOG(INFO, "Logdev destroy metablks log_dev={}", m_logdev_id); diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index 1392a27b7..7270a6184 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -136,8 +136,6 @@ logdev_id_t LogStoreService::get_next_logdev_id() { } logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { - if (is_stopping()) return 0; - incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); logdev_id_t logdev_id = get_next_logdev_id(); auto logdev = create_new_logdev_internal(logdev_id, flush_mode); @@ -148,9 +146,8 @@ logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { } void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { - if (is_stopping()) return; HS_LOG(INFO, logstore, "Destroying logdev {}", logdev_id); - incr_pending_request_num(); + folly::SharedMutexWritePriority::WriteHolder 
holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { @@ -160,20 +157,20 @@ void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { // Stop the logdev and release all the chunks from the journal vdev. auto& logdev = it->second; - if (!logdev->is_stopped()) { - // Stop the logdev if its started. - logdev->stop(); - } + // if (!logdev->is_stopped()) { + // Stop the logdev if its started. + logdev->stop(); + //} - // First release all chunks. - m_logdev_vdev->destroy(logdev_id); + // First release all chunks. + m_logdev_vdev->destroy(logdev_id); - // Destroy the metablks for logdev. - logdev->destroy(); + // Destroy the metablks for logdev. + logdev->destroy(); - m_id_logdev_map.erase(it); - COUNTER_DECREMENT(m_metrics, logdevs_count, 1); - HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); + m_id_logdev_map.erase(it); + COUNTER_DECREMENT(m_metrics, logdevs_count, 1); + HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); } void LogStoreService::delete_unopened_logdevs() { @@ -289,9 +286,8 @@ folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_i } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { - if (is_stopping()) return; HS_LOG(INFO, logstore, "Removing logstore {} from logdev {}", store_id, logdev_id); - incr_pending_request_num(); + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); @@ -301,7 +297,7 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor } it->second->remove_log_store(store_id); HS_LOG(INFO, logstore, "Successfully removed logstore {} from logdev {}", store_id, logdev_id); - decr_pending_request_num(); + COUNTER_DECREMENT(m_metrics, logstores_count, 1); } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp 
b/src/lib/replication/repl_dev/raft_state_machine.cpp index b2cce85bb..c0f910741 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -404,7 +404,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); - hs()->crash_simulator().crash(); + hs()->crash_simulator().crash_now(); } #endif } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index dbf56a3c2..03b540184 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -35,7 +35,7 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } - incr_pending_request_num(); + // incr_pending_request_num(); auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, value.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, value.size, m_listener); @@ -79,38 +79,38 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { data_service().commit_blk(blkid); } m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkids(), rreq); - decr_pending_request_num(); + // decr_pending_request_num(); }); } std::error_code SoloReplDev::alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, std::vector< MultiBlkId >& out_blkids) { - if (is_stopping()) { return std::make_error_code(std::errc::operation_canceled); } + // if (is_stopping()) { return std::make_error_code(std::errc::operation_canceled); } - incr_pending_request_num(); + // incr_pending_request_num(); std::vector< BlkId > blkids; auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints, blkids); if (status != BlkAllocStatus::SUCCESS) { DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); - decr_pending_request_num(); + // decr_pending_request_num(); return std::make_error_code(std::errc::no_space_on_device); } for (auto& blkid : blkids) { out_blkids.emplace_back(blkid); } - decr_pending_request_num(); + // decr_pending_request_num(); return std::error_code{}; } folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< MultiBlkId >& blkids, sisl::sg_list const& value, bool part_of_batch, trace_id_t tid) { - if (is_stopping()) { + /*if (is_stopping()) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - } + }*/ - incr_pending_request_num(); + // incr_pending_request_num(); HS_REL_ASSERT_GT(blkids.size(), 0, "Empty blkid vec"); std::vector< folly::Future< std::error_code > > futs; futs.reserve(blkids.size()); @@ -138,15 +138,15 @@ folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< Mul } } - decr_pending_request_num(); + // decr_pending_request_num(); 
return folly::makeFuture< std::error_code >(std::error_code{}); }); } void SoloReplDev::async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, sisl::blob const& key, uint32_t data_size, repl_req_ptr_t rreq, trace_id_t tid) { - if (is_stopping()) { return; } - incr_pending_request_num(); + // if (is_stopping()) { return; } + // incr_pending_request_num(); // We expect clients to provide valid repl req ctx with blocks allocated. HS_REL_ASSERT(rreq, "Invalid repl req ctx"); @@ -198,22 +198,22 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch, trace_id_t tid) { - if (is_stopping()) { + /*if (is_stopping()) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - } - incr_pending_request_num(); + }*/ + // incr_pending_request_num(); auto result = data_service().async_read(bid, sgs, size, part_of_batch); - decr_pending_request_num(); + // decr_pending_request_num(); return result; } folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { - if (is_stopping()) { + /*if (is_stopping()) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - } - incr_pending_request_num(); + }*/ + // incr_pending_request_num(); auto result = data_service().async_free_blk(bid); - decr_pending_request_num(); + // decr_pending_request_num(); return result; } diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 7d226c016..6f3861d59 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -81,7 +81,6 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks 
///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService(){}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -100,12 +99,12 @@ void SoloReplService::start() { } void SoloReplService::stop() { - start_stopping(); + /*start_stopping(); while (true) { auto pending_request_num = get_pending_request_num(); if (!pending_request_num) break; std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } + }*/ // stop all repl_devs { @@ -129,7 +128,7 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t auto listener = m_repl_app->create_repl_dev_listener(group_id); listener->set_repl_dev(rdev); rdev->attach_listener(std::move(listener)); - incr_pending_request_num(); + // incr_pending_request_num(); { std::unique_lock lg(m_rd_map_mtx); @@ -137,12 +136,12 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t if (!happened) { // We should never reach here, as we have failed to emplace in map, but couldn't find entry DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); } } - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_success< shared< ReplDev > >(rdev); } diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 3cb0ad910..8df5d5e6a 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -197,12 +197,14 @@ void RaftReplService::start() { } void RaftReplService::stop() { +#if 0 start_stopping(); while (true) { auto pending_request_num = get_pending_request_num(); if (!pending_request_num) break; std::this_thread::sleep_for(std::chrono::milliseconds(1000)); 
} +#endif // stop all repl_devs { @@ -409,7 +411,7 @@ folly::SemiFuture< ReplServiceError > RaftReplService::remove_repl_dev(group_id_ auto ret = std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); - decr_pending_request_num(); + // decr_pending_request_num(); return ret; } @@ -466,8 +468,8 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { - if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); - incr_pending_request_num(); + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } @@ -476,21 +478,21 @@ AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const rep .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_error<>(e.error()); } - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_success<>(); }); } AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { - if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); - incr_pending_request_num(); + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } return std::dynamic_pointer_cast< 
RaftReplDev >(rdev_result.value()) @@ -586,13 +588,13 @@ void RaftReplService::gc_repl_reqs() { } void RaftReplService::gc_repl_devs() { - incr_pending_request_num(); + /* incr_pending_request_num(); // Skip gc when raft repl service is stopping to avoid concurrency issues between repl_dev's stop and destroy ops. if (is_stopping()) { LOGINFOMOD(replication, "ReplSvc is stopping, skipping GC"); decr_pending_request_num(); return; - } + } */ std::vector< group_id_t > groups_to_leave; { @@ -622,7 +624,7 @@ void RaftReplService::gc_repl_devs() { m_rd_map.erase(group_id); } } - decr_pending_request_num(); + // decr_pending_request_num(); } void RaftReplService::flush_durable_commit_lsn() { diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 3fbf5c544..dece4b36e 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -102,6 +102,41 @@ if (${io_tests}) target_link_libraries(test_cp_mgr homestore ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME CPMgr COMMAND test_cp_mgr) + can_build_epoll_io_tests(epoll_tests) + if(${epoll_tests}) + add_test(NAME LogDev-Epoll COMMAND test_log_dev) + add_test(NAME LogStore-Epoll COMMAND test_log_store) + add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) + add_test(NAME DataService-Epoll COMMAND test_data_service) + endif() + + can_build_spdk_io_tests(spdk_tests) + if(${spdk_tests}) + add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") + add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") + add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") + add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") + if(${epoll_tests}) + SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) + SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) + endif() + endif() +endif() + +can_build_repl_tests(repl_tests) +if (${repl_tests}) + add_executable(test_repl_service) + target_sources(test_repl_service PRIVATE 
test_repl_service.cpp) + target_link_libraries(test_repl_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_log_store) + target_sources(test_repl_log_store PRIVATE test_repl_log_store.cpp) + target_link_libraries(test_repl_log_store hs_logdev homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_data_service) + target_sources(test_repl_data_service PRIVATE test_repl_data_service.cpp) + target_link_libraries(test_repl_data_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_solo_repl_dev) target_sources(test_solo_repl_dev PRIVATE test_solo_repl_dev.cpp) target_link_libraries(test_solo_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) @@ -120,30 +155,18 @@ if (${io_tests}) can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) - add_test(NAME LogDev-Epoll COMMAND test_log_dev) - add_test(NAME LogStore-Epoll COMMAND test_log_store) - add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) - add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) - add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) endif() can_build_spdk_io_tests(spdk_tests) if(${spdk_tests}) - add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") - add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") - add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") - add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") - add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") - add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME 
RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") - # add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") - if(${epoll_tests}) - SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) - SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) - endif() + add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true" --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") + add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") endif() endif() diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 9d883fd50..8698f5100 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -370,14 +370,10 @@ TYPED_TEST(NodeTest, SimpleInsert) { for (uint32_t i = 10; i <= 20; ++i) { this->remove(i); } - this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, 20); - this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, 0, std::numeric_limits< uint32_t >::max()); -} - -TYPED_TEST(NodeTest, RangeChangeInsert) { - if (this->m_node1->get_node_type() != btree_node_type::PREFIX) {return;} - this->put_range(0xFFFFFFFF - 10,20); - this->print(); + this->m_node1->move_out_to_right_by_entries(*this->m_node2, 20); + uint32_t copy_idx{0u}; + this->m_node1->append_copy_in_upto_size(*this->m_node2, copy_idx, std::numeric_limits< uint32_t >::max(), + /*copy_only_if_fits=*/false); } TYPED_TEST(NodeTest, ReverseInsert) { diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 5391e1685..404ba8247 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -472,7 +472,9 @@ class HSTestHelper { } else if ((svc == ServiceType::LOG)) { hsi->with_log_service(); } else 
if (svc == ServiceType::REPLICATION) { +#ifdef REPLICATION_SUPPORT hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector); +#endif } } #ifdef _PRERELEASE

zlLe#a`*Z_(ihmrGp7uqu3TdP`(K zW|_s)%Gm5EW0dsc+~qZ9KB4czl2q^+k4>I)SsBW&@dPN2^&F170R#86&%tt+3>9*PGE(Qg-q|V#|Opc;B`uEen-jP|UoZ>wcb^g8?)+ zxuIEcjw@pQ3u3?%nF!ggOEmW@AGpO_74`e-N!3t`EE^Snw-rOpdQNf;8Z`_5C^XU~ zZRXx_*(-@NDUr6UcGxc)98cI4?_1dpg1l(%1#_ylaH`cR5IfJgLv)|;M%yqEy~Vb( zi{jPiSL*2zy$zb@C^f5#V<`N{nSoh6PR>9U-`Ybtm}2!V)uvk7D^;gPnq<6<8=?6k7Ct1-I~NM>N?I&B|} zYWm2z{;EvE-dgA5P#)Fu^+vj|Q_=O;twHO`x?5{i1aEuhAiCa zEpv+wj&W{n(6YqmT2RH%#0_ZTuF194@6!pOqffEgs!i?uzJf^ zZC1k#>Ak~^Lb#^W8CI~yVsa8_K-GeRu?D4wdF#l!F&Dg2JuL|xUe_|&A5wo$h=IoR z3qzKxJnXIv^CkAhQ;)YCVb;bAxRkurJI3_SPR(8loXQG7W^4pLu4RyD+CJ#B;iIc( z4cEe#Z5voNyiJ~!l}Q|O(<=9UKyP|7OOC`FCQi!r#)rQ%zI^O|F&O=okowVvhkQU# z8ufG38&+PskhQ}1;zq3Q4{4T|@P5qQRfarodJyjPqFmyP)_sYXFLysBB-M3hVjHNh z0?U^d{bO=L)9ms7o9lGG!`_!lwM%RiCO8fgH{Q-@p z@}4PwHBIsXHd?Cd_#pkNE(<+hi;0J~)A@SMize5#R2Utum-3RA|B$(TV^9u%TPBQl ziD?winsEDL0gb!hLH1uMT}WI+x-^o1Wd6|d6+LF)`N#I!9~XbRi#dGnLi!xv-@G5! zZK#Ghy^h9jGCo<7dZju^*B0^ooje0g-Z(wE{L{yHVFP%3oH>x(_c$UUByFIp7!&8u z01whdO&vY`<1_?~<|>NL_rjhi#qTJ;l=(~IXgHwOK~$-jU5 zO86;~-}L!?M?0=k6Fy{%{*N3Wh1L6~rXdgj>Bhw<<^RRLud~ z5k#1xFSsKu@fKStmPN^BC|Ak#99dgTCF0@KEci}Ww>t>{YJ3qZ%grCSW>uAy04M9} z4h0!F4M>R&$lw^7;Z{nB&O!_^&Ef!}zV=C<%9&|8oBP3uYfaGn-bSn4l2t5?*H z=05{GIoiaSIZJtO;0^h_1#}$cdVh(8L5OaPx!lwnU!ix6epSo#+t-ENU7p=nd5u3- z8oR4nBGHp|Ownl_s&XLFrW+>oo_QW@CGtDQ0?b{cA}XB(&Z#G_d){L$5LN^p5=jw- zJNxI0P=G?cu8n{GE-N=^BDyE1}l zC!fiFq=$JFn_f}L_I|VTDUh9{|D-B4Wr9u4zxCXXkS=64va(515E+<>0XKRR5?rwQ zy*4UDf5t0B2r+|Ts;nu!_9zjCMIn!phdVXll2t!^6;$omE5KrrDe=^cNfWHfAjo?@ z!f-C1xZk}phyG`XNUlaRDBc&WB{ss@?>tywhbPQ8SZGlPVqTgw+?ug_-RA^>Gd|bLh!j|CYVpH z!DhT~=X*4wi6)vSvaOyEVrPNuOMw5?Q_*g5~HU zuW&ZziZbp&QeCTuqSo}<9rKz=i-x4B7ZE%_(nDO}IH9tVa(HYQu2lh$XsVlw>`|Fn z)gyOp1;Lr;QPU3t#r{oyJys3YbcyN-lmFlnPRY|{JNEd^pa6C;^>wdt4WWOj#Csuw z3ZAI2Tk{wv_rDxnT1HNA@QNkT_hHfca=5UqssZ*T9;{>8Fas&0F+1s8&U^4pA>h=q zY4xwgP*C|CB+ojXxLJ{ORpb1a-Vi_l?-@oeJ31Cn)5|J2HCdBi6hs?VyhmVdxtX5> zs8sJMeR_A$F!~bK0YXesH!Tjn%J9m6eKI72fzlJZjy26twA4TTR%Ff#Ncd9?4i8@1H8G2uUHeLr;Irdnn-QU8_nO-0>a_$ 
zga(F)!f=#CZDRoCGyg^U&iJx9gucfUW|*A8;b`&;`pauoj~SS6IXuhzX*=6MlisaJ zu2~vDS3Ch2XVuIz3*h5r5X*3=>ZLX`i97sape!{9{I!yO&NY7wJ4V|2o?g3a9%!Hh z(D{%2SR%xX+?(ilU7NjY>AjaS#l#js>g@_`&M^7uy#DuJLx_@@5O{`cjITX|kI(ze=_+@-YN}TZ$$qYPEN2xaMFKob_v3h5 z0(ajyb&Yj5&YjnZ3mDF3-LfUR`++83)YHws`>Q<;uTh~j$bTu~{SiwK*H0_3ebf`U z1hAaksNJjD3iIz?a;gUB&SM=)M#+DiabHbEf&`6&zGp)7pYKltE(YYKsyX^6N zcIp-bdi`j@BdP((1Kann=z)dAK7iH;fSxv|?x!vCcpRFKO>pm&=9J#!AH{2;Iis4C zdUqnYS;)XWxdKed6W}-I$!Pj7B^Y>gjv<=w2=(+;B*1I7K!(SUo8rmGS^78V8Yp^S zfMvbo;V>FUFL2KEZYf2;aT+9bBAFFazSpxG@X(iFj9sP{SaN~ejAQFY^xC(84r~n2 z&}--X@KCQ!`fkQj?)S;~3XQUzWKoAxTozXn&-`)K$l(Wdv)ad$_DjH`5Hfb3fj`|C zu#dZSeN+}`1V&Vt1vYiS0)%?mh#eNlBM|Q+(bOy59x<@G<}~Y#5_pHJs~ZVC*&`%k zYUr5@RO&3m-(lha zQl-F}Igrajh646@d@@8BV60)_JtZ*4yX%1rZ`9Ln4TcCSdToodsoBfXqKZK|8(_{8 zd^_y3iE$KL0S6$$i#J`c&~e2*L}%>e@vlg#CZS*QSIjYRLWp^%B`okUrUmm@Ke1zG zea@}nJX6fcR)eE+5l#nTOCbFnM#wwfC`J~v?c-(>1`ctNCxd%s8(=g9(w_fi;kylx zd@}}O73!cuFM|*iTK+>r;zZSfw1gVyj7K-Zu9>hwfdT}pgqFXlPR=}M#=|ztdtwpb zi3W=6TwJ&R70itKE_8Xah_EOqickE523AB4!xm-W#;n(#28(^>Y(cl%U>(kbg{p$H zEOU0VVN{#&#^2D&mO(HcGke?L~wci7AF4PtvK&C@E8`dI(_hq5&w4NPdys-D`T(ia_f83~L|=2Vs>W4Iferlz8tDGZ*2&*_ zA-7$G7|~t;s6w&$ppMXXI(?JrJ_%Mw(hifjUF5UU1Ja z%sp3#?=MD*Az8%r?3!>R8j=GLx?0+O91I|)qW5MER9}4y#k)QVo<`u?O|vI{Chx`N zxjc58@X;DsRJL`c>295OsNF$6kP17vhzL)>f>>ZoJ7x!BF6DprW)+B7f?ohjuR;`F z?O}C$T!XQji&(Sj5x@k{0wX`OK3T|y3NwBqIQ}*%?})nwnBa95vZe_-=YzDTbSgfZ z6e@nZA3f%=Y|LrG`yC9(0U0`Hey*N(v*5Jzo=ZPZH?vzI+?c~E`O5Nv*Nc@NW-fSR z32QCgeT9P0VM!K~dN>15 z$PQ9f`S?1dp5^OgHP9_|y$issC+FWfK`MDc*McV3b9&ozbzKnVu^h2eBw)P%80XPW z#gHNrTnDC-aC?wTR*P}lQr%Y)ZGim0i5WP81qjLb=Rmw;`dj-JJq0}ACZ%oNUQjdH zq|=ZB`otKdlKZ^3thLSpd%g-|NAO~fFozFMHv6mlE_Ycy%%4IY<-GDXt9yIcuNC#} zk*p8c{UCfTvTSMurC`Rbax`E2*D_hYz#KZ+0MYytilWh^OwIW>1ie9-S>V5v&+(5L zt%1yAIq}IFfONWA;6Tx=)4vmY-*O|dQxS5*L-cehmeylXoA}!`*@N4PSY&LolZ0!# zs<(gnIbv01;ztNIKE&I1h7wX|V9tz93GpP7St+7@?EXH2BTHXS5~@Sa;qdRFUKmoxu#$IY#> zDv^;d;oN61`+a`SmJsRIs|v?@iTim2$8q=Fc5>3n3~dV^y_1{}LtTRex*@FEqWn z(-N*&nyay$&!Z}*TQ-v=Gzchno<7Q0A-bdKB}b@R;z 
zN@&}&2CSR7#2;kGZb)sD>EO+Psj12J6Oo6Un4(80RRTHRohRLDidXw!FeAz0ZNqzi zP#iOn;>ad%L~kCqp!wmWKt)~A*A0#MjT&~SGn@2UrU$2$;S!u5e1{Z+7=w0dC~s|a zH((x#TYbM8HN~bHU&Wa6y=mYH6?(~xZWNHC=`J?6z;)0ta~-k>?bLbyCa)KoPm|^ zWxNjiONq5~OlN0>+y0xdI4OfTwK|RgV!n}PlA#UqqpU_L>=i!#>dfa9vR5+hV%p?h z%X|y-DHKse(5bdIPdut!JL=|2Ha?9g#XQ&n+uQgb50gsl@2mhbV>*urbF`-$Cl9HA zAOE3=)z)B(sh^R~C8w8B$u=_exq-54#cnEz7mAGbCcVs; zHY8=({EZyyZyB#%q73r|_|N?~%JL_6q$xQ2^2A+>Bze~jO*j@I#sE4#Z#d~Ja;wnG zaM>e29)Q;Qc_%-FQ;ENy+&Ec4_UHm4Lcp!O|D%+{LJkbbxQm=}e}>e5`vjXwUjysv z3wlL*<0YRf6C$}hX;0~)ygdXTP$oA#N+34X5$jGQu82)Z62k_$74^vkCK8h$6T_<^ z4F_?RA&N_{h_UYUKQAJ&iIL-#g54%0ifRySK|!FT>cA+lk7s>(|JCH(&gTUkPe;a8 zG_VuWd>P`0-Rs7m+1&p6q95-e-#LA;cZmf;*HFX$_*Ln>BO*TTdy#^E?&RwlevCt= zo{mks(v0C3HD^v+D{d#@TZ$D(JKbQw2rQ-X(a*ihfhvd_45FE7n=lq%E}mrC?yXnv z!AAMc6uE2|@W^rcq3*-x-YkA!@sDnHY`xO(J|eu8*X;>Ie@!*f%#f)$pSIuYMXxM%md=g!n45%6zy>PSC3mR@& z(fd{^C*)tdT@a*spv1;Akd_v5a_d^7VCn-JskZvsUylHnMQ?iJW9S4A)mi4$y!%-= zoiJ?Jz0`S2at<~5fXTM&jzmzMlkik#scs=}>tNG-r#ijEK+fz+o}fHE_VnUA6S3#} zpC0QsQqhOXwcpE`0u|l0a~ljYg>Bcp5GsW`^3$zAQ@6!Q6DPgKqA;?VOB!{Z-}tU2 z^||CPzq5PnOI%D78p+`VdV0-c|IV+30A2P}R~^rMAKF`>+>7hXu2+ogke9n4XyUeU zl2SG8LBlOY=q0rTFm1P#zH()2VY%L`o*RgKFY4GaDM+th*m=>kZ=_hSZh9kZy!QJ{ zw^FBfMkia^QA&1(yd}(wjHVAF&)z%cG+qxR56Wy``X1SSsD{u+DerZ=X)3}4L^3X; zQ^zK}r}+gMiRe%9NM8ZSduTPZ zD2dvYQ~lj%lfqVWoo?<4lB?aHn>)~WOJb4Alh)n8#7Qian~I~t>u z$$>UUh0o0>)TVlLCF@;5mE%4`ayGGp^HS}}ArPNT6+2&saWop;!T$@+k7O2fv+=sZ z5kY>C8NOJ;wuAC8uZ=II%*4dZ@pfYO6_lf1I_I3%bH?6EHy=}ym0spZLDLS_3K@}# zlFplAo0>+wrs)0iYz%(>OWpd++Hqw&8R{2TLVU?OuTYaYwz1VqNo3`p<&vo<8TYlR z=^rUiHEWc;(6gC|WA5u9skZ@GMRbDEBJ`MCK7se7Zybb;)9U<5o3# z-4%0LoksVtRXe@;~w3Pr~RK{ZN`V z4bn3@L}`%LUq?F2j|nwZI8M-ciYGeUl!W|OCB8Q;sqK1CPv4Nj?Wn4|eq-6XxX}|z zc|uN2yT6s4+F2Q~`(i60#+6n>^B7GWemnP~;j!Yd`~D|!4JKh{5RidaX?Q_3(^7kp z9^@-tyN@pZTv##^%->Fr{d%7s(|Uux=FwN*qJ}(E=J*Y%9oLU>b0$G9X!RyW+rYm~rT>!_+ zP#AObbg#9Zo({(qi0HMiD%Jobqh$Z~tX&(8PhnMWLs1dAp=@Ov{Ni}D_vR99!|7*D zad*RuQ}Qki$*Pc&V+Tk 
zJQ<6?P<{0w>JF%~Hmcp8e%G+(H~!W-lw&;+LC!p_XrgOo?Oz|GoGec%udn$b@V(gY z_4`RZyRtT9x4|sj1N$btBffriF(H`TpC|l=VtOq`!Ags4ouj+S=10U4@y9FlcU01)oT; z$JD>Lu*ys;T+^ubL<@Kv49Si;BBlj=DP8ERw%f;7Vvn<#l5@jWy+9lO721#6()+fl z_&WMn=xGqxYbN-DJi4(A&i#2D-j%U5R{!Dk6|bTgOXwlPxjfFYb)6Xcf`>aDhKNEW zmway7S6cpvGTy4Z&q>GzP&FLdr8pFXQrbEkH6!Rz#~$z8fk57j_5qCI{Xp8YZOhNA z!U@i+cMU-qYK|Z(Ga7S%xc^mQ{5!_IFGX4>x+ToP_y*XRsueV~pGRY_Xat|)hxQYI z%}@?w@x!cR_W0Doc>1fY-`l4$`7tIo_6Ut}Y+y-AWtG_hX^`Q4ukzUj!WdjJVQtJU zjnwg}>p_0shPS`1!*%!Klf~y_gsl<>T-Jx^Z^rmMtK3mR5a~BGFls{2qD0i7n7RE4 zRHp3aZO?ghlL|EH5okvsC^H8j=y4EwH;DrcG#_+@+zKIWn6A`b7CUYT@cdSA*43%MPPQLRm$4y#2&O}3P7yR}HQ%V!nSoo!{TwZs&g~6hLgBnvrjm_k|e+@It3C)+?z&pksLZQwi# zYmon`;Dq$@r)MO)$t)f9`tq`VG*zZ)$O8yQI$7CMC5l$ITj+s)Q5+PG1E9bj9Xid6 zG(oJwUPWg_Oq5ci3;@*x%(>7R5R{3$LVM7VdM>Kj8i$zc2S; zt~YEzCW>Uz5X0+Y6+lY~bl{~S{NK9_DP9-$Rs*6ln6jn1Wx0LZLi;7X$KCf-H}749 zO_fg%!kX&V!tGR8D51E;f)0u3PJ%1f{k-D8j0dO$rXk<1Fe@vCu?tcOq$zQ@`GG+>4sn(k5cZ;L1kCuD_#T5eJAiGcAUy6630_MQ-2N0()*UM