diff --git a/.github/workflows/build_commit.yml b/.github/workflows/build_commit.yml index 0a753fe..7ea2c9f 100644 --- a/.github/workflows/build_commit.yml +++ b/.github/workflows/build_commit.yml @@ -12,75 +12,92 @@ on: malloc-impl: required: true type: string + compiler: + required: false + type: string + default: 'gcc' + conan-channel: + required: false + type: string + default: 'dev' tooling: required: true type: string jobs: SislDeps: - uses: ebay/sisl/.github/workflows/build_dependencies.yml@stable/v13.x + uses: ebay/sisl/.github/workflows/build_dependencies.yml@dev/v14.x with: - branch: stable/v13.x + branch: dev/v14.x platform: ${{ inputs.platform }} build-type: ${{ inputs.build-type }} malloc-impl: ${{ inputs.malloc-impl }} + compiler: ${{ inputs.compiler }} + conan-channel: ${{ inputs.conan-channel }} tooling: None if: ${{ github.event_name != 'pull_request' }} NuraftMesgDeps: needs: SislDeps - uses: eBay/nuraft_mesg/.github/workflows/build_dependencies.yml@stable/v4.x + uses: eBay/nuraft_mesg/.github/workflows/build_dependencies.yml@dev/v5.x with: - branch: stable/v4.x + branch: dev/v5.x platform: ${{ inputs.platform }} build-type: ${{ inputs.build-type }} malloc-impl: ${{ inputs.malloc-impl }} + compiler: ${{ inputs.compiler }} + conan-channel: ${{ inputs.conan-channel }} tooling: None if: ${{ github.event_name != 'pull_request' }} IOMgrDeps: - needs: [SislDeps] - uses: eBay/iomanager/.github/workflows/build_dependencies.yml@stable/v12.x + needs: SislDeps + uses: eBay/iomanager/.github/workflows/build_dependencies.yml@dev/v13.x with: - branch: stable/v12.x + branch: dev/v13.x platform: ${{ inputs.platform }} build-type: ${{ inputs.build-type }} malloc-impl: ${{ inputs.malloc-impl }} - prerelease: "False" + compiler: ${{ inputs.compiler }} + conan-channel: ${{ inputs.conan-channel }} tooling: None if: ${{ github.event_name != 'pull_request' }} - HomeStoreDeps: - needs: [IOMgrDeps, NuraftMesgDeps] - uses: 
eBay/homestore/.github/workflows/build_dependencies.yml@stable/v7.x + UblkppDeps: + needs: SislDeps + uses: szmyd/ublkpp/.github/workflows/build_dependencies.yml@main with: - branch: stable/v7.x + branch: main platform: ${{ inputs.platform }} build-type: ${{ inputs.build-type }} malloc-impl: ${{ inputs.malloc-impl }} - prerelease: "False" + compiler: ${{ inputs.compiler }} + conan-channel: ${{ inputs.conan-channel }} tooling: None if: ${{ github.event_name != 'pull_request' }} - HomeBlocksDeps: - needs: [HomeStoreDeps] - uses: ./.github/workflows/build_dependencies.yml + HomeStoreDeps: + needs: [IOMgrDeps, NuraftMesgDeps] + uses: eBay/homestore/.github/workflows/build_dependencies.yml@dev/v8.x with: - branch: ${{ github.ref }} + branch: dev/v8.x platform: ${{ inputs.platform }} build-type: ${{ inputs.build-type }} malloc-impl: ${{ inputs.malloc-impl }} - tooling: ${{ inputs.tooling }} - testing: 'True' + compiler: ${{ inputs.compiler }} + conan-channel: ${{ inputs.conan-channel }} + tooling: None if: ${{ github.event_name != 'pull_request' }} - HomeBlocksBuild: + HomeBlocksDeps: + needs: [HomeStoreDeps, UblkppDeps] uses: ./.github/workflows/build_dependencies.yml with: branch: ${{ github.ref }} platform: ${{ inputs.platform }} build-type: ${{ inputs.build-type }} malloc-impl: ${{ inputs.malloc-impl }} + compiler: ${{ inputs.compiler }} + conan-channel: ${{ inputs.conan-channel }} tooling: ${{ inputs.tooling }} testing: 'True' - if: ${{ github.event_name == 'pull_request' }} diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 7091ffb..7e0625e 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -16,6 +16,14 @@ on: malloc-impl: required: true type: string + compiler: + required: false + type: string + default: 'gcc' + conan-channel: + required: false + type: string + default: 'dev' tooling: required: false type: string @@ -50,10 +58,23 @@ on: - libc - tcmalloc - 
jemalloc + compiler: + required: false + type: choice + options: + - gcc + - clang + default: 'gcc' + conan-channel: + required: false + type: string + default: 'dev' tooling: required: false type: choice - - 'Sanitize' + options: + - 'AddressSanitize' + - 'ThreadSanitize' - 'Coverage' - 'None' default: 'None' @@ -71,119 +92,181 @@ jobs: runs-on: ${{ inputs.platform }} steps: - name: Retrieve Code - uses: actions/checkout@main + uses: actions/checkout@v4 with: ref: ${{ inputs.branch }} if: ${{ inputs.testing == 'True' }} + - name: Check Code Formatting + run: | + # GitHub PR merge commit has shallow history, so fetch the base branch + git fetch --depth=50 origin ${{ github.base_ref }} + + echo "Comparing HEAD against origin/${{ github.base_ref }}" + + # Get C/C++ files changed in this PR + CHANGED_FILES=$(git diff --name-only --diff-filter=d origin/${{ github.base_ref }} | grep -E '\.(c|cpp|cc|cxx|h|hpp|hxx|ipp)$' || true) + + if [[ -z "$CHANGED_FILES" ]]; then + echo "No C/C++ files modified in this PR" + exit 0 + fi + + echo "Checking formatting for PR-modified C/C++ files:" + echo "$CHANGED_FILES" + + # Format only the changed files + echo "$CHANGED_FILES" | xargs clang-format -style=file -i -fallback-style=none + + # Check if formatting changed anything + if ! git diff --exit-code -- $CHANGED_FILES; then + echo "::error::Code formatting check failed. PR-modified files have formatting issues." + echo "::error::Run './apply-clang-format.sh' locally and commit the changes." 
+ exit 1 + fi + + echo "βœ“ All PR-modified files are properly formatted" + if: ${{ github.event_name == 'pull_request' && inputs.testing == 'True' }} + - name: Retrieve Recipe - uses: actions/checkout@main + uses: actions/checkout@v4 with: repository: eBay/HomeBlocks ref: ${{ inputs.branch }} if: ${{ inputs.testing == 'False' }} - name: Setup Conan - uses: ebay/sisl/.github/actions/setup_conan2@stable/v13.x + uses: ebay/sisl/.github/actions/setup_conan2@dev/v14.x with: platform: ${{ inputs.platform }} + compiler: ${{ inputs.compiler }} if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - name: Load HomeBlocks Cache id: restore-cache - uses: eBay/sisl/.github/actions/load_conan2@stable/v13.x + uses: ebay/sisl/.github/actions/load_conan2@dev/v14.x with: testing: ${{ inputs.testing }} key_prefix: HomeBlocksDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} - name: Load Sisl Cache - uses: eBay/sisl/.github/actions/load_conan2@stable/v13.x + uses: eBay/sisl/.github/actions/load_conan2@dev/v14.x with: - load_any: 'True' - key_prefix: SislDeps13-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} + testing: 'False' + path: import/sisl + key_prefix: SislDeps14-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.compiler }} + fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - name: Load IOMgr Cache - uses: eBay/sisl/.github/actions/load_conan2@stable/v13.x - with: - load_any: 'True' - key_prefix: IOMgrDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-False - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - - name: Retrieve Dependencies NuRaftMesg - uses: actions/checkout@main + uses: eBay/sisl/.github/actions/load_conan2@dev/v14.x with: - repository: eBay/nuraft_mesg - path: import/nuraft_mesg - ref: stable/v4.x + testing: 
'False' + path: import/iomgr + key_prefix: IOMgrDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} + fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Load NuRaftMesg Cache - uses: eBay/sisl/.github/actions/load_conan2@stable/v13.x + - name: Load NuraftMesg Cache + uses: eBay/sisl/.github/actions/load_conan2@dev/v14.x with: testing: 'False' path: import/nuraft_mesg key_prefix: NuMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} + fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Retrieve Dependencies HomeStore - uses: actions/checkout@main + - name: Load Ublkpp Cache + uses: eBay/sisl/.github/actions/load_conan2@dev/v14.x with: - repository: eBay/HomeStore - path: import/homestore - ref: stable/v7.x + testing: 'False' + path: import/ublkpp + key_prefix: UblkPPDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} + fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - name: Load HomeStore Cache - uses: eBay/sisl/.github/actions/load_conan2@stable/v13.x + uses: eBay/sisl/.github/actions/load_conan2@dev/v14.x with: testing: 'False' path: import/homestore - key_prefix: HomestoreDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-False + key_prefix: HomestoreDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} + fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Retrieve Dependencies + uses: actions/checkout@v4 + with: + repository: szmyd/ublkpp + path: import/ublkpp + ref: main + if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Export Recipes + - name: Retrieve Dependencies + uses: actions/checkout@v4 + with: + 
repository: eBay/HomeStore + path: import/homestore + ref: dev/v8.x + if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + + - name: Prepare Recipes run: | - sudo apt-get install -y python3-pyelftools libaio-dev - python -m pip install pyelftools - conan export import/homestore - conan export import/nuraft_mesg + # make the following removed directory configurable, for example, using an environment variable. + sudo rm -rf $ANDROID_HOME + sudo rm -rf /usr/lib/llvm* + sudo rm -rf /usr/lib/jvm + sudo rm -rf /usr/lib/google-cloud-sdk + pushd import/ublkpp + ./prepare_v2.sh + conan export --user oss --channel ${{ inputs.conan-channel }} + popd + pushd import/homestore + conan export --user oss --channel ${{ inputs.conan-channel }} + popd if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Create and Test Package + - name: Build Cache run: | - sanitize=$([[ "${{ inputs.tooling }}" == "Sanitize" ]] && echo "True" || echo "False") - conan build \ + conan install \ -o sisl/*:malloc_impl=${{ inputs.malloc-impl }} \ - -o iomgr/*:testing=off \ - -o homestore/*:testing=off \ - -o homeblocks/*:sanitize=${sanitize} \ -s:h build_type=${{ inputs.build-type }} \ - -s:h compiler.cppstd=23 \ + -c tools.build:skip_test=True \ --format=json \ --build missing \ . 
> ~/build.json - conan list --graph ~/build.json --graph-binaries=build --format=json > ~/pkglist.json - if: ${{ inputs.testing == 'True' && inputs.tooling != 'Coverage' }} + conan list --graph ~/build.json --graph-binaries=build,cache --format=json > ~/pkglist.json + if: ${{ steps.restore-cache.outputs.cache-hit != 'true' }} - name: Save Conan Cache - uses: eBay/sisl/.github/actions/store_conan2@stable/v13.x + uses: eBay/sisl/.github/actions/store_conan2@dev/v14.x with: - key_prefix: HomeBlocksDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} + key_prefix: HomeBlocksDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }} if: ${{ github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' && inputs.tooling != 'Coverage'}} + - name: Create and Test Package + run: | + conan cache clean -s -b + if [[ "${{ inputs.tooling }}" == "AddressSanitize" ]]; then sanitize="address" + elif [[ "${{ inputs.tooling }}" == "ThreadSanitize" ]]; then sanitize="thread" + else sanitize="False"; fi + conan create \ + -o sisl/*:malloc_impl=${{ inputs.malloc-impl }} \ + -o homeblocks/*:sanitize=${sanitize} \ + -s:h build_type=${{ inputs.build-type }} \ + ${{ inputs.conan-channel != '' && format('--user oss --channel {0}', inputs.conan-channel) || '' }} \ + . + if: ${{ inputs.testing == 'True' && inputs.tooling != 'Coverage' }} + - name: Code Coverage Run run: | + conan cache clean -s -b python -m pip install gcovr conan build \ -o sisl/*:malloc_impl=${{ inputs.malloc-impl }} \ - -o iomgr/*:testing=off \ - -o homestore/*:testing=off \ -o homeblocks/*:coverage=True \ -s:h build_type=${{ inputs.build-type }} \ - -s:h compiler.cppstd=23 \ - --build missing \ + ${{ inputs.conan-channel != '' && format('--user oss --channel {0}', inputs.conan-channel) || '' }} \ . 
gcovr --cobertura ./coverage.xml if: ${{ inputs.testing == 'True' && inputs.tooling == 'Coverage' }} @@ -193,4 +276,4 @@ jobs: with: files: ./coverage.xml disable_search: true - if: ${{ inputs.testing == 'True' && inputs.tooling == 'Coverage' }} + if: ${{ inputs.tooling == 'Coverage' }} diff --git a/.github/workflows/conan_build.yml b/.github/workflows/conan_build.yml index e144bb4..835a9b6 100644 --- a/.github/workflows/conan_build.yml +++ b/.github/workflows/conan_build.yml @@ -4,34 +4,47 @@ on: workflow_dispatch: push: branches: - - main + - dev/v6.x pull_request: branches: - - main - + - dev/v6.x jobs: - Build: - strategy: - fail-fast: false - matrix: - platform: ["ubuntu-24.04"] - build-type: ["Debug", "Release"] - malloc-impl: ["libc", "tcmalloc"] - tooling: ["Sanitize", "Coverage", "None"] - exclude: - - build-type: Debug - tooling: None - - build-type: Debug - malloc-impl: tcmalloc - - build-type: Release - malloc-impl: libc - - build-type: Release - tooling: Sanitize - - build-type: Release - tooling: Coverage + GccAddressSanitize: + uses: ./.github/workflows/build_commit.yml + with: + platform: "ubuntu-24.04" + build-type: "Debug" + malloc-impl: "libc" + compiler: "gcc" + conan-channel: "dev" + tooling: "AddressSanitize" + + GccThreadSanitize: + uses: ./.github/workflows/build_commit.yml + with: + platform: "ubuntu-24.04" + build-type: "Debug" + malloc-impl: "libc" + compiler: "gcc" + conan-channel: "dev" + tooling: "ThreadSanitize" + + GccCoverage: + uses: ./.github/workflows/build_commit.yml + with: + platform: "ubuntu-24.04" + build-type: "Debug" + malloc-impl: "libc" + compiler: "gcc" + conan-channel: "dev" + tooling: "Coverage" + + GccRelease: uses: ./.github/workflows/build_commit.yml with: - platform: ${{ matrix.platform }} - build-type: ${{ matrix.build-type }} - malloc-impl: ${{ matrix.malloc-impl }} - tooling: ${{ matrix.tooling }} + platform: "ubuntu-24.04" + build-type: "Release" + malloc-impl: "tcmalloc" + compiler: "gcc" + conan-channel: 
"dev" + tooling: "None" diff --git a/.github/workflows/version_change_check.yml b/.github/workflows/version_change_check.yml index c83adec..c007a85 100644 --- a/.github/workflows/version_change_check.yml +++ b/.github/workflows/version_change_check.yml @@ -10,12 +10,18 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: fetch-depth: 2 - - name: Check if file is modified + - name: Check if version bump is required run: | - if git diff -r HEAD^1 HEAD -- conanfile.py | egrep "[ ]+version = "; then + changed=$(git diff --name-only HEAD^1 HEAD) + non_test=$(echo "$changed" | grep -Ev '^src/tests/|CMakeLists\.txt$|^\.github/|^\.jenkins|\.md$' || true) + if [ -z "$non_test" ]; then + echo "Only test files changed β€” version bump not required" + exit 0 + fi + if git diff -r HEAD^1 HEAD -- conanfile.py | grep -E "[ ]+version = "; then echo "Version is updated with this PR, OK" else echo "Conan version is not updated with this PR. Please update that to allow PR merge" diff --git a/.jenkins/Jenkinsfile b/.jenkins/Jenkinsfile index d6143dd..3d1f224 100644 --- a/.jenkins/Jenkinsfile +++ b/.jenkins/Jenkinsfile @@ -3,91 +3,63 @@ pipeline { environment { ARTIFACTORY_PASS = credentials('ARTIFACTORY_PASS') - CONAN_USER = 'oss' - TARGET_BRANCH = 'main' - STABLE_BRANCH = 'stable/v*' - failed_pkg = '' - } - - parameters { - string(defaultValue: "", description: 'UpstreamTriggered', name: 'upstream_triggered') } stages { - stage('Adjust Tag for Master/PR') { - when { not { - branch "${STABLE_BRANCH}" - } } + stage('Mission Briefing') { steps { script { - BUILD_MISSING = "--build missing" + PROJECT = sh(script: "grep -m 1 'name =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) + CONAN_FLAGS = "-s:h compiler.cppstd=23 -c tools.build:skip_test=True --build missing --build '${PROJECT}*'" + VER = sh(script: "grep -m 1 ' version =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) 
+ SLACK_THREAD = '#sds-ci' } } } - stage('Adjust for Testing/Stable') { - when { - branch "${STABLE_BRANCH}" - } + + stage('Test Flight') { + when { not { branch "stable/v*" } } steps { script { - BUILD_MISSING = "" + CONAN_CHANNEL = sh(script: "echo ${BRANCH_NAME} | sed -E 's,(\\w+-?\\d*)/.*,\\1,' | sed -E 's,-,_,' | tr -d '\n'", returnStdout: true) + CONAN_FLAGS = "${CONAN_FLAGS} --user oss --channel ${CONAN_CHANNEL}" } } } - stage('Get Version') { + stage("Pre-flight") { steps { script { - PROJECT = sh(script: "grep -m 1 'name =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) - VER = sh(script: "grep -m 1 ' version =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) - CONAN_CHANNEL = sh(script: "echo ${BRANCH_NAME} | sed -E 's,(\\w+-?\\d*)/.*,\\1,' | sed -E 's,-,_,' | tr -d '\n'", returnStdout: true) - TAG = "${VER}@${CONAN_USER}/${CONAN_CHANNEL}" - CONAN_FLAGS="--name ${PROJECT} --user ${CONAN_USER} --channel ${CONAN_CHANNEL}" - slackSend color: '#0063D1', channel: '#sds-ci', message: "*${PROJECT}/${TAG}* is building." + def slackResponse = slackSend(color: '#0063D1', channel: '#sds-ci', message: "πŸš€ *${PROJECT}* `${VER}` on `${BRANCH_NAME}` β€” build #${BUILD_NUMBER} has begun systems check\nCountdown initiated. Manage your expectations.\n<${BUILD_URL}|Watch nervously>") + SLACK_THREAD = slackResponse?.threadId ?: '#sds-ci' + sh "conan graph info -s:h build_type=Debug ${CONAN_FLAGS} ." + sh "conan graph info -s:h build_type=RelWithDebInfo -o sisl/*:malloc_impl=tcmalloc ${CONAN_FLAGS} ." } } } - stage("Compile") { + stage("Ignition") { steps { - sh "hostname ; \ - echo $NODE_NAME ; \ - conan build -s:h compiler.cppstd=23 ${BUILD_MISSING} -s:h build_type=Debug -o ${PROJECT}/*:sanitize=True ${CONAN_FLAGS} . ; \ - conan create -s:h compiler.cppstd=23 ${BUILD_MISSING} -s:h build_type=Debug ${CONAN_FLAGS} . 
; \ - conan create -s:h compiler.cppstd=23 ${BUILD_MISSING} -s:h build_type=RelWithDebInfo -o sisl/*:malloc_impl=tcmalloc ${CONAN_FLAGS} . ; \ - " + script { + slackSend color: '#9B59B6', channel: "${SLACK_THREAD}", message: "β›½ *${PROJECT}* `${BRANCH_NAME}` build #${BUILD_NUMBER} \nAll systems Go...going somewhere anyways...\nLet's light this πŸ•―οΈ... 3... 2... 1...!" + } + sh "conan create -s:h build_type=Debug ${CONAN_FLAGS} ." + script { + slackSend color: '#E67E22', channel: "${SLACK_THREAD}", message: "πŸ§‘β€πŸš€ *${PROJECT}* `${BRANCH_NAME}`\n*Debug: green.* One more build between us and home.\nI've been in here long enough to start naming the compiler warnings. Don't ask.\nRelWithDebInfo... *please don't fail me now.*" + } + sh "conan create -s:h build_type=RelWithDebInfo -o sisl/*:malloc_impl=tcmalloc ${CONAN_FLAGS} ." } - post { - failure { script { sleep 3600000 } } - } } - stage("Deploy") { + stage("Touchdown") { when { expression { !(env.BRANCH_NAME =~ /PR-/) } } steps { sh "conan remote login -p ${ARTIFACTORY_PASS} ebay-local _service_sds" - sh "conan graph info ./ | grep 'ref: ' | awk '{print \$2}' | sort | uniq | grep -v ${PROJECT} | grep -v '#' | while read pkg; do conan upload -r ebay-local -c \"\${pkg}\"; done" - sh "conan upload ${PROJECT}/${TAG} -c -r ebay-local" - } - } - stage("Downstream Build") { - when { allOf { - expression { (env.BRANCH_NAME == "${TARGET_BRANCH}") } - expression { (!"${upstream_triggered}") || ("${upstream_triggered}" == "") } - } } - - stages { - stage('BlockManager') { - steps { - script { - def hblk_res = build job: "BlockMgr/main", parameters: [[$class: 'StringParameterValue', name: 'upstream_triggered', value: 'true']], propagate: true - } - } - post { - failure { script { failed_pkg = "BlockMgr" } } - } + sh "conan upload '*:*' -r ebay-local --confirm" + script { + slackSend color: '#8A6BBF', channel: "${SLACK_THREAD}", message: "πŸ“¦ (`${BRANCH_NAME}`) β€” everyone survived long enough to land in 
Artifactory\nPackage `${PROJECT}@${VER}` is live and ready to pull downstream" } } } @@ -96,15 +68,16 @@ pipeline { post { failure { script { - if ("${failed_pkg}" != "") { - slackSend color: '#E43237', channel: '#sds-ci', message: "@here HomeBlocks downstream pkg - *${failed_pkg}* build failed.\n*URL:* ${BUILD_URL}" - } else { - slackSend color: '#E43237', channel: '#sds-ci', message: "*${PROJECT}/${TAG}* has had a failure : ${BUILD_URL}" - } + def log = currentBuild.rawBuild.getLog(40) + .collect { it.replaceAll(/\x1B\[[0-9;]*[a-zA-Z]/, '') } + .dropRight(15) + .join('\n') + def fence = '```' + slackSend color: '#E43237', channel: "${SLACK_THREAD}", replyBroadcast: true, message: "πŸ’€ *${PROJECT}* `${VER}` (`${BRANCH_NAME}`) build #${BUILD_NUMBER} didn't make it\nHeld on for _${currentBuild.durationString}_ before calling it quits\n<${BUILD_URL}|Post-mortem>\nπŸͺ¦\n${fence}${log}${fence}" } } success { - slackSend color: '#85B717', channel: '#sds-ci', message: "*${PROJECT}/${TAG}* has completed." + slackSend color: '#85B717', channel: "${SLACK_THREAD}", replyBroadcast: true, message: "βœ… *${PROJECT}* `${VER}` (`${BRANCH_NAME}`) build #${BUILD_NUMBER} β€” it actually worked\nDebug + RelWithDebInfo both green in _${currentBuild.durationString}_\n<${BUILD_URL}|Full report>\n🍾 Mission successful!" } } } diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c72656..02dc151 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,23 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [6.0.0] + +### Changed + +- Migrated the I/O and control paths off Folly futures onto the C++23 stackless-coroutine stack + (HomeStore v8 / iomgr v13 / sisl v14.6); `async_*` operations are now `co_await`-able `sisl::async::task`s. 
+- Redesigned the public API down to a single installed header, ``: `init_homeblocks()` + takes a `home_blocks_config` value (devices, threads, `on_svc_id` hook) and returns + `result>`; volume I/O is the byte-addressed free functions `async_read` / + `async_write` / `async_unmap(volume_handle, addr, sg_list)` returning `async_result`; error handling + is unified on `std::expected` (`volume_error` for HomeBlocks-specific failures, + `std::errc` otherwise). + +### Removed + +- Folly dependency; the `HomeBlocksApplication` consumer interface (replaced by `home_blocks_config`); and the + heap-allocated per-I/O request object from the public surface. ### Added diff --git a/CMakeLists.txt b/CMakeLists.txt index a926433..f5b219f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,31 +1,28 @@ -cmake_minimum_required (VERSION 3.11) +cmake_minimum_required(VERSION 3.11) project (homeblocks LANGUAGES CXX) +enable_testing() set(CMAKE_CXX_STANDARD 23) -if (NOT DEFINED CMAKE_BUILD_TYPE) - set (CMAKE_BUILD_TYPE "Debug") +if ((DEFINED BUILD_COVERAGE) AND (${BUILD_COVERAGE})) + enable_testing() + include (cmake/coverage.cmake) + APPEND_COVERAGE_COMPILER_FLAGS() + SETUP_TARGET_FOR_COVERAGE_GCOVR_XML(NAME coverage EXECUTABLE ctest DEPENDENCIES ) +elseif ((DEFINED THREAD_SANITIZER_ON) AND (${THREAD_SANITIZER_ON})) + set(SANITIZER_TYPE "thread") + include (cmake/sanitize.cmake) +elseif ((DEFINED ADDRESS_SANITIZER_ON) AND (${ADDRESS_SANITIZER_ON})) + set(SANITIZER_TYPE "address") + include (cmake/sanitize.cmake) endif() - include (cmake/Flags.cmake) -enable_testing() -if ((DEFINED CODE_COVERAGE) AND (${CODE_COVERAGE})) - include (cmake/CodeCoverage.cmake) - APPEND_COVERAGE_COMPILER_FLAGS() -elseif ((DEFINED MEMORY_SANITIZER_ON) AND (${MEMORY_SANITIZER_ON})) - message(WARNING "********* Running with Memory Sanitizer *********") - add_flags("-fsanitize=address \ - -fsanitize=undefined \ - -fsanitize-address-use-after-scope \ - -fno-sanitize=alignment \ - 
-DCDS_ADDRESS_SANITIZER_ENABLED \ - -fno-omit-frame-pointer \ - -fno-optimize-sibling-calls " - ) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address -fsanitize=undefined") -endif() find_package(GTest QUIET REQUIRED) +if (CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif () # set btree type flag if ((DEFINED USE_FIXED_INDEX) AND (${USE_FIXED_INDEX})) @@ -36,11 +33,6 @@ else() add_definitions(-DUSE_FIXED_INDEX=0) endif() -find_program(CCACHE_FOUND ccache QUIET) -if (CCACHE_FOUND) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) -endif () # add component version information add_flags("-DPACKAGE_NAME=${PROJECT_NAME}") diff --git a/README.md b/README.md index aa49532..9157126 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,322 @@ # HomeBlocks -## Build -To build (assuming a recent version of conan package manager is installed) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) + +> A crash-consistent block-volume store built on [HomeStore](https://github.com/eBay/HomeStore) β€” thin-provisioned +> volumes with a replicated, checkpointed data path, exposed through a small C++23 coroutine API. + +HomeBlocks turns a set of raw devices into named **volumes**: each is a sparse, block-addressable store with its +own `LBA β†’ block` index, per-block CRC, and a write-ahead journal, all riding HomeStore's replication, indexing, +and checkpoint machinery. The public surface is one header and a handful of coroutine entry points. + +## πŸš€ Features + +- **Volumes** β€” create / destroy / look up thin-provisioned block volumes; survive process restarts and crashes + via journal replay and a destroy-resume path. 
+- **Coroutine I/O** β€” byte-addressed `async_read` / `async_write` / `async_unmap` return lazy + [stdexec](https://github.com/NVIDIA/stdexec) coroutines (`sisl::async::task`); `co_await` one, or fan out a + batch with `when_all`. No callbacks, no futures. +- **One error type** β€” every fallible call returns `std::expected`; domain failures use + a small `volume_error` enum, everything else maps to `std::errc`. No exceptions on the I/O path. +- **Opaque handles + factory init** β€” consumers never construct an implementation type; they hold a + `volume_handle` and a `std::shared_ptr` produced by `init_homeblocks()`. +- **Minimal surface** β€” the entire public API is a single installed header, ``. +- **Replicated & checkpointed** β€” data, index, and journal flow through HomeStore's repl-dev / index / CP + services; HomeBlocks owns volume lifecycle and recovery on top. + +## πŸ“‹ Table of Contents + +- [Quick Start](#-quick-start) +- [Architecture](#%EF%B8%8F-architecture) +- [Using HomeBlocks](#-using-homeblocks) +- [Development](#%EF%B8%8F-development) +- [Testing](#-testing) + - [Exercising a volume as a block device (ublk)](#exercising-a-volume-as-a-real-block-device-ublk) +- [Dependencies](#-dependencies) +- [License](#-license) + +## πŸƒ Quick Start + +### Prerequisites + +- Linux (io_uring-capable kernel) +- Conan 2.x +- CMake 3.22+ +- A C++23 compiler (GCC 13+, Clang 17+) + +### Build + +```bash +git clone https://github.com/eBay/HomeBlocks +cd HomeBlocks +conan build . -s build_type=Debug --build missing +``` + +This configures, builds the `homeblocks` library, and runs the unit tests. Artifacts land under +`build/Debug/`. + +### Build Options + +```bash +# Release +conan build . -s build_type=Release --build missing + +# Coverage report (build/Coverage/) +conan build . -o "homeblocks/*:coverage=True" --build missing + +# Address / thread sanitizer (build/Sanitized/) +conan build . 
-o "homeblocks/*:sanitize=True" --build missing + +# Index layout: fixed (default) vs prefix-compressed btree +conan build . -o "homeblocks/*:fixed_index=False" --build missing +``` + +## πŸ—οΈ Architecture + +HomeBlocks is the volume layer. It owns volume identity, the per-volume `LBA β†’ block` index, checksums, the +write journal, and crash recovery; HomeStore underneath provides replication, the index/data services, and +checkpointing; iomgr provides the io_uring reactor model. + +```mermaid +graph TD + app["Consumer
(e.g. a CSI / target daemon)"] + subgraph hb["HomeBlocks"] + hbi["home_blocks
volume control plane + recovery"] + vol["volume
index (LBA→blk) · CRC · journal"] + io["async_read / async_write / async_unmap"] + end + subgraph hs["HomeStore"] + repl["repl_dev / replication"] + idx["index service"] + data["data service"] + cp["checkpoint (CP)"] + end + iomgr["iomgr
io_uring reactors"] + dev[("Devices")] + + app -->|"init_homeblocks / create_volume"| hbi + app -->|"async_write(vol, addr, sgs)"| io + io --> vol + hbi --> vol + vol --> repl + vol --> idx + repl --> data + idx --> cp + data --> iomgr + iomgr --> dev +``` + +### Project Structure + +``` +HomeBlocks/ +β”œβ”€β”€ src/include/homeblks/ +β”‚ └── home_blocks.hpp # the entire public API (one installed header) +β”œβ”€β”€ src/lib/ +β”‚ β”œβ”€β”€ homeblks_impl.{hpp,cpp} # home_blocks instance: init, shutdown, recovery, reaper +β”‚ β”œβ”€β”€ volume_mgr.cpp # control plane + free-function I/O (async_read/write/unmap) +β”‚ β”œβ”€β”€ volume/ # the volume: index tables, repl-dev I/O, chunk selector, io_req +β”‚ β”œβ”€β”€ listener.{hpp,cpp} # HomeStore repl_dev_listener (on_commit / snapshot hooks) +β”‚ β”œβ”€β”€ memory_backend/ # in-memory variant used by tests +β”‚ β”œβ”€β”€ hb_internal.hpp # internal prelude (LOG* macros, size constants, aliases) β€” not installed +β”‚ └── tests/ # gtest unit + I/O tests +└── conanfile.py +``` + +### Core Abstractions + +- **`home_blocks`** β€” opaque handle to a running instance, produced by `init_homeblocks()`. Owns the volume + control plane (`create_volume` / `remove_volume` / `get_volume` / `volume_ids` / stats). +- **`volume_handle`** (`std::shared_ptr`) β€” opaque handle to one volume; the I/O free functions take it. +- **`home_blocks_config`** β€” bring-up config (devices, reactor threads, memory budget, and a cold-boot + `on_svc_id` identity hook). +- **`async_read` / `async_write` / `async_unmap`** β€” free functions over a `volume_handle`; byte-addressed, + scatter-gather, coroutine-returning. +- **`result` / `async_result`** β€” the synchronous and coroutine flavors of the one error surface. 
+ +## 📦 Using HomeBlocks + +Everything is in one header: + +```cpp +#include <homeblks/home_blocks.hpp> +using namespace homeblocks; +``` + +### Bring up an instance + +```cpp +auto hb_res = init_homeblocks(home_blocks_config{ + .devices = {{"/dev/nvme0n1"}, {"/dev/nvme1n1"}}, + .threads = 2, + // Optional: on first boot (no persisted svc id) HomeBlocks calls this to fetch/assign one -- e.g. a gRPC + // to an orchestrator. Resolve the (possibly rotated) client inside the closure. + .on_svc_id = [&](/* */) -> async_result { co_return co_await orch.register_node(); }, +}); +if (!hb_res) { /* hb_res.error().message() — e.g. OM unreachable */ return; } +std::shared_ptr<home_blocks> hb = *hb_res; +``` + +### Volumes and I/O (in a coroutine) + +```cpp +sisl::async::task demo(std::shared_ptr<home_blocks> hb) { + // create_volume hands back the volume; the handle is often discarded and re-fetched with get_volume() later. + auto vol = co_await hb->create_volume(volume_info{uuid, /*size*/ 1ull << 30, /*page_size*/ 4096, "vol1"}); + if (!vol) co_return; // vol.error() + + // Write 8 KiB at byte offset 0. addr/len are RAW BYTE offsets (block-aligned); the sg_list carries the data. + sisl::sg_list sgs{.size = 8192, .iovs = {iovec{buf, 8192}}}; + auto w = co_await async_write(*vol, /*addr=*/0, sgs); + if (!w) { /* w.error(): std::errc::no_space_on_device, volume_error::OFFLINE, ... */ } + + // Fan out independent ops with when_all instead of chaining: + auto [a, b] = co_await sisl::async::when_all(async_read(*vol, 0, rsgs), async_write(*vol2, 4096, wsgs)); +} +``` + +From a non-coroutine context (a gRPC handler, `main`), drive the lazy task to completion off-reactor with +`stdexec::sync_wait(...)`. 
+ +### Error handling + +One type across the surface — `std::expected`: + +```cpp +auto v = hb->get_volume(id); +if (!v) { + if (v.error() == volume_error::UNKNOWN_VOLUME) { /* no such volume */ } + else if (v.error() == std::errc::operation_not_supported) { /* restricted mode */ } + return; +} +auto& vol = *v; +``` + +`volume_error` holds only HomeBlocks-specific failures (`UNKNOWN_VOLUME`, `CRC_MISMATCH`, `INDEX_ERROR`, +`INTERNAL_ERROR`, `OFFLINE`); anything with a standard meaning is returned as `std::errc::*` +(`invalid_argument`, `no_space_on_device`, `io_error`, `operation_not_supported`, ...). Both compare directly +against the returned `error_condition`. + +## 🛠️ Development + +### Code Style + +- 4-space indent, 120-column lines, `#pragma once`, C++23. +- Run `./apply-clang-format.sh` before committing (CI enforces it). + +### Naming + +| Element | Convention | Example | +|---|---|---| +| Public API types & functions (`include/homeblks/`) | `lower_snake_case` | `home_blocks`, `volume_handle`, `async_write()` | +| Public factories | `init_*` / free functions | `init_homeblocks()` | +| Internal implementation classes (`src/lib/`) | `PascalCase` | `HomeBlocksImpl`, `VolumeChunkSelector` | +| Methods | `snake_case` | `create_volume()`, `get_volume()` | +| Members | trailing `_` | `vol_map_`, `config_` | + +Implementation types (`HomeBlocksImpl`, the concrete `volume`, the chunk selector) never appear in the installed +header; consumers only ever see opaque handles produced by the factory functions. + +### Workflow + +```bash +# 1. write code + tests +# 2. format +./apply-clang-format.sh +# 3. build + test +conan build . -s build_type=Debug --build missing +``` + +## 🧪 Testing + +Google Test, built and run as part of `conan build`.
The suite covers volume lifecycle/recovery, the data I/O +path (with read-back CRC verification), and the chunk selector: + +```bash +build/Debug/src/lib/volume/tests/test_volume --dev_size_mb=2048 --data_chunk_size_mb=64 --index_chunk_size_mb=64 +build/Debug/src/lib/volume/tests/test_volume_io --dev_size_mb=2048 --data_chunk_size_mb=64 --index_chunk_size_mb=64 +build/Debug/src/lib/volume/tests/test_volume_chunk_selector ``` - $ conan create . + +> Size test devices so the data/index **chunk size** fits — the defaults are large; pass +> `--data_chunk_size_mb` / `--index_chunk_size_mb` for small backing files, otherwise the volume's chunk pool +> comes up empty and `create_volume` fails. + +### Exercising a volume as a real block device (ublk) + +Beyond the gtest suite, `src/test/` ships a small adapter that exposes a HomeBlocks volume as a Linux +[ublk](https://docs.kernel.org/block/ublk.html) device (`/dev/ublkbN`) — so you can point **standard block +tooling** (`fio`, `dd`, `mkfs`, `mount`) straight at the data path. The `homeblk_ublk` CLI brings up an instance, +creates (or recovers) a volume, and serves its I/O on HomeBlocks' iomgr reactors via +[ublkpp](https://github.com/eBay/ublkpp). It is built as part of the normal test build (ublkpp is a +`test_requires`), landing at `build/Debug/src/test/homeblk_ublk`. + +**Prerequisites:** a `ublk_drv`-capable kernel (≥ 5.19; `sudo modprobe ublk_drv` if `/dev/ublk-control` is +missing) and **root** — the control device is root-only. + +```bash +# Bring up HomeBlocks on a backing device and expose a 1 GiB volume. +# --create_device makes the backing store a file of --dev_size_mb (handy for a quick try); omit it to use an +# existing file or raw block device -- which HomeBlocks will FORMAT, destroying its current contents.
+sudo build/Debug/src/test/homeblk_ublk \ + --device /var/tmp/hb.dev --create_device --dev_size_mb 8192 \ + --vol_size_mb 1024 --data_chunk_size_mb 512 --index_chunk_size_mb 256 \ + --num_threads 4 -c +# -> prints "homeblocks volume exposed at: /dev/ublkbN" and stays running until Ctrl-C. +``` + +| Option | Meaning | +|---|---| +| `--device <path>[,...]` | HomeBlocks backing device(s). **Formatted on use — existing contents are destroyed.** | +| `--create_device` / `--dev_size_mb` | Create the backing path(s) as files of the given size first. | +| `--vol_id <uuid>` | Volume to expose — recovered if it already exists, else created (default: random). | +| `--vol_size_mb` / `--page_size` | Volume size and logical block size (default 4096) when creating. | +| `--num_threads` | HomeBlocks iomgr reactor count. | +| `--data_chunk_size_mb` / `--index_chunk_size_mb` | HomeStore chunk sizing (see the note below). | +| `--device_id <n>` | ublk device id: `-1` to assign one, `>=0` to recover a kernel-preserved device. | + +In another terminal, drive I/O at the printed device. All raw I/O is `O_DIRECT` with the volume's page size as +the logical block size, so keep it block-aligned (`--direct=1 bs=4k`): + +```bash +DEV=/dev/ublkbN + +# integrity: write + read-back CRC verify +sudo fio --name=v --filename=$DEV --direct=1 --rw=randwrite --bs=4k --size=512M \ + --ioengine=io_uring --iodepth=32 --verify=crc32c --verify_fatal=1 + +# a real filesystem round-trip +sudo mkfs.ext4 -F $DEV && sudo mount $DEV /mnt && sudo cp -r some/files /mnt && sudo umount /mnt ``` + +**Tear down** by `Ctrl-C`-ing `homeblk_ublk` — but **`umount` any filesystem first**: clean shutdown removes the +ublk device, which blocks until it is unmounted. Volume data persists in the backing device; re-expose it later +with the same `--vol_id` and **without** `--create_device`.
+ +> **Sizing matters for sustained writes.** HomeBlocks is copy-on-write, so sustained *random overwrite* churns +> HomeStore's free-block allocator and checkpoint. Give the volume **many data chunks** (a smaller +> `--data_chunk_size_mb` relative to the volume) and enough `--num_threads`, or per-chunk contention surfaces as +> throughput dips and latency spikes. For a disk-free benchmark, back it with a ramdisk +> (`sudo modprobe brd rd_nr=1 rd_size=$((12*1024*1024))` → `--device /dev/ram0`, no `--create_device`). One fio +> gotcha: `--verify` with `--numjobs>1` over a *shared* range reports false mismatches (writers overwrite each +> other's blocks) — give each job a disjoint `--offset_increment` region, or use a single writer. + +## 📦 Dependencies + +| Dependency | Version | Role | +|---|---|---| +| [HomeStore](https://github.com/eBay/HomeStore) | ^8.0 | replication, index & data services, checkpoints | +| [iomgr](https://github.com/eBay/IOManager) | ^13.0 | io_uring reactor model, drive I/O | +| [sisl](https://github.com/eBay/sisl) | ^14.6 | `async` coroutine substrate, logging, metrics, options | +| [stdexec](https://github.com/NVIDIA/stdexec) | (via sisl) | P2300 sender/receiver; `exec::task` | +| GoogleTest | 1.17 | unit / functional tests | + +Built and packaged with Conan 2; requires C++23. + +## 📚 Documentation + +- **[CHANGELOG.md](CHANGELOG.md)** — version history. + +## 📄 License + +Licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE). diff --git a/cmake/coverage.cmake b/cmake/coverage.cmake new file mode 100644 index 0000000..932c3d0 --- /dev/null +++ b/cmake/coverage.cmake @@ -0,0 +1,303 @@ +# Copyright (c) 2012 - 2017, Lars Bilke +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1.
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CHANGES: +# +# 2012-01-31, Lars Bilke +# - Enable Code Coverage +# +# 2013-09-17, Joakim Söderberg +# - Added support for Clang. +# - Some additional usage instructions. +# +# 2016-02-03, Lars Bilke +# - Refactored functions to use named parameters +# +# 2017-06-02, Lars Bilke +# - Merged with modified version from github.com/ufz/ogs +# +# +# USAGE: +# +# 1. Copy this file into your cmake modules path. +# +# 2. Add the following line to your CMakeLists.txt: +# include(CodeCoverage) +# +# 3. Append necessary compiler flags: +# APPEND_COVERAGE_COMPILER_FLAGS() +# +# 4.
If you need to exclude additional directories from the report, specify them +# using the COVERAGE_LCOV_EXCLUDES variable before calling SETUP_TARGET_FOR_COVERAGE_LCOV. +# Example: +# set(COVERAGE_LCOV_EXCLUDES 'dir1/*' 'dir2/*') +# +# 5. Use the functions described below to create a custom make target which +# runs your test executable and produces a code coverage report. +# +# 6. Build a Debug build: +# cmake -DCMAKE_BUILD_TYPE=Debug .. +# make +# make my_coverage_target +# + +include(CMakeParseArguments) + +# Check prereqs +find_program( GCOV_PATH gcov ) +find_program( LCOV_PATH NAMES lcov lcov.bat lcov.exe lcov.perl) +find_program( GENHTML_PATH NAMES genhtml genhtml.perl genhtml.bat ) +find_program( GCOVR_PATH gcovr PATHS ${CMAKE_SOURCE_DIR}/scripts/test) +find_program( SIMPLE_PYTHON_EXECUTABLE python ) + +if(NOT GCOV_PATH) + message(FATAL_ERROR "gcov not found! Aborting...") +endif() # NOT GCOV_PATH + +if("${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?[Cc]lang") + if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3) + message(FATAL_ERROR "Clang version must be 3.0.0 or greater! Aborting...") + endif() +elseif(NOT CMAKE_COMPILER_IS_GNUCXX) + message(FATAL_ERROR "Compiler is not GNU gcc! Aborting...") +endif() + +set(COVERAGE_COMPILER_FLAGS "-g -O0 --coverage -fprofile-arcs -ftest-coverage" + CACHE INTERNAL "") + +set(CMAKE_CXX_FLAGS_COVERAGE + ${COVERAGE_COMPILER_FLAGS} + CACHE STRING "Flags used by the C++ compiler during coverage builds." + FORCE ) +set(CMAKE_C_FLAGS_COVERAGE + ${COVERAGE_COMPILER_FLAGS} + CACHE STRING "Flags used by the C compiler during coverage builds." + FORCE ) +set(CMAKE_EXE_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used for linking binaries during coverage builds." + FORCE ) +set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used by the shared libraries linker during coverage builds." 
+ FORCE ) +mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_EXE_LINKER_FLAGS_COVERAGE + CMAKE_SHARED_LINKER_FLAGS_COVERAGE ) + +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + message(WARNING "Code coverage results with an optimised (non-Debug) build may be misleading") +endif() # NOT CMAKE_BUILD_TYPE STREQUAL "Debug" + +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + link_libraries(gcov) +else() + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") +endif() + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. +# +# SETUP_TARGET_FOR_COVERAGE_LCOV( +# NAME testrunner_coverage # New target name +# EXECUTABLE testrunner -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES testrunner # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE_LCOV) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT LCOV_PATH) + message(FATAL_ERROR "lcov not found! Aborting...") + endif() # NOT LCOV_PATH + + if(NOT GENHTML_PATH) + message(FATAL_ERROR "genhtml not found! Aborting...") + endif() # NOT GENHTML_PATH + + # Setup target + add_custom_target(${Coverage_NAME} + + # Cleanup lcov + COMMAND ${LCOV_PATH} --directory . --zerocounters + # Create baseline to make sure untouched files show up in the report + COMMAND ${LCOV_PATH} -c -i -d . -o ${Coverage_NAME}.base + + # Run tests + COMMAND ${Coverage_EXECUTABLE} + + # Capturing lcov counters and generating report + COMMAND ${LCOV_PATH} --directory . 
--capture --output-file ${Coverage_NAME}.info + # add baseline counters + COMMAND ${LCOV_PATH} -a ${Coverage_NAME}.base -a ${Coverage_NAME}.info --output-file ${Coverage_NAME}.total + COMMAND ${LCOV_PATH} --remove ${Coverage_NAME}.total ${COVERAGE_LCOV_EXCLUDES} --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + COMMAND ${GENHTML_PATH} -o ${Coverage_NAME} ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + COMMAND ${CMAKE_COMMAND} -E remove ${Coverage_NAME}.base ${Coverage_NAME}.total ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Resetting code coverage counters to zero.\nProcessing code coverage counters and generating report." + ) + + # Show where to find the lcov info report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Lcov code coverage info report saved in ${Coverage_NAME}.info." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report." + ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE_LCOV + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. 
+# +# SETUP_TARGET_FOR_COVERAGE_GCOVR_XML( +# NAME ctest_coverage # New target name +# EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES executable_target # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE_GCOVR_XML) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT SIMPLE_PYTHON_EXECUTABLE) + message(FATAL_ERROR "python not found! Aborting...") + endif() # NOT SIMPLE_PYTHON_EXECUTABLE + + if(NOT GCOVR_PATH) + message(FATAL_ERROR "gcovr not found! Aborting...") + endif() # NOT GCOVR_PATH + + # Combine excludes to several -e arguments + set(GCOVR_EXCLUDES "") + foreach(EXCLUDE ${COVERAGE_GCOVR_EXCLUDES}) + list(APPEND GCOVR_EXCLUDES "-e") + list(APPEND GCOVR_EXCLUDES "${EXCLUDE}") + endforeach() + + add_custom_target(${Coverage_NAME} + # Run tests + ${Coverage_EXECUTABLE} + + # Running gcovr + COMMAND ${GCOVR_PATH} --xml + -r ${PROJECT_SOURCE_DIR} ${GCOVR_EXCLUDES} + --object-directory=${PROJECT_BINARY_DIR} + -o ${Coverage_NAME}.xml + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Running gcovr to produce Cobertura code coverage report." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Cobertura code coverage report saved in ${Coverage_NAME}.xml." + ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE_GCOVR_XML + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. 
+# +# SETUP_TARGET_FOR_COVERAGE_GCOVR_HTML( +# NAME ctest_coverage # New target name +# EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES executable_target # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE_GCOVR_HTML) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT SIMPLE_PYTHON_EXECUTABLE) + message(FATAL_ERROR "python not found! Aborting...") + endif() # NOT SIMPLE_PYTHON_EXECUTABLE + + if(NOT GCOVR_PATH) + message(FATAL_ERROR "gcovr not found! Aborting...") + endif() # NOT GCOVR_PATH + + # Combine excludes to several -e arguments + set(GCOVR_EXCLUDES "") + foreach(EXCLUDE ${COVERAGE_GCOVR_EXCLUDES}) + list(APPEND GCOVR_EXCLUDES "-e") + list(APPEND GCOVR_EXCLUDES "${EXCLUDE}") + endforeach() + + add_custom_target(${Coverage_NAME} + # Run tests + ${Coverage_EXECUTABLE} + + # Create folder + COMMAND ${CMAKE_COMMAND} -E make_directory ${PROJECT_BINARY_DIR}/${Coverage_NAME} + + # Running gcovr + COMMAND ${GCOVR_PATH} --html --html-details + -r ${PROJECT_SOURCE_DIR} ${GCOVR_EXCLUDES} + --object-directory=${PROJECT_BINARY_DIR} + -o ${Coverage_NAME}/index.html + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Running gcovr to produce HTML code coverage report." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report." 
+ ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE_GCOVR_HTML + +function(APPEND_COVERAGE_COMPILER_FLAGS) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) + message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}") +endfunction() # APPEND_COVERAGE_COMPILER_FLAGS diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake new file mode 100644 index 0000000..84c9b4b --- /dev/null +++ b/cmake/sanitize.cmake @@ -0,0 +1,9 @@ +if (SANITIZER_TYPE STREQUAL "thread") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread -g -O1 -fno-omit-frame-pointer -Wno-error=tsan") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread") + message(STATUS "Thread Sanitizer enabled") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fsanitize=undefined -fsanitize-address-use-after-scope -fno-sanitize=alignment -DCDS_ADDRESS_SANITIZER_ENABLED -fno-omit-frame-pointer -fno-optimize-sibling-calls") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address -fsanitize=undefined") + message(STATUS "Address + Undefined Behavior Sanitizers enabled") +endif() diff --git a/conanfile.py b/conanfile.py index e5de4c3..9ef1126 100644 --- a/conanfile.py +++ b/conanfile.py @@ -10,7 +10,7 @@ class HomeBlocksConan(ConanFile): name = "homeblocks" - version = "5.1.0" + version = "6.0.1" homepage = "https://github.com/eBay/HomeBlocks" description = "Block Store built on HomeStore" @@ -24,7 +24,7 @@ class HomeBlocksConan(ConanFile): "shared": ['True', 'False'], "fPIC": ['True', 'False'], "coverage": ['True', 'False'], - "sanitize": ['True', 'False'], + "sanitize": ['address', 'thread', 'False'], "fixed_index": [True, False], } default_options = { @@ -40,16 +40,20 @@ class HomeBlocksConan(ConanFile): def configure(self): if self.options.shared: self.options.rm_safe("fPIC") - if self.options.coverage and 
self.options.sanitize: - raise ConanInvalidConfiguration("Sanitizer does not work with Code Coverage!") + if self.settings.build_type == "Debug": + if self.options.coverage and self.options.sanitize != 'False': + raise ConanInvalidConfiguration("Sanitizer does not work with Code Coverage!") + else: + self.options['sisl/*'].malloc_impl = 'tcmalloc' def build_requirements(self): - self.test_requires("gtest/1.17.0") + self.test_requires("gtest/[^1.17]") + self.test_requires("ublkpp/[^0.33]@oss/main") def requirements(self): - self.requires("homestore/[^7.5]", transitive_headers=True) - self.requires("iomgr/[^12.0]", transitive_headers=True) - self.requires("sisl/[^13.2]", transitive_headers=True) + self.requires("homestore/[^8.0]@oss/dev", transitive_headers=True) + self.requires("iomgr/[^13.0]@oss/dev", transitive_headers=True) + self.requires("sisl/[^14.6]@oss/dev", transitive_headers=True) def validate(self): if self.info.settings.compiler.cppstd: @@ -57,8 +61,8 @@ def validate(self): def layout(self): self.folders.source = "." 
- if self.options.get_safe("sanitize"): - self.folders.build = join("build", "Sanitized") + if self.options.get_safe("sanitize") and self.options.sanitize != "False": + self.folders.build = join("build", f"Sanitized-{self.options.sanitize}") elif self.options.get_safe("coverage"): self.folders.build = join("build", "Coverage") else: @@ -69,7 +73,7 @@ def layout(self): self.cpp.build.libdirs = ["src/lib/volume"] - self.cpp.package.libs = ["homeblocks_volume"] + self.cpp.package.libs = ["homeblocks"] self.cpp.package.includedirs = ["include"] # includedirs is already set to 'include' by self.cpp.package.libdirs = ["lib"] @@ -85,12 +89,16 @@ def generate(self): tc.variables["USE_FIXED_INDEX"] = "ON" if self.options.fixed_index else "OFF" if self.settings.build_type == "Debug": if self.options.get_safe("coverage"): - tc.variables['CODE_COVERAGE'] = 'ON' - elif self.options.get_safe("sanitize"): - tc.variables['MEMORY_SANITIZER_ON'] = 'ON' + tc.variables['BUILD_COVERAGE'] = 'ON' + elif self.options.get_safe("sanitize") and self.options.sanitize != "False": + if self.options.sanitize == "thread": + tc.variables['THREAD_SANITIZER_ON'] = 'ON' + else: # address + tc.variables['ADDRESS_SANITIZER_ON'] = 'ON' + if self.settings.build_type != "Debug": + tc.variables['TCMALLOC_ON'] = 'ON' tc.generate() - # This generates "boost-config.cmake" and "grpc-config.cmake" etc in self.generators_folder deps = CMakeDeps(self) deps.generate() @@ -102,6 +110,9 @@ def build(self): jobs = self.conf.get("tools.build:jobs", default=3) env = Environment() env.define("CTEST_PARALLEL_LEVEL", str(jobs)) + if self.options.get_safe("sanitize") == "thread": + suppression_file = join(self.source_folder, "src", "test", "tsan_suppressions.txt") + env.define("TSAN_OPTIONS", f"suppressions={suppression_file}:second_deadlock_stack=1") with env.vars(self).apply(): cmake.test() @@ -118,10 +129,17 @@ def package(self): copy(self, "*.h*", join(self.source_folder, "src", "include"), join(self.package_folder, 
"include"), keep_path=True) def package_info(self): - if self.options.sanitize: - self.cpp_info.sharedlinkflags.append("-fsanitize=address") - self.cpp_info.exelinkflags.append("-fsanitize=address") - self.cpp_info.sharedlinkflags.append("-fsanitize=undefined") - self.cpp_info.exelinkflags.append("-fsanitize=undefined") - elif self.options.coverage == 'True': - self.cpp_info.libs.append('gcov') + if self.options.get_safe("sanitize") and self.options.sanitize != "False": + if self.options.sanitize == "thread": + self.cpp_info.sharedlinkflags.append("-fsanitize=thread") + self.cpp_info.exelinkflags.append("-fsanitize=thread") + else: + self.cpp_info.sharedlinkflags.append("-fsanitize=address") + self.cpp_info.exelinkflags.append("-fsanitize=address") + self.cpp_info.sharedlinkflags.append("-fsanitize=undefined") + self.cpp_info.exelinkflags.append("-fsanitize=undefined") + + self.cpp_info.set_property("cmake_file_name", "HomeBlocks") + self.cpp_info.set_property("cmake_target_name", "HomeBlocks::HomeBlocks") + self.cpp_info.names["cmake_find_package"] = "HomeBlocks" + self.cpp_info.names["cmake_find_package_multi"] = "HomeBlocks" diff --git a/docs/craft/README.md b/docs/craft/README.md new file mode 100644 index 0000000..e484fc9 --- /dev/null +++ b/docs/craft/README.md @@ -0,0 +1,45 @@ +# CRAFT β€” Client Assisted RAFT + +**CRAFT** (Client Assisted RAFT) is the replication protocol for NuBlox 2.0. It separates the +data path from the consensus path: clients broadcast writes directly to all replicas at +client-assigned LSNs, while RAFT is used only for leader election, login synchronization, and +recovery bookkeeping. Write data never flows through the RAFT log. 
+ +## Documents + +| File | Contents | +|---|---| +| [protocol.md](protocol.md) | Full protocol — leader election, login, IO phase, failure/resync | +| [api.md](api.md) | HomeBlocks C++ CRAFT API (`CraftReplDev` methods) | +| [rpcs.md](rpcs.md) | All 8 RPCs (client↔server and server↔server) | +| [states.md](states.md) | LSN state machines (client view and replica view) | +| [subtasks.md](subtasks.md) | Implementation sub-task breakdown (SDSTOR-22382 children) | + +## Glossary + +| Term | Definition | +|---|---| +| **CRAFT** | Client Assisted RAFT — the NuBlox 2.0 replication protocol | +| **dLSN** | Data LSN — a monotonically increasing sequence number in the **data journal** of a single partition/replica-set. Per-volume in NuBlox. | +| **gLSN** | Global LSN — monotonically increasing across all partitions of a volume. | +| **rLSN** | RAFT LSN — the index within the RAFT log. Distinct from dLSN. | +| **term** | RAFT term number, incremented on every new client login. Used by replicas to reject stale IOs. | +| **commit_lsn** | Highest dLSN whose data has been applied to the state machine (index + block map). A committed write is readable. | +| **last_append_lsn** | Highest dLSN whose data has been written to the data journal (possibly not yet committed). | +| **Replica Set (RS)** | The set of HomeBlocks nodes that hold copies of one partition. Typically 3 nodes. | +| **Partition** | A contiguous region of a Volume, replicated across one Replica Set. In NuBlox, partition ≈ volume. | +| **CraftReplDev** | New HomeBlocks replication device class (parallel to `ReplDisk`) that implements CRAFT. | +| **CraftConnector** | New HomeBlocks RPC frontend (parallel to `ScstConnector`) that translates NubloxProto RPCs to `CraftReplDev` API calls. | +| **SyncRSCommitLSN** | A RAFT log entry type. On apply, each replica fetches any missing data up to the encoded dLSN and advances `commit_lsn`. | +| **InternalLogin** | A RAFT log entry type.
On apply, stores the new `client_token` and enforces single-writer exclusivity. | +| **Missing** | A dLSN slot that a replica knows about (from a peer or from the RAFT log) but has not yet received data for. | +| **Empty** | A dLSN that was never received by any replica and is not discoverable during resync. Treated as a no-op hole. | + +## Key design properties + +- **Single writer**: only one client at a time owns a partition (enforced by `InternalLogin` RAFT entry). +- **Leaderless data path**: after login, the RAFT leader has no special role for writes or reads. +- **Client drives commit**: replicas do not commit until told by the client (via `commit` RPC or `min_commit_lsn` in a `read` RPC). +- **Server-side resync**: `SyncRSCommitLSN` lets replicas catch up from each other without client involvement. +- **No HomeStore changes needed**: `CraftReplDev` is built entirely on top of existing HomeStore journal/index/block primitives. +- **Full replacement**: `CraftReplDev` replaces the existing solo `ReplDev` for all volumes. There are no non-CRAFT (ReplDisk/solo) volumes in the final design. diff --git a/docs/craft/api.md b/docs/craft/api.md new file mode 100644 index 0000000..12a894f --- /dev/null +++ b/docs/craft/api.md @@ -0,0 +1,219 @@ +# HomeBlocks CRAFT C++ API + +`CraftReplDev` is a new class (parallel to HomeStore's `ReplDisk`) that each CRAFT-mode +volume owns instead of a solo `repl_dev`. It exposes the following methods, which +`CraftConnector` calls 1-to-1 when translating incoming NubloxProto RPCs. + +All methods are async/coroutine-style (`async_result` or `async_status`) matching the +existing HomeBlocks convention. 
+ +--- + +## Per-partition in-memory state + +```cpp +struct CraftPartitionState { + int64_t commit_lsn {-1}; // highest committed dLSN + int64_t last_append_lsn {-1}; // highest appended dLSN (may be uncommitted) + uint64_t client_token {0}; // token from last successful InternalLogin + uint64_t term {0}; // current RAFT term +}; +``` + +This state is authoritative in memory and recovered from the journal + superblock on restart. + +--- + +## Client-facing API + +### `login` + +```cpp +struct LoginResult { + std::vector members; + int64_t dLSN; // starting LSN for new IO + int64_t gLSN; // global (volume-level) LSN + uint64_t term; +}; + +async_result<LoginResult> +login(uint64_t client_token, volume_id_t vol_id); +``` + +Leader-only. Orchestrates the full login sequence: +1. `GetRSCommitLSN` broadcast to all peers (non-RAFT) +2. `FetchData` from an ahead peer if the leader is behind (non-RAFT) +3. Propose `SyncRSCommitLSN(rs_commit_lsn)` via RAFT +4. Propose `InternalLogin(client_token, new_term)` via RAFT +5. Return `LoginResult` after both RAFT entries commit + +**Preconditions:** caller is the RAFT leader. +**Postconditions:** all quorum members have `commit_lsn == rs_commit_lsn`; all reject IOs +from any token other than `client_token`. + +--- + +### `write` + +```cpp +async_status +write(uint64_t term, int64_t lsn, int64_t glsn, + lba_t lba, lba_count_t len, sisl::sg_list data); +``` + +Appends `data` to the data journal at slot `lsn`. Zero-copy required on the hot path. + +Steps: +1. Reject if `term != state.term` → `ETERM`. +2. Write `data` to the journal at position `lsn` (may be out of order). +3. `state.last_append_lsn = max(state.last_append_lsn, lsn)`. +4. ACK. + +Does **not** apply data to the LBA index; that happens on `commit`. + +--- + +### `read` + +```cpp +async_result +read(uint64_t term, int64_t min_commit_lsn, lba_t lba, lba_count_t len); +``` + +If `state.commit_lsn < min_commit_lsn`: commit inline up to `min_commit_lsn` before +serving.
Then read from the committed state machine (LBA index → block read). + +Rejects if `term != state.term`. + +--- + +### `commit` + +```cpp +async_status +commit(uint64_t term, int64_t lsn); +``` + +Advance `commit_lsn` to `lsn`: apply all journal entries in `(current_commit, lsn]` to the +state machine (update LBA index, finalize block map). After this call, LBAs covered by +those entries are readable. + +--- + +### `keep_alive` + +```cpp +async_status +keep_alive(int64_t commit_lsn); +``` + +Same as `commit` plus resets the client-timeout watchdog. Sent periodically by the client +even during idle periods to prevent the server from triggering `SyncRSCommitLSN`. + +--- + +### `get_lsns` + +```cpp +struct LSNPair { int64_t commit_lsn; int64_t last_append_lsn; }; + +async_result<LSNPair> +get_lsns(volume_id_t vol_id); +``` + +Returns `{commit_lsn, last_append_lsn}` for the local partition. Used by peers via +`GetRSCommitLSN` during login and by the leader during `SyncRSCommitLSN`. + +--- + +### `truncate` + +```cpp +async_status +truncate(int64_t lsn); +``` + +Drop all journal entries with dLSN > `lsn`. Called when a replica discovers it has +entries from a previous term that did not reach quorum (new `InternalLogin` forces +a truncate of stale appended entries). Also called during login to clean up followers +whose `last_append > agreed_dLSN`. + +--- + +## Internal / RAFT-entry API + +### `append` (propose SyncRSCommitLSN) + +```cpp +async_status +append(int64_t sync_to, uint64_t client_token); +``` + +Proposes a `SyncRSCommitLSN` RAFT entry with value `sync_to`. Callable by the leader's +watchdog or by the client-facing `SyncRSCommitLSN` RPC. `client_token` is embedded so +followers can verify the entry belongs to the current session. + +--- + +### `fetch_data` (for peer resync) + +```cpp +async_result> +fetch_data(std::vector lsns); +``` + +Returns raw journal data for the requested LSNs.
Called server-to-server (not from the +client) during `SyncRSCommitLSN` apply when a replica discovers it is behind. + +--- + +### `get_rs_commit_lsn` (for peer query) + +```cpp +async_result<LSNPair> +get_rs_commit_lsn(); +``` + +Alias of `get_lsns` exposed to peer servers during the `GetRSCommitLSN` broadcast. + +--- + +## RAFT state machine entries + +These are internal RAFT log entry types, not part of the public API. + +### `SyncRSCommitLSN` + +``` +payload: { rs_commit_lsn: int64, client_token: uint64 } +``` + +On RAFT apply (each replica): +1. If `last_append_lsn < rs_commit_lsn`: call `fetch_data(missing)` from a peer. +2. `commit_lsn = rs_commit_lsn`. + +### `InternalLogin` + +``` +payload: { client_token: uint64, term: uint64 } +``` + +On RAFT apply (each replica): +1. `state.client_token = client_token` +2. `state.term = term` +3. From this point, reject writes/reads whose `term` field != `state.term`. + +--- + +## Replacing the existing API + +`CraftReplDev` replaces the existing solo `ReplDev` for all volumes. The old +`async_read` / `async_write` surface in `home_blocks.hpp` (consumed by `ScstConnector`) +is superseded. `CraftConnector` is the new frontend; `ScstConnector` is removed. + +| Old API (removed) | CRAFT replacement | +|---|---| +| `async_write(vol, addr, sgs)` | `write(term, lsn, glsn, lba, len, data)` | +| `async_read(vol, addr, sgs)` | `read(term, min_commit_lsn, lba, len)` | +| `async_unmap` (stub) | No equivalent in CRAFT v1 | +| — | `login`, `commit`, `keep_alive`, `truncate`, `fetch_data`, `get_lsns`, `append` | diff --git a/docs/craft/protocol.md b/docs/craft/protocol.md new file mode 100644 index 0000000..b555d92 --- /dev/null +++ b/docs/craft/protocol.md @@ -0,0 +1,190 @@ +# CRAFT Protocol + +## Overview + +Four phases: **Leader Election → Login → IO Phase → Failure/Resync**. + +--- + +## Phase 1: Leader Election + +A standard RAFT leader election takes place across all replica set members (S1, S2, S3).
+The elected leader handles `login` RPCs from clients. After login, the leader has no special +role in the data path. + +--- + +## Phase 2: Login + +The login sequence establishes a new **term**, synchronizes replica state, and returns the +starting LSN to the client. It must complete before any IO is accepted. + +``` +Client ──login(client_token, vol_id)──► Leader (S1) + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ GetRSCommitLSN β”‚ ← non-RAFT broadcast to all peers + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + S2: [commit=5, append=7] + S3: [commit=10, append=11] + β”‚ + Compute quorum's max dLSN + (e.g., use last_append from quorum) + β”‚ + If leader is behind: + β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ FetchData(N) β”‚ ← unicast to ahead peer + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + append fetched data to own journal + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ SyncRSCommitLSN(N) β”‚ ← RAFT proposal (rLSN++) + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + On RAFT commit, each follower: + β€’ checks own last_append vs N + β€’ fetches missing data if behind + β€’ advances commit_lsn to N + Replicas with append > N: truncate(N) + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ InternalLogin(token, term)β”‚ ← RAFT proposal + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + On RAFT commit: + β€’ all replicas store client_token + β€’ reject IOs from any other client + β”‚ + Client ◄── [members, dLSN=N, term=T, gLSN=G] +``` + +**Login response fields:** +- `members` β€” endpoints of all replica set members +- `dLSN` β€” starting LSN for new IO; minimum for any member to accept reads/writes +- `term` β€” current term number; client includes this on every IO +- `gLSN` β€” global LSN (volume-scoped in NuBlox: gLSN = volume LSN, dLSN = partition LSN) + 
+After a successful login, the client opens one queue per replica member and begins IO. + +--- + +## Phase 3: IO Phase + +### Write + +The client assigns the next dLSN (`++next_lsn`) and broadcasts the write to all replicas. + +``` +Client ──write(term, lsn, glsn, lba, len, data)──► S1, S2, S3 (broadcast) + β”‚ + β”‚ Each replica: + β”‚ 1. Validates term == current term + β”‚ 2. Appends data to data journal at dLSN slot + β”‚ 3. Updates last_append_lsn = max(last_append_lsn, lsn) + β”‚ 4. ACKs client + β”‚ + β”œβ”€ Quorum ACKs received β†’ write is durable ("Appended" state) + β”‚ β€’ Client can ACK the application layer + β”‚ β€’ Data is NOT yet readable (not committed) + β”‚ + └─ Commit is sent lazily (via `commit` or piggy-backed on next write / read) +``` + +**Out-of-order tolerance:** Replicas may receive writes out of order. A write that arrives +before its predecessor is stored but tracked as a "Missing" predecessor (the replica records +the gap). Writes within the same client session are globally ordered by dLSN. + +**Overlapping writes:** If two writes overlap LBA ranges, the one with the lower dLSN must +be committed before the other can be read. The client serializes overlapping writes. + +### Commit + +``` +Client ──commit(term, lsn)──► S1, S2, S3 (broadcast) +``` + +Instructs replicas to advance `commit_lsn` to `lsn` and apply journal entries ≀ lsn +to the state machine (LBA index update, block map finalization). After commit, the data +is readable. + +Commit may be piggybacked on the next write or on a keep_alive. + +### Read + +``` +Client ──read(term, min_commit_lsn, lba, len)──► chosen replica (unicast) +``` + +The client chooses a replica (round-robin, filtered for staleness) and sends the +`min_commit_lsn` it wants the replica to have committed before serving the read. + +The replica: +1. If `commit_lsn < min_commit_lsn`: applies journal entries up to `min_commit_lsn` inline. +2. Serves the read from the state machine (LBA index β†’ block read). 
+**Replica selection:** The client tracks a per-replica "Missing" set (LSNs whose writes were +acknowledged before the read was issued but that the replica has not yet received). If a +replica's Missing set overlaps the read's LBA range, skip to the next replica. For large +reads spanning multiple LSNs, the read may be split across replicas. + +### KeepAlive + +``` +Client ──keep_alive(commit_lsn)──► S1, S2, S3 (broadcast, periodic) +``` + +Advances `commit_lsn` and resets the client-timeout watchdog on each replica. +If no keepalive is received within the watchdog period, the leader initiates +`SyncRSCommitLSN` (see Failure/Resync below). + +--- + +## Phase 4: Failure / Resync + +### SyncRSCommitLSN + +Triggered by: login (always), client-timeout watchdog, or periodic checkpoint (every N LSNs, +configurable, default N=128). + +``` +Leader (S1): + 1. Broadcast GetRSCommitLSN(my_commit, my_last_append) → all peers + 2. Collect [commit_lsn, last_append_lsn] from quorum + 3. Determine rs_commit_lsn = max(quorum's last_append) + (can use commit_lsn instead for a more conservative choice) + 4. If leader.last_append < rs_commit_lsn: + FetchData(missing_lsns) from an ahead peer → append to own journal + 5. Propose SyncRSCommitLSN(rs_commit_lsn) via RAFT + +Each replica on RAFT apply: + • If last_append < rs_commit_lsn: + FetchData(missing_lsns) from a peer → append to local journal + • commit_lsn = rs_commit_lsn +``` + +### New Term (client crash / reconnect) + +When a new client logs in, a new term T+1 is established. + +Replicas with `last_append > dLSN` from the previous term **truncate** those entries: +`truncate(dLSN)` discards journal entries above the agreed starting LSN. + +An LSN that was never received by any replica and is not discoverable via FetchData is +marked **Empty** — not an error; it is treated as a hole.
+### Periodic RAFT Checkpointing + +During IO, the client periodically appends a `SyncRSCommitLSN` entry to the RAFT log +(every N=128 LSNs by default). This gives offline replicas a catch-up anchor without +waiting for the next login. + +--- + +## Invariants + +1. A write is **durable** once quorum has appended it to their journals. +2. A write is **readable** only after it has been committed (`commit_lsn ≥ lsn`). +3. `commit_lsn ≤ last_append_lsn` always. +4. Only one client (identified by `client_token`) may issue writes in a given term. +5. A replica will not serve reads from any LBA range that has a Missing predecessor at + a lower dLSN (stale replica must be synced first). +6. Truncation only removes entries **above** the agreed `dLSN`; entries at or below are + never discarded. diff --git a/docs/craft/rpcs.md b/docs/craft/rpcs.md new file mode 100644 index 0000000..fe63923 --- /dev/null +++ b/docs/craft/rpcs.md @@ -0,0 +1,152 @@ +# CRAFT RPCs + +9 RPCs total. 5 client↔server, 4 server↔server (2 via RAFT, 2 non-RAFT). +RAFT internal RPCs (heartbeat, vote, membership) are not listed here. + +--- + +## Client → Server + +### 1. Login (Unicast to leader) + +``` +Request: { client_token: string | uint64 } +Response: { members: [endpoint], dLSN: int64, term: uint64, gLSN: int64 } +``` + +Client sends to the RAFT leader. Leader runs the full login orchestration sequence +(GetRSCommitLSN → optional FetchData → SyncRSCommitLSN RAFT → InternalLogin RAFT) +and responds once both RAFT entries commit. + +HomeBlocks handler: `CraftReplDev::login()` + +--- + +### 2. Write (Broadcast to all replicas) + +``` +Request: { term: uint64, lsn: int64, glsn: int64, lba: uint64, len: uint32, data: bytes } +Response: { status: Status } +``` + +Client sends to every replica in the set in parallel. Each replica appends `data` to its +data journal at slot `lsn` and ACKs immediately. Write is durable once quorum ACKs. +Data is **not** readable until committed.
+ +Zero-copy is required: `data` must not be copied during journal append. + +HomeBlocks handler: `CraftReplDev::write()` + +--- + +### 3. Read (Unicast to chosen replica) + +``` +Request: { term: uint64, min_commit_lsn: int64, lba: uint64, len: uint32 } +Response: { status: Status, data: bytes } +``` + +Client picks a replica whose Missing set does not overlap `[lba, lba+len)`. If +`min_commit_lsn > replica.commit_lsn`, the replica commits inline first. For large +reads crossing multiple LSNs where no single replica is up-to-date, the client splits +the read across replicas. + +HomeBlocks handler: `CraftReplDev::read()` + +--- + +### 4. Commit (Broadcast to all replicas) + +``` +Request: { term: uint64, lsn: int64 } +Response: { status: Status } +``` + +Tells replicas to advance `commit_lsn` to `lsn`. May be piggybacked on the next +Write or KeepAlive instead of sent as a standalone RPC. + +HomeBlocks handler: `CraftReplDev::commit()` + +--- + +### 5. KeepAlive (Broadcast to all replicas) + +``` +Request: { commit_lsn: int64 } +Response: { status: Status } +``` + +Advances `commit_lsn` and resets the per-replica client-timeout watchdog. Sent +periodically during idle periods and after every quorum-acknowledged write. + +HomeBlocks handler: `CraftReplDev::keep_alive()` + +--- + +## Server β†’ Server (non-RAFT) + +### 6. GetRSCommitLSN (Broadcast, initiated by leader) + +``` +Request: { term: uint64, my_commit_lsn: int64, my_last_append_lsn: int64 } +Response: { term: uint64, commit_lsn: int64, last_append_lsn: int64 } +``` + +Leader sends to all peers to collect their current LSN state before a `SyncRSCommitLSN` +RAFT proposal. Used during login and on timeout. + +HomeBlocks handler: `CraftReplDev::get_rs_commit_lsn()` / `get_lsns()` +Dispatched by: `CraftConnector` (inter-node channel, non-RAFT) + +--- + +### 7. 
FetchData (Unicast, from behind replica to an ahead peer) + +``` +Request: { lsns: [int64] } +Response: { slots: [{ lsn: int64, lba: uint64, len: uint32, data: bytes }] } +``` + +Called when a replica discovers it is missing data for certain LSNs after receiving a +`SyncRSCommitLSN` RAFT entry. Targets the peer most likely to have the data. + +HomeBlocks handler: `CraftReplDev::fetch_data()` +Dispatched by: `CraftConnector` (inter-node channel, non-RAFT) + +--- + +## Server → Server (RAFT) + +### 8. SyncRSCommitLSN (RAFT proposal, from leader) + +``` +RAFT entry payload: { rs_commit_lsn: int64, client_token: uint64 } +``` + +Proposed by the leader via `CraftReplDev::append()`. On RAFT commit each replica +applies the entry: fetch missing data if behind, then advance `commit_lsn`. This is the +primary recovery mechanism — it does not carry data itself, only the LSN watermark. + +--- + +### 9. InternalLogin (RAFT proposal, from leader during login) + +``` +RAFT entry payload: { client_token: uint64, term: uint64 } +``` + +Proposed by the leader after `SyncRSCommitLSN` commits. On apply each replica stores +`client_token` and `term`, rejecting any subsequent IO from a different token. Proposed +immediately after the `SyncRSCommitLSN` entry during the login sequence. + +--- + +## RPC Transport + +The transport layer for NubloxProto RPCs is decided by the **CRAFT-1 spike** (SDSTOR-22297 +dependency). `CraftConnector` is transport-agnostic: it will dispatch via whatever channel +CRAFT-1 selects (likely gRPC or a custom framing over TCP). Server-to-server RPCs (6 and 7) +use the same transport. + +During development, before CRAFT-1 lands, `CraftConnector` can use direct C++ function +calls or a stub transport for unit/integration testing.
diff --git a/docs/craft/states.md b/docs/craft/states.md new file mode 100644 index 0000000..f48532a --- /dev/null +++ b/docs/craft/states.md @@ -0,0 +1,92 @@ +# CRAFT LSN State Machines + +--- + +## Client-side write states + +The client tracks each write through the following states: + +``` +Queued/Blocked ──► Pending ──► Appended ──► Committed +``` + +| State | Meaning | +|---|---| +| **Queued / Blocked** | Received from the application block layer; not yet sent to any replica. May be blocked behind an overlapping in-flight write. | +| **Pending** | Sent to replicas; fewer than quorum have ACKed. | +| **Appended** | Quorum of replicas have ACKed (written to their data journals). The write is durable but **not yet readable**. The client may ACK the application. | +| **Committed** | The client has sent a `commit` (or `keep_alive`) for this LSN and at least one replica has applied it to its state machine. The write is readable. | + +Once a write reaches **Committed**, the client drops it from its tracking state. + +--- + +## Replica-side slot states + +Each dLSN slot on a replica is in one of these states: + +| State | Meaning | +|---|---| +| **Appended** | Data received from the client and written to the data journal. Not yet committed. | +| **Committed** | Journal entry applied to the state machine (LBA index updated, block map finalized). Readable. | +| **Empty** | Slot that was never received and was not found on any peer during resync. Treated as a permanent hole; not an error. | +| **Synced** | All LSNs ≀ this slot are committed. Indicates a clean checkpoint. | +| **Missing** | The replica knows the slot should exist (from context: a higher LSN arrived, or a `SyncRSCommitLSN` entry referenced it) but the data has not yet arrived. The replica must fetch this slot before it can commit past it. 
| + +--- + +## Transitions + +``` + write() RPC received + β”‚ + β–Ό + [Appended] + β”‚ + commit()/keep_alive() received + OR min_commit_lsn in read() β‰₯ this lsn + β”‚ + β–Ό + [Committed] ────────────► readable from state machine + + gap detected (higher LSN arrived first, + or SyncRSCommitLSN references this lsn) + β”‚ + β–Ό + [Missing] + β”‚ + fetch_data() completes + β”‚ + β–Ό + [Appended] ───► (then Committed as above) + + SyncRSCommitLSN applied, slot not on any peer + β”‚ + β–Ό + [Empty] ── permanent hole, skipped in commit advance +``` + +--- + +## Per-replica tracking summary + +Each replica maintains: +- `commit_lsn` β€” highest LSN fully committed to state machine +- `last_append_lsn` β€” highest LSN written to the data journal +- A set of **Missing** LSN slots (gaps between `commit_lsn` and `last_append_lsn`) + +The client additionally tracks: +- `next_lsn` β€” counter for the next write assignment +- Per-replica **Missing** sets (from the client's perspective: writes that reached quorum + but not yet a specific replica) + +--- + +## Read eligibility + +A replica is eligible to serve a read for LBA range `[lba, lba+len)` if: +1. `commit_lsn >= min_commit_lsn` (after inline commit if needed) +2. No **Missing** entry at a dLSN ≀ read's target LSN overlaps the LBA range + +If no single replica satisfies (2) for the full range, the client may split the read across +replicas such that each sub-range is served by an eligible member. 
diff --git a/docs/craft/subtasks.md b/docs/craft/subtasks.md new file mode 100644 index 0000000..42a033a --- /dev/null +++ b/docs/craft/subtasks.md @@ -0,0 +1,218 @@ +# CRAFT Implementation Sub-tasks + +Epic: [SDSTOR-22382](https://jirap.corp.ebay.com/browse/SDSTOR-22382) β€” CRAFT Server-Side (HB + CraftConnector) + +Two components: +- **Component A**: HomeBlocks CRAFT API (`CraftReplDev` + supporting internals) +- **Component B**: `CraftConnector` (RPC frontend, analogous to `ScstConnector`) + +--- + +## Dependency graph + +``` +S1 (CraftReplDev foundation) +β”œβ”€β”€ S2 (Write path) +β”‚ └── S3 (Commit + Read path) +β”œβ”€β”€ S4 (Truncate) +β”œβ”€β”€ S6 (Peer data exchange APIs) +β”‚ └── S5 (RAFT state machine entries) +β”‚ └── S7 (Login orchestration) ← also needs S2, S4 +└── S8 (CRAFT volume lifecycle) + +S9 (CraftConnector) ← skeleton early, full handlers need S2/S3/S7 +``` + +Parallel tracks after S1 completes: S2+S3, S4, S6+S5+S7, S8, S9-skeleton. + +--- + +## S1 β€” CraftReplDev Foundation +**Jira:** [SDSTOR-22383](https://jirap.corp.ebay.com/browse/SDSTOR-22383) +**Blocks:** everything + +As an I/O engineer, I want a `CraftReplDev` class in HomeBlocks that wraps HomeStore's +journal/index and participates in RAFT only for `SyncRSCommitLSN` and `InternalLogin` +entries, so that CRAFT-mode volumes store write data directly in the journal rather than +through RAFT log entries. 
+ +**Acceptance criteria:** +- `CraftReplDev` exists as a class parallel to (and eventually replacing) `ReplDisk` usage +- Maintains per-partition in-memory state: `commit_lsn`, `last_append_lsn`, `client_token`, `term` +- Provides journal-slot append / read / truncate primitives consumed by all other CRAFT stories +- RAFT group initialized with real member list (not solo); participates in leader election +- RAFT participation limited to: leader election + `SyncRSCommitLSN` entries + `InternalLogin` entries +- Unit-testable with a mock HomeStore journal backend +- All volumes use `CraftReplDev`; the existing solo `ReplDev` is removed + +**Key files to create/modify:** +- `src/lib/craft/craft_repl_dev.hpp` / `.cpp` (new) +- `src/lib/homeblks_impl.cpp` β€” wire up `CraftReplDev` on volume create when craft mode active +- `src/include/homeblks/home_blocks.hpp` β€” `volume_info` gains `replication_mode` field +- `src/tests/craft/` β€” mock backend + unit tests + +--- + +## S2 β€” Write Path +**Blocks:** S3, S7 + +As an I/O engineer, I want `CraftReplDev::write()` to append client-assigned LSN writes to +the HomeStore data journal (zero-copy, out-of-order tolerant), so that replicas can +independently journal writes broadcast by the client. 
+ +**Acceptance criteria:** +- `write(term, lsn, glsn, lba, len, data)` appends to the journal at the given `lsn` slot +- Rejects with `ETERM` if `term != state.term` +- Updates `last_append_lsn = max(last_append_lsn, lsn)` +- Handles out-of-order LSN arrival without blocking (gaps tracked as Missing) +- Zero-copy: `data` buffer is not copied during the append path +- Does NOT apply data to the LBA index (that is `commit`'s job) +- Unit tests cover: in-order writes, out-of-order writes, term rejection + +--- + +## S3 β€” Commit and Read Path +**Blocked by:** S2 +**Blocks:** S7, S9 (full) + +As an I/O engineer, I want `CraftReplDev::commit()`, `keep_alive()`, and `read()` so that +clients can make writes readable and serve reads with inline commit guarantees. + +**Acceptance criteria:** +- `commit(term, lsn)`: applies journal entries `(current_commit, lsn]` to the LBA index and block map; advances `commit_lsn` +- `keep_alive(commit_lsn)`: same as commit + resets the client-timeout watchdog timer +- `read(term, min_commit_lsn, lba, len)`: if `commit_lsn < min_commit_lsn`, commits inline first; then serves from the state machine via the existing `read_from_index` path +- All three reject with `ETERM` if `term != state.term` +- Watchdog timer: if no `keep_alive` or `write` arrives within configurable timeout, trigger `SyncRSCommitLSN` (see S5) +- Unit tests cover: commit-then-read, inline commit on read, commit ordering, watchdog fire + +--- + +## S4 β€” Truncate Path +**Blocked by:** S1 +**Blocks:** S7 + +As an I/O engineer, I want `CraftReplDev::truncate(lsn)` to drop journal entries above the +given LSN, so that replicas can clean up stale writes from a previous term when a new login +starts. 
+ +**Acceptance criteria:** +- `truncate(lsn)` removes all data journal entries with dLSN > `lsn` +- Updates `last_append_lsn = min(last_append_lsn, lsn)` +- Clears any Missing-set entries above `lsn` +- Does not affect entries ≀ `lsn` +- Safe to call concurrently with in-flight reads at committed LSNs ≀ `lsn` +- Unit tests cover: truncate with committed entries below, truncate with missing entries, idempotency + +--- + +## S5 β€” RAFT State Machine Entries (SyncRSCommitLSN + InternalLogin) +**Blocked by:** S1, S6 +**Blocks:** S7 + +As an I/O engineer, I want `SyncRSCommitLSN` and `InternalLogin` RAFT log entry types +implemented in `CraftReplDev`, so that replicas can converge on a consistent LSN watermark +and enforce single-writer exclusivity without data flowing through the RAFT log. + +**Acceptance criteria:** + +**SyncRSCommitLSN:** +- RAFT entry carries `{rs_commit_lsn, client_token}` +- On apply: if `last_append_lsn < rs_commit_lsn`, call `fetch_data()` for missing LSNs from a peer; then set `commit_lsn = rs_commit_lsn` +- `append(sync_to, client_token)` proposes this entry via RAFT +- Periodic auto-fire: every N LSNs (configurable via `home_blks_config.fbs`, default 128) + +**InternalLogin:** +- RAFT entry carries `{client_token, term}` +- On apply: `state.client_token = client_token`, `state.term = term` +- All subsequent IO with a different `term` is rejected +- Proposed immediately after `SyncRSCommitLSN` commits during login + +**Unit tests:** mock RAFT; verify apply callbacks for both entry types; verify fetch_data is +called when behind; verify term enforcement after InternalLogin. + +--- + +## S6 β€” Peer Data Exchange APIs +**Blocked by:** S1 +**Blocks:** S5 + +As an I/O engineer, I want `get_rs_commit_lsn()` / `get_lsns()` and `fetch_data()` on +`CraftReplDev`, so that the leader can poll peers during login/sync and lagging replicas can +pull missing journal data during `SyncRSCommitLSN` apply. 
+ +**Acceptance criteria:** +- `get_lsns(vol_id)` / `get_rs_commit_lsn()` returns `{commit_lsn, last_append_lsn}` for the local partition +- `fetch_data(lsns)` reads raw journal data for the requested LSNs and returns it (without applying to state machine) +- These are called server-to-server via `CraftConnector` (see S9); stub the transport for unit tests +- `fetch_data` must handle: LSNs that are committed, LSNs that are only appended, LSNs that are Empty (return a sentinel, not an error) +- Unit tests cover: normal fetch, fetch across commit boundary, fetch of Empty slot + +--- + +## S7 β€” Login Orchestration (Leader-side) +**Blocked by:** S2, S4, S5, S6 +**Blocks:** S9 (full) + +As an I/O engineer, I want `CraftReplDev::login()` to orchestrate the full login sequence on +the RAFT leader, so that a new client attachment establishes a consistent starting LSN and +term across the replica set. + +**Acceptance criteria:** +- Implements the leader-side login sequence: + 1. Broadcast `GetRSCommitLSN` to all peers; collect `{commit_lsn, last_append_lsn}` + 2. Compute `rs_commit_lsn` (quorum's max `last_append_lsn`, or `commit_lsn` if conservative) + 3. If `self.last_append_lsn < rs_commit_lsn`: `fetch_data(missing)` from ahead peer, append to own journal + 4. Propose `SyncRSCommitLSN(rs_commit_lsn)` via RAFT; wait for commit + 5. Propose `InternalLogin(client_token, term+1)` via RAFT; wait for commit + 6. Replicas with `last_append > rs_commit_lsn` receive `truncate(rs_commit_lsn)` call + 7. 
Return `{members, dLSN=rs_commit_lsn, term=term+1, gLSN}` +- Returns `ENOTLEADER` if called on a follower +- Login is serialized (only one in-flight login per partition at a time) +- Integration test: 3-node mock cluster; login with divergent replica state; verify all replicas converge + +--- + +## S8 β€” CRAFT Volume Lifecycle +**Blocked by:** S1 + +As an I/O engineer, I want HomeBlocks to create and recover CRAFT-mode volumes with a +multi-member `CraftReplDev` RAFT group, so that volumes can be provisioned and survive +restarts without losing LSN state. + +**Acceptance criteria:** +- The existing solo `ReplDev` creation path in `homeblks_impl.cpp` is replaced with `CraftReplDev` for all volumes +- On `create_volume` with `craft` mode: create a multi-member RAFT group with the provided member endpoints +- `vol_sb_t` persists `commit_lsn` and `last_append_lsn` (or they are recoverable from the journal on restart) +- On `HomeBlocksImpl` restart: `CraftReplDev` recovers `{commit_lsn, last_append_lsn, term}` from the journal/superblock +- Member add/remove stubs (for future membership changes) +- Existing volume creation/removal lifecycle tests pass with CRAFT mode + +--- + +## S9 β€” CraftConnector +**Blocked by (skeleton):** nothing +**Blocked by (full handlers):** S2, S3, S7 + +As an I/O engineer, I want a `CraftConnector` class (analogous to `ScstConnector`) that +receives NubloxProto RPCs and translates them 1-to-1 to `CraftReplDev` API calls, so that +the RPC layer and storage layer have a clean boundary with no storage logic in the connector. 
+ +**Acceptance criteria:** +- `CraftConnector` class exists; transport is pluggable (CRAFT-1 spike determines final choice) +- Client-facing handlers: `Login`, `Write`, `Read`, `Commit`, `KeepAlive`, `GetLSNs` +- Server-to-server handlers: `GetRSCommitLSN`, `FetchData` (used during login and resync) +- Each handler translates NubloxProto types ↔ `CraftReplDev` types with no storage logic +- Leader redirect: if a `Login` arrives at a follower, return leader endpoint +- Term mismatch: return `ETERM` to client +- Blocked on CRAFT-1 for the real transport; initial version uses direct function-call stubs +- Integration test: end-to-end Login + Write + Read + Commit through the connector + +--- + +## Out of scope for this epic + +- API/proto definition and RPC schema β†’ **SDSTOR-22297** +- RPC transport/framing selection β†’ **CRAFT-1 spike** +- Client-side quorum logic, partition management β†’ client-side epic +- CSI / ublk-nublox changes β†’ client-side epic diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 568d1b8..ad0b3b8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,16 +11,10 @@ find_package(sisl QUIET REQUIRED) list(APPEND COMMON_DEPS homestore::homestore sisl::sisl) -# This is a work-around for not being able to specify the link -# order in a conan recipe. We link these explicitly and thus -# need to specify the LINK path. They should only be needed -# to build a DSO (test executable) however. -link_directories(${spdk_LIB_DIRS} ${dpdk_LIB_DIRS}) list(APPEND COMMON_TEST_DEPS ${COMMON_DEPS} GTest::gmock - ${spdk_LIBRARY_LIST} - ${dpdk_LIBRARY_LIST} ) add_subdirectory(lib) +add_subdirectory(test) diff --git a/src/include/homeblks/home_blks.hpp b/src/include/homeblks/home_blks.hpp deleted file mode 100644 index 21fead6..0000000 --- a/src/include/homeblks/home_blks.hpp +++ /dev/null @@ -1,86 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. - * - *********************************************************************************/ -#pragma once - -#include -#include -#include - -#include "common.hpp" - -namespace homeblocks { - -class VolumeManager; - -ENUM(DevType, uint8_t, AUTO_DETECT = 1, HDD, NVME, UNSUPPORTED); -struct device_info_t { - explicit device_info_t(std::string name, DevType dtype = DevType::AUTO_DETECT) : - path{std::filesystem::canonical(name)}, type{dtype} {} - device_info_t() = default; - bool operator==(device_info_t const& rhs) const { return path == rhs.path && type == rhs.type; } - friend std::istream& operator>>(std::istream& input, device_info_t& di) { - std::string i_path, i_type; - std::getline(input, i_path, ':'); - std::getline(input, i_type); - di.path = std::filesystem::canonical(i_path); - if (i_type == "HDD") { - di.type = DevType::HDD; - } else if (i_type == "NVME") { - di.type = DevType::NVME; - } else { - di.type = DevType::AUTO_DETECT; - } - return input; - } - std::filesystem::path path; - DevType type; -}; - -class HomeBlocksApplication { -public: - virtual ~HomeBlocksApplication() = default; - - virtual bool spdk_mode() const = 0; - virtual uint32_t threads() const = 0; - virtual std::list< device_info_t > devices() const = 0; - // in bytes; - virtual uint64_t app_mem_size() const = 0; - - // Callback made after determining if a SvcId exists or not during initialization, will consume response - virtual std::optional< 
peer_id_t > discover_svc_id(std::optional< peer_id_t > const& found) const = 0; -}; - -struct HomeBlocksStats { - uint64_t total_capacity_bytes{0}; - uint64_t used_capacity_bytes{0}; - std::string to_string() const { - return fmt::format("total_capacity_bytes={}, used_capacity_bytes={}", total_capacity_bytes, - used_capacity_bytes); - } -}; - -class HomeBlocks { -public: - virtual ~HomeBlocks() = default; - virtual peer_id_t our_uuid() const = 0; - virtual std::shared_ptr< VolumeManager > volume_manager() = 0; - virtual HomeBlocksStats get_stats() const = 0; - virtual iomgr::drive_type data_drive_type() const = 0; - virtual uint64_t max_vol_io_size() const = 0; - virtual void shutdown() = 0; -}; - -extern std::shared_ptr< HomeBlocks > init_homeblocks(std::weak_ptr< HomeBlocksApplication >&& application); -} // namespace homeblocks diff --git a/src/include/homeblks/home_blocks.hpp b/src/include/homeblks/home_blocks.hpp new file mode 100644 index 0000000..b972f82 --- /dev/null +++ b/src/include/homeblks/home_blocks.hpp @@ -0,0 +1,235 @@ +/********************************************************************************* + * Modifications Copyright 2026 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +// The complete homeblocks public API, in one header. Consumers include only this file. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include // sisl::sg_list +#include +#include +#include // result / async_result / status / ok + +// Declares the homeblocks logging module so consumers can wire its log level into their own logging init. The +// per-call LOG* shorthand macros are internal (lib/hb_internal.hpp) and intentionally not exposed here. +SISL_LOGGING_DECL(homeblocks) + +namespace homeblocks { + +using peer_id_t = boost::uuids::uuid; +using volume_id_t = boost::uuids::uuid; + +// homeblocks speaks homestore's one error surface: +// result == std::expected (synchronous) +// async_result == sisl::async::task> (a coroutine you co_await / sync_get) +// status / async_status carry no value on success (just ok()/error). +using homestore::async_result; +using homestore::async_status; +using homestore::ok; +using homestore::result; +using homestore::status; + +// =============================================== volumes =============================================== + +// Opaque volume handle. Produced by home_blocks::create_volume() / get_volume(); never constructed directly. +// All I/O is via the free async_* functions below. +class volume; +using volume_handle = std::shared_ptr< volume >; + +// homeblocks-specific failures only -- registered as a std::error_condition enum (bottom of this file) so they +// ride result while staying branchable: if (r.error() == volume_error::CRC_MISMATCH) { ... }. Anything with a +// standard equivalent (invalid arg, no space, io error, unsupported op, ...) is returned as +// std::make_error_condition(std::errc::*) directly rather than duplicated here. 
+ENUM(volume_error, uint16_t, UNKNOWN_VOLUME = 1, CRC_MISMATCH, INDEX_ERROR, INTERNAL_ERROR, OFFLINE); + +ENUM(volume_state, uint32_t, + INIT, // created, not yet online + ONLINE, // ready for I/O + OFFLINE, // not ready + DESTROYING, // teardown in progress (also used for destroy crash-recovery) + DESTROYED, // fully torn down + READONLY); + +// Selects the replication backend for a volume. +// DISABLED — existing ReplDev (raid1-style, data through RAFT log). +// CRAFT — new CraftReplDev (data via client broadcast; RAFT carries only sync-LSN and login entries). +ENUM(replication_mode, uint8_t, DISABLED = 1, CRAFT); + +struct volume_info { + volume_id_t id; + uint64_t size_bytes{0}; + uint64_t page_size{0}; // logical block size for this volume (a per-volume runtime setting) + std::string name; + uint64_t ordinal{0}; // internal: chunk-selector ordinal, assigned by homeblocks on create/recover + replication_mode repl_mode{replication_mode::DISABLED}; + + volume_info() = default; + volume_info(const volume_info&) = delete; + volume_info(volume_info&& rhs) noexcept : + id(rhs.id), + size_bytes(rhs.size_bytes), + page_size(rhs.page_size), + name(std::move(rhs.name)), + ordinal(rhs.ordinal), + repl_mode(rhs.repl_mode) {} + volume_info(volume_id_t id_in, uint64_t size, uint64_t psize, std::string in_name) : + id(id_in), size_bytes(size), page_size(psize), name(std::move(in_name)) {} + volume_info(volume_id_t id_in, uint64_t size, uint64_t psize, std::string in_name, uint64_t ord) : + id(id_in), size_bytes(size), page_size(psize), name(std::move(in_name)), ordinal(ord) {} + + auto operator<=>(volume_info const& rhs) const { + return boost::uuids::hash_value(id) <=> boost::uuids::hash_value(rhs.id); + } + auto operator==(volume_info const& rhs) const { return id == rhs.id; } + std::string to_string() const { + return fmt::format("volume_info: id={} size_bytes={}, page_size={}, name={} ordinal={} repl_mode={}", + boost::uuids::to_string(id), size_bytes, page_size, 
name, ordinal, + enum_name(repl_mode)); + } +}; + +struct volume_stats { + volume_id_t id; + volume_state state; +}; + +// ---- volume_error <-> std::error_condition registration ---- +class volume_error_category : public std::error_category { +public: + const char* name() const noexcept override { return "homeblocks.volume"; } + std::string message(int ev) const override { return std::string{enum_name(static_cast< volume_error >(ev))}; } +}; +inline std::error_category const& volume_error_category_inst() noexcept { + static volume_error_category inst; + return inst; +} +inline std::error_condition make_error_condition(volume_error e) noexcept { + return std::error_condition{static_cast< int >(e), volume_error_category_inst()}; +} + +// ---- I/O: free functions over a volume_handle ---- +// +// `addr` / `len` are RAW BYTE offsets into the volume, not block indices: the block size is a per-volume +// runtime setting, so bytes keep the contract unambiguous and confine the byte<->lba conversion to one place +// inside homeblocks. They must be block-aligned and in range, else the op resolves to std::errc::invalid_argument. +// +// Each returns a lazy coroutine: co_await it (or batch several with sisl::async::when_all) to run it. The +// result is [[nodiscard]] -- dropping the task means the I/O never issues. read/write resolve to the byte count +// transferred or a volume_error; the sg_list's iovecs point at caller-owned buffers that must outlive the await +// (the descriptor itself is copied into the coroutine frame), and its total length is the transfer size. 
+[[nodiscard]] async_result< size_t > async_read(volume_handle const& vol, uint64_t addr, sisl::sg_list sgs); +[[nodiscard]] async_result< size_t > async_write(volume_handle const& vol, uint64_t addr, sisl::sg_list sgs); +[[nodiscard]] async_status async_unmap(volume_handle const& vol, uint64_t addr, uint64_t len); + +// ============================================= home_blocks ============================================= + +struct home_blocks_stats { + uint64_t total_capacity_bytes{0}; + uint64_t used_capacity_bytes{0}; + std::string to_string() const { + return fmt::format("total_capacity_bytes={}, used_capacity_bytes={}", total_capacity_bytes, + used_capacity_bytes); + } +}; + +ENUM(dev_type, uint8_t, AUTO_DETECT = 1, HDD, NVME, UNSUPPORTED); + +struct device_info { + std::filesystem::path path; + dev_type type{dev_type::AUTO_DETECT}; + + device_info() = default; + explicit device_info(std::string name, dev_type t = dev_type::AUTO_DETECT) : + path{std::filesystem::canonical(name)}, type{t} {} + bool operator==(device_info const& rhs) const { return path == rhs.path && type == rhs.type; } + friend std::istream& operator>>(std::istream& input, device_info& di) { + std::string i_path, i_type; + std::getline(input, i_path, ':'); + std::getline(input, i_type); + di.path = std::filesystem::canonical(i_path); + if (i_type == "HDD") { + di.type = dev_type::HDD; + } else if (i_type == "NVME") { + di.type = dev_type::NVME; + } else { + di.type = dev_type::AUTO_DETECT; + } + return input; + } +}; + +// All configuration for bringing up a homeblocks instance. Passed by value to init_homeblocks(); designated +// initializers + defaults keep the trivial case a one-liner (mirrors homestore's hs_input_params{...}). +struct home_blocks_config { + std::vector< device_info > devices; // backing devices (required) + uint32_t threads{2}; // iomgr reactor count + uint64_t app_mem_size_mb{1024}; // memory budget (caches, etc.) + + // Cold-boot identity fetch. 
homeblocks invokes this exactly once -- and only when homestore comes up with no + // persisted svc id -- then sync_gets the returned coroutine OFF-reactor (init is a cold, non-reactor path). + // Resolve your (possibly rotated) OM client INSIDE the closure so the call uses the live one. The + // result may carry an error (e.g. OM unreachable), which fails init_homeblocks. Empty -> a random + // svc id is generated on first boot. + std::function< async_result< peer_id_t >() > on_svc_id{}; +}; + +// Opaque handle to a running homeblocks instance. Obtained only from init_homeblocks(). +class home_blocks { +public: + virtual ~home_blocks() = default; + + // --- instance --- + virtual peer_id_t our_uuid() const = 0; + virtual home_blocks_stats get_stats() const = 0; + virtual iomgr::drive_type data_drive_type() const = 0; + virtual uint64_t max_vol_io_size() const = 0; + virtual void shutdown() = 0; + + // --- volume control plane --- + // create_volume hands back the new volume; the handle is frequently discarded (get_volume retrieves it + // later, e.g. to bring a volume online on restart). The task is [[nodiscard]] regardless -- you must + // co_await / sync_get it or no work happens. + [[nodiscard]] virtual async_result< volume_handle > create_volume(volume_info info) = 0; + [[nodiscard]] virtual async_status remove_volume(volume_id_t const& id) = 0; + + // get_volume returns a ready-to-use handle for an existing (created or recovered) volume, or + // volume_error::UNKNOWN_VOLUME. + [[nodiscard]] virtual result< volume_handle > get_volume(volume_id_t const& id) const = 0; + [[nodiscard]] virtual result< volume_stats > get_stats(volume_id_t id) const = 0; + virtual std::vector< volume_id_t > volume_ids() const = 0; +}; + +// Bring up homeblocks. Operational failures (device open, format, on_svc_id RPC) -> result error; precondition +// bugs (e.g. no devices) assert. The returned handle owns the instance; drop it (or call shutdown()) to stop. 
+[[nodiscard]] result< std::shared_ptr< home_blocks > > init_homeblocks(home_blocks_config cfg); + +} // namespace homeblocks + +template <> +struct std::is_error_condition_enum< homeblocks::volume_error > : std::true_type {}; diff --git a/src/include/homeblks/volume_mgr.hpp b/src/include/homeblks/volume_mgr.hpp deleted file mode 100644 index 0bef760..0000000 --- a/src/include/homeblks/volume_mgr.hpp +++ /dev/null @@ -1,178 +0,0 @@ -#pragma once -#include -#include - -#include -#include -#include - -#include "common.hpp" - -namespace homeblocks { - -ENUM(VolumeError, uint16_t, UNKNOWN = 1, INVALID_ARG, TIMEOUT, UNKNOWN_VOLUME, UNSUPPORTED_OP, CRC_MISMATCH, - NO_SPACE_LEFT, DRIVE_WRITE_ERROR, INTERNAL_ERROR, INDEX_ERROR, VOLUME_OFFLINE); - -using lba_t = uint64_t; -using lba_count_t = uint32_t; - -class Volume; -using VolumePtr = shared< Volume >; - -// volume interface request should be freed only after IO is completed. -struct vol_interface_req : public sisl::ObjLifeCounter< vol_interface_req > { - uint8_t* buffer{nullptr}; - lba_t lba; - lba_count_t nlbas; - sisl::atomic_counter< int > refcount; - bool part_of_batch{false}; - uint64_t request_id; - VolumePtr vol{nullptr}; // back ref to the volume this request is associated with. - Clock::time_point io_start_time; // time when the request reaches homeblks. - Clock::time_point data_svc_start_time; // time when the request to data service starts. - Clock::time_point index_start_time; // time when the request to index service starts. - Clock::time_point journal_start_time; // time when the request to journal service starts. 
- - friend void intrusive_ptr_add_ref(vol_interface_req* req) { req->refcount.increment(1); } - friend void intrusive_ptr_release(vol_interface_req* req); - -public: - vol_interface_req(uint8_t* const buf, const uint64_t lba, const uint32_t nlbas, VolumePtr vol_ptr); - virtual ~vol_interface_req() = default; // override; sisl::ObjLifeCounter should have virtual destructor - virtual void free_yourself() { delete this; } - lba_t end_lba() const { return lba + nlbas - 1; } -}; - -using vol_interface_req_ptr = boost::intrusive_ptr< vol_interface_req >; - -struct VolumeInfo { - VolumeInfo() = default; - VolumeInfo(const VolumeInfo&) = delete; - VolumeInfo(VolumeInfo&& rhs) noexcept : - id(rhs.id), - size_bytes(rhs.size_bytes), - page_size(rhs.page_size), - name(std::move(rhs.name)), - ordinal(rhs.ordinal) {} - - VolumeInfo(volume_id_t id_in, uint64_t size, uint64_t psize, std::string in_name) : - id(id_in), size_bytes(size), page_size(psize), name(std::move(in_name)) {} - - VolumeInfo(volume_id_t id_in, uint64_t size, uint64_t psize, std::string in_name, uint64_t ord) : - id(id_in), size_bytes(size), page_size(psize), name(std::move(in_name)), ordinal(ord) {} - - volume_id_t id; - uint64_t size_bytes{0}; - uint64_t page_size{0}; - std::string name; - uint64_t ordinal = 0; - - auto operator<=>(VolumeInfo const& rhs) const { - return boost::uuids::hash_value(id) <=> boost::uuids::hash_value(rhs.id); - } - - auto operator==(VolumeInfo const& rhs) const { return id == rhs.id; } - - std::string to_string() { - return fmt::format("VolumeInfo: id={} size_bytes={}, page_size={}, name={} ordinal={}", - boost::uuids::to_string(id), size_bytes, page_size, name, ordinal); - } -}; - -ENUM(vol_state, uint32_t, - INIT, // initialized, but not ready online yet; - ONLINE, // online and ready to be used; - OFFLINE, // offline and not ready to be used; - DESTROYING, // being destroyed, this state will be used for vol-destroy crash recovery; - DESTROYED, // fully destroyed, currently not 
used, - // for future use of lazy-destroy, e.g. set destroyed and move forward, let the volume be destroyed in - // background; - READONLY // in read only mode; -); - -using VolumeInfoPtr = std::shared_ptr< VolumeInfo >; -struct VolumeStats { - volume_id_t id; - vol_state state; -#if 0 - // TODO: we don't maitain per volume stats right now, so these are not used. - // If there is a use case, we can enable this with support from repl dev layer; - uint64_t used_bytes; // total number of bytes used by all shards on this Volume; - uint64_t avail_bytes; // total number of bytes available on this Volume; - std::string to_string() { - return fmt::format("VolumeStats: id={} used_bytes={}, avail_bytes={}", boost::uuids::to_string(id), used_bytes, - avail_bytes); - } -#endif -}; - -class VolumeManager : public Manager< VolumeError > { -public: - virtual NullAsyncResult create_volume(VolumeInfo&& volume_info) = 0; - - virtual NullAsyncResult remove_volume(const volume_id_t& id) = 0; - - virtual VolumePtr lookup_volume(const volume_id_t& id) = 0; - - /** - * @brief Write the data to the volume asynchronously, created from the request. After completion the attached - * callback function will be called with this req ptr. - * - * @param vol Pointer to the volume - * @param req Request created which contains all the write parameters - * req.part_of_batch field can be used if this request is part of a batch request. If so, implementation can wait - * for batch_submit call before issuing the writes. IO might already be started or even completed (in case of - * errors) before batch_sumbit call, so application cannot assume IO will be started only after submit_batch call. - * - * @return std::error_condition no_error or error in issuing writes - */ - virtual NullAsyncResult write(const VolumePtr& vol, const vol_interface_req_ptr& req) = 0; - - /** - * @brief Read the data from the volume asynchronously, created from the request. 
After completion the attached - * callback function will be called with this req ptr. - * - * @param vol Pointer to the volume - * @param req Request created which contains all the read parameters - * req.part_of_batch field can be used if this request is part of a batch request. If so, implementation can wait - * for batch_submit call before issuing the reads. IO might already be started or even completed (in case of errors) - * before batch_sumbit call, so application cannot assume IO will be started only after submit_batch call. - * - * @return std::error_condition no_error or error in issuing reads - */ - virtual NullAsyncResult read(const VolumePtr& vol, const vol_interface_req_ptr& req) = 0; - - /** - * @brief unmap the given block range - * - * @param vol Pointer to the volume - * @param req Request created which contains all the read parameters - */ - virtual NullAsyncResult unmap(const VolumePtr& vol, const vol_interface_req_ptr& req) = 0; - - /** - * @brief Submit the io batch, which is a mandatory method to be called if read/write are issued with part_of_batch - * is set to true. In those cases, without this method, IOs might not be even issued. No-op if previous io requests - * are not part of batch. - */ - virtual void submit_io_batch() = 0; - - /** - * Retrieves the statistics for a specific Volume identified by its ID. - * - * @param id The ID of the Volume. - * @param stats The reference to the VolumeStats object where the statistics will be stored. - * @return True if the statistics were successfully retrieved, false otherwise (e.g. id not found). - */ - virtual bool get_stats(volume_id_t id, VolumeStats& stats) const = 0; - - /** - * @brief Retrieves the list of volume_ids. - * - * This function retrieves the list of volume_ids and stores them in the provided vector. - * - * @param vol_ids The vector to store the volume ids. 
- */ - virtual void get_volume_ids(std::vector< volume_id_t >& vol_ids) const = 0; -}; -} // namespace homeblocks diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 4d88dbe..fad3060 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -14,7 +14,6 @@ target_sources(${PROJECT_NAME}_core PRIVATE homeblks_impl.cpp volume_mgr.cpp listener.cpp - common.cpp ) target_link_libraries(${PROJECT_NAME}_core ${COMMON_DEPS} @@ -30,5 +29,6 @@ settings_gen_cpp( #add_subdirectory(homestore_backend) #add_subdirectory(memory_backend) +add_subdirectory(craft) add_subdirectory(volume) add_subdirectory(tests) diff --git a/src/lib/common.cpp b/src/lib/common.cpp deleted file mode 100644 index 6b37f52..0000000 --- a/src/lib/common.cpp +++ /dev/null @@ -1,23 +0,0 @@ - -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. 
- * - *********************************************************************************/ -#include -#include - -namespace homeblocks { - -homestore::uuid_t hb_utils::gen_random_uuid() { return boost::uuids::random_generator()(); } - -} // namespace homeblocks diff --git a/src/lib/coro_helpers.hpp b/src/lib/coro_helpers.hpp new file mode 100644 index 0000000..7f1a361 --- /dev/null +++ b/src/lib/coro_helpers.hpp @@ -0,0 +1,70 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +// homeblocks coroutine bridge helpers -- the two places we cross between synchronous, non-coroutine code and +// the stdexec/sisl::async::task world +// - sync_get(task) : block the caller until the task completes and return its value. Safe only OFF a reactor (test +// mains, sync control-plane); blocking a reactor here would park its iomgr loop (see +// sync-get-on-reactor-deadlock note). +// - detach(task) : fire-and-forget a coroutine from a non-coroutine context (e.g. a void on_commit/timer +// callback). exec::task is lazy, so an un-awaited task never runs -- this starts it. +// +// homestore keeps an equivalent (internal, unexported) src/lib/common/coro_helpers.hpp; we keep our own minimal +// copy rather than depend on its internals. 
+ +#include +#include +#include + +#include +#include + +#include +#include + +namespace homeblocks::detail { + +// Block the calling thread until the task completes and return its value (void for task). The task is +// fulfilled by other (reactor) threads; sync_wait drains a run_loop here. Do NOT call on an iomgr reactor. +template < typename Task > +inline auto sync_get(Task&& task) { + auto result = stdexec::sync_wait(std::forward< Task >(task)).value(); + if constexpr (std::tuple_size_v< decltype(result) > == 0) { + return; + } else { + return std::get< 0 >(std::move(result)); + } +} + +// Fire-and-forget a coroutine whose result we don't need. The task is taken by value (copied into the +// self-owning wrapper frame); the wrapper swallows exceptions so a throwing body can't reach start_detached's +// receiver (which would std::terminate) -- tasks normally complete errors-as-values, so this is a backstop. +// write_env injects an inline scheduler so the sticky-affinity exec::task can start without an enclosing +// scheduler (it resumes inline on whatever thread completes its awaited work) -- same idiom as sisl's when_all. +template < typename T > +inline void detach(sisl::async::task< T > task) { + auto wrapper = [](sisl::async::task< T > t) -> sisl::async::task< void > { + try { + co_await std::move(t); + } catch (const std::exception& e) { LOGERROR("Detached task threw, swallowing: {}", e.what()); } catch (...) 
{ + LOGERROR("Detached task threw an unknown exception, swallowing"); + } + }(std::move(task)); + stdexec::start_detached( + stdexec::write_env(std::move(wrapper), stdexec::prop{stdexec::get_scheduler, exec::inline_scheduler{}})); +} + +} // namespace homeblocks::detail diff --git a/src/lib/craft/CMakeLists.txt b/src/lib/craft/CMakeLists.txt new file mode 100644 index 0000000..45ec4f5 --- /dev/null +++ b/src/lib/craft/CMakeLists.txt @@ -0,0 +1,9 @@ +cmake_minimum_required (VERSION 3.11) + +add_library(${PROJECT_NAME}_craft OBJECT) +target_sources(${PROJECT_NAME}_craft PRIVATE + craft_repl_dev.cpp +) +target_link_libraries(${PROJECT_NAME}_craft + ${COMMON_DEPS} +) \ No newline at end of file diff --git a/src/lib/craft/craft_repl_dev.cpp b/src/lib/craft/craft_repl_dev.cpp new file mode 100644 index 0000000..34ce8ad --- /dev/null +++ b/src/lib/craft/craft_repl_dev.cpp @@ -0,0 +1,110 @@ +/********************************************************************************* + * Modifications Copyright 2026 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ + +#include "craft_repl_dev.hpp" + +namespace homeblocks { + +// ─── constructor ────────────────────────────────────────────────────────────── + +CraftReplDev::CraftReplDev(volume_id_t vol_id, unique< CraftJournalBackend > journal) + : vol_id_{vol_id}, journal_{std::move(journal)}, raft_listener_{this} {} + +// ─── get_lsns / get_rs_commit_lsn ──────────────────────────────────────────── +// These are real: they just snapshot the in-memory partition state. + +async_result< LSNPair > CraftReplDev::get_lsns(volume_id_t /* vol_id */) { + co_return LSNPair{state_.commit_lsn, state_.last_append_lsn}; +} + +async_result< LSNPair > CraftReplDev::get_rs_commit_lsn() { + co_return LSNPair{state_.commit_lsn, state_.last_append_lsn}; +} + +// ─── stubs (S2–S7 will implement these) ────────────────────────────────────── + +async_result< LoginResult > CraftReplDev::login(uint64_t /* client_token */, + volume_id_t /* vol_id */) { + LOGW("CraftReplDev::login not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +async_status CraftReplDev::write(uint64_t /* term */, int64_t /* lsn */, int64_t /* glsn */, + lba_t /* lba */, lba_count_t /* len */, + sisl::sg_list /* data */) { + LOGW("CraftReplDev::write not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +async_result< sisl::sg_list > CraftReplDev::read(uint64_t /* term */, + int64_t /* min_commit_lsn */, + lba_t /* lba */, lba_count_t /* len */) { + LOGW("CraftReplDev::read not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +async_status CraftReplDev::commit(uint64_t /* term */, int64_t /* lsn */) { + LOGW("CraftReplDev::commit not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +async_status 
CraftReplDev::keep_alive(int64_t /* commit_lsn */) { + LOGW("CraftReplDev::keep_alive not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +async_status CraftReplDev::truncate(int64_t /* lsn */) { + LOGW("CraftReplDev::truncate not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +async_status CraftReplDev::append(int64_t /* sync_to */, uint64_t /* client_token */) { + LOGW("CraftReplDev::append not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +async_result< std::vector< JournalSlot > > +CraftReplDev::fetch_data(std::vector< int64_t > /* lsns */) { + LOGW("CraftReplDev::fetch_data not yet implemented"); + co_return std::unexpected(std::make_error_condition(std::errc::not_supported)); +} + +// ─── RAFT listener ──────────────────────────────────────────────────────────── + +void CraftReplDev::CraftRaftListener::on_commit( + int64_t lsn, sisl::blob const& /* header */, sisl::blob const& /* key */, + std::vector< homestore::multi_blk_id > const& /* blkids */, + cintrusive< homestore::repl_req_ctx >& /* ctx */) { + // S5 will parse the entry type from `header` and dispatch to + // owner_->apply_sync_rs_commit_lsn() or owner_->apply_internal_login(). + LOGD("CraftRaftListener::on_commit lsn={} (entry dispatch not yet implemented)", lsn); +} + +// ─── RAFT apply helpers (S5 will implement) ─────────────────────────────────── + +void CraftReplDev::apply_sync_rs_commit_lsn(int64_t rs_commit_lsn, + uint64_t /* client_token */) { + // Advance commit_lsn to rs_commit_lsn, calling fetch_data() for any missing slots. 
+ LOGD("apply_sync_rs_commit_lsn rs_commit_lsn={} (not yet implemented)", rs_commit_lsn); +} + +void CraftReplDev::apply_internal_login(uint64_t client_token, uint64_t term) { + // Update state_.client_token and state_.term; subsequent IOs with a different + // term will be rejected with ETERM. + LOGD("apply_internal_login client_token={} term={} (not yet implemented)", client_token, + term); +} + +} // namespace homeblocks \ No newline at end of file diff --git a/src/lib/craft/craft_repl_dev.hpp b/src/lib/craft/craft_repl_dev.hpp new file mode 100644 index 0000000..986c89b --- /dev/null +++ b/src/lib/craft/craft_repl_dev.hpp @@ -0,0 +1,202 @@ +/********************************************************************************* + * Modifications Copyright 2026 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +#include "../hb_internal.hpp" +#include + +#include +#include +#include +#include + +namespace homeblocks { + +// ─── wire-protocol types ────────────────────────────────────────────────────── + +// Network address of a replica returned by login(). +struct replica_endpoint { + peer_id_t id; + std::string addr; // "host:port" +}; + +// Per-partition in-memory state. Authoritative in memory; recovered from +// journal + superblock on restart. 
+struct CraftPartitionState { + int64_t commit_lsn {-1}; // highest committed dLSN (applied to LBA index) + int64_t last_append_lsn {-1}; // highest appended dLSN (may be uncommitted) + uint64_t client_token {0}; // token from the last successful InternalLogin + uint64_t term {0}; // current RAFT term +}; + +// Returned by get_lsns() / get_rs_commit_lsn(). +struct LSNPair { + int64_t commit_lsn; + int64_t last_append_lsn; +}; + +// Returned by login(). +struct LoginResult { + std::vector< replica_endpoint > members; + int64_t dLSN; // starting LSN for new I/O + int64_t gLSN; // global (volume-level) LSN + uint64_t term; +}; + +// One journal slot, returned by fetch_data(). is_empty == true means the LSN +// never reached any replica and the data fields are invalid. +struct JournalSlot { + int64_t lsn; + bool is_empty{false}; + lba_t lba{0}; + lba_count_t len{0}; + sisl::sg_list data; +}; + +// ─── journal backend abstraction ───────────────────────────────────────────── +// +// Injected into CraftReplDev so unit tests can supply a mock without touching +// HomeStore. Production code passes HomeStoreCraftJournalBackend (defined in +// craft_repl_dev.cpp). + +class CraftJournalBackend { +public: + virtual async_status write_slot(int64_t lsn, lba_t lba, lba_count_t len, + sisl::sg_list data) = 0; + virtual async_result read_slot(int64_t lsn) = 0; + virtual async_status truncate_to(int64_t lsn) = 0; + virtual ~CraftJournalBackend() = default; +}; + +// ─── CraftReplDev ───────────────────────────────────────────────────────────── +// +// Parallel to HomeStore's ReplDisk. Each CRAFT-mode volume owns one instance +// instead of the solo repl_dev. Non-CRAFT volumes are unaffected. + +class CraftReplDev { +public: + explicit CraftReplDev(volume_id_t vol_id, unique< CraftJournalBackend > journal); + ~CraftReplDev() = default; + + // ── client-facing ────────────────────────────────────────────────────── + + // Full login sequence (leader-only). 
Serialized: at most one in-flight + // login per partition at a time. + async_result< LoginResult > login(uint64_t client_token, volume_id_t vol_id); + + // Append data to the journal at the client-assigned LSN slot. Zero-copy; + // does NOT apply data to the LBA index. + async_status write(uint64_t term, int64_t lsn, int64_t glsn, + lba_t lba, lba_count_t len, sisl::sg_list data); + + // Inline-commit up to min_commit_lsn if needed, then serve from the LBA index. + async_result< sisl::sg_list > read(uint64_t term, int64_t min_commit_lsn, + lba_t lba, lba_count_t len); + + // Apply journal entries (current_commit, lsn] to the LBA index. + async_status commit(uint64_t term, int64_t lsn); + + // Same as commit + reset the client-timeout watchdog. + async_status keep_alive(int64_t commit_lsn); + + // ── internal / peer API ──────────────────────────────────────────────── + + // Return {commit_lsn, last_append_lsn} for the local partition. + async_result< LSNPair > get_lsns(volume_id_t vol_id); + + // Alias of get_lsns exposed to peer servers during GetRSCommitLSN broadcast. + async_result< LSNPair > get_rs_commit_lsn(); + + // Drop all journal entries with dLSN > lsn; clear missing-set entries above lsn. + async_status truncate(int64_t lsn); + + // Propose a SyncRSCommitLSN RAFT entry (called by watchdog or leader during login). + async_status append(int64_t sync_to, uint64_t client_token); + + // Return raw journal data for the requested LSNs. Empty slots return + // JournalSlot{.is_empty=true} rather than an error. + async_result< std::vector< JournalSlot > > fetch_data(std::vector< int64_t > lsns); + +private: + // ── RAFT listener ────────────────────────────────────────────────────── + // + // Handles the two CRAFT RAFT entry types (SyncRSCommitLSN, InternalLogin). + // All other HomeStore callbacks are no-ops for this backend. 
+ + class CraftRaftListener : public homestore::repl_dev_listener { + public: + explicit CraftRaftListener(CraftReplDev* owner) : owner_{owner} {} + + // Dispatches on entry type; the real work is in the two apply_* helpers below. + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< homestore::multi_blk_id > const& blkids, + cintrusive< homestore::repl_req_ctx >& ctx) override; + + // ── no-ops ──────────────────────────────────────────────────────── + bool on_pre_commit(int64_t, const sisl::blob&, const sisl::blob&, + cintrusive< homestore::repl_req_ctx >&) override { return true; } + void on_error(homestore::ReplServiceError, const sisl::blob&, const sisl::blob&, + cintrusive< homestore::repl_req_ctx >&) override {} + homestore::result< homestore::blk_alloc_hints > + get_blk_alloc_hints(sisl::blob const&, uint32_t, + cintrusive< homestore::repl_req_ctx >&) override { + return homestore::blk_alloc_hints{}; + } + void on_destroy(const homestore::group_id_t&) override {} + void on_start_replace_member(const std::string&, const homestore::replica_member_info&, + const homestore::replica_member_info&, + homestore::trace_id_t) override {} + void on_complete_replace_member(const std::string&, const homestore::replica_member_info&, + const homestore::replica_member_info&, + homestore::trace_id_t) override {} + void on_clean_replace_member_task(const std::string&, const homestore::replica_member_info&, + const homestore::replica_member_info&, + homestore::trace_id_t) override {} + void on_remove_member(const homestore::replica_id_t&, homestore::trace_id_t) override {} + void on_rollback(int64_t, const sisl::blob&, const sisl::blob&, + cintrusive< homestore::repl_req_ctx >&) override {} + void on_restart() override {} + homestore::async_status + create_snapshot(std::shared_ptr< homestore::snapshot_context >) override { co_return homestore::ok(); } + bool apply_snapshot(std::shared_ptr< homestore::snapshot_context >) override { 
return true; } + std::shared_ptr< homestore::snapshot_context > last_snapshot() override { return nullptr; } + int read_snapshot_obj(std::shared_ptr< homestore::snapshot_context >, + std::shared_ptr< homestore::snapshot_obj >) override { return 0; } + void write_snapshot_obj(std::shared_ptr< homestore::snapshot_context >, + std::shared_ptr< homestore::snapshot_obj >) override {} + void free_user_snp_ctx(void*&) override {} + void on_no_space_left(homestore::repl_lsn_t, sisl::blob const&) override {} + void notify_committed_lsn(int64_t) override {} + void on_config_rollback(int64_t) override {} + + private: + CraftReplDev* owner_; + }; + + // Called from CraftRaftListener::on_commit after deserialising the entry type. + void apply_sync_rs_commit_lsn(int64_t rs_commit_lsn, uint64_t client_token); + void apply_internal_login(uint64_t client_token, uint64_t term); + + volume_id_t vol_id_; + unique< CraftJournalBackend > journal_; + CraftPartitionState state_; + std::set< int64_t > missing_lsns_; // gaps between commit_lsn and last_append_lsn + std::mutex missing_mu_; + bool login_in_progress_{false}; + std::mutex login_mu_; + CraftRaftListener raft_listener_; +}; + +} // namespace homeblocks \ No newline at end of file diff --git a/src/include/homeblks/common.hpp b/src/lib/hb_internal.hpp similarity index 66% rename from src/include/homeblks/common.hpp rename to src/lib/hb_internal.hpp index 4679505..d4f4361 100644 --- a/src/include/homeblks/common.hpp +++ b/src/lib/hb_internal.hpp @@ -1,4 +1,3 @@ - /********************************************************************************* * Modifications Copyright 2017-2019 eBay Inc. * @@ -15,20 +14,17 @@ *********************************************************************************/ #pragma once -#include +// Internal homeblocks prelude: the public API (home_blocks.hpp) plus implementation-only logging shorthand, +// size constants, and convenience aliases. NOT part of the public surface -- consumers include only +// . 
Implementation TUs/headers include this instead. -#include #include -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include -#pragma GCC diagnostic pop +#include #include #include #include -SISL_LOGGING_DECL(homeblocks); +#include #define HOMEBLOCKS_LOG_MODS homeblocks #define LOGT(...) LOGTRACEMOD(homeblocks, ##__VA_ARGS__) @@ -49,46 +45,23 @@ constexpr uint64_t Gi = Ki * Mi; #endif namespace homeblocks { -using peer_id_t = boost::uuids::uuid; -using volume_id_t = boost::uuids::uuid; -template < typename T > -using shared = std::shared_ptr< T >; +using lba_t = uint64_t; +using lba_count_t = uint32_t; -template < typename T > +template < class T > +using shared = std::shared_ptr< T >; +template < class T > using cshared = const std::shared_ptr< T >; - -template < typename T > +template < class T > using unique = std::unique_ptr< T >; - -template < typename T > +template < class T > using intrusive = boost::intrusive_ptr< T >; - -template < typename T > +template < class T > using cintrusive = const boost::intrusive_ptr< T >; - -template < typename T > +template < class T > using superblk = homestore::superblk< T >; -template < class E > -class Manager { -public: - template < typename T > - using Result = std::expected< T, E >; - template < typename T > - using AsyncResult = folly::Future< Result< T > >; - - using NullResult = Result< void >; - using NullAsyncResult = AsyncResult< void >; - - virtual ~Manager() = default; -}; - -class hb_utils { -public: - static homestore::uuid_t gen_random_uuid(); -}; - static constexpr uint32_t MAX_NUM_VOLUMES = 2048; } // namespace homeblocks diff --git a/src/lib/homeblks_impl.cpp b/src/lib/homeblks_impl.cpp index 27dae43..60c9f85 100644 --- a/src/lib/homeblks_impl.cpp +++ b/src/lib/homeblks_impl.cpp @@ -17,24 +17,24 @@ #include #include #include +#include #include #include #include #include "homeblks_impl.hpp" #include "listener.hpp" 
+#include "coro_helpers.hpp" #include "home_blks_config.hpp" -SISL_OPTION_GROUP(homeblocks, - (executor_type, "", "executor", "Executor to use for Future deferal", - ::cxxopts::value< std::string >()->default_value("immediate"), "immediate|cpu|io")); - SISL_LOGGING_DEF(HOMEBLOCKS_LOG_MODS) namespace homeblocks { -extern std::shared_ptr< HomeBlocks > init_homeblocks(std::weak_ptr< HomeBlocksApplication >&& application) { - LOGI("Initializing HomeBlocks with reaper thread timer: {} seconds", HB_DYNAMIC_CONFIG(reaper_thread_timer_secs)); - auto inst = std::make_shared< HomeBlocksImpl >(std::move(application)); +result< shared< home_blocks > > init_homeblocks(home_blocks_config cfg) { + LOGI("Initializing home_blocks with reaper thread timer: {} seconds", HB_DYNAMIC_CONFIG(reaper_thread_timer_secs)); + auto inst = std::make_shared< HomeBlocksImpl >(std::move(cfg)); + // TODO: surface operational init failures (on_svc_id RPC, device open, format) as result<> errors rather than + // aborting inside init_homestore(); precondition bugs (e.g. no devices) stay asserts. 
inst->init_homestore(); inst->init_cp(); inst->start_reaper_thread(); @@ -53,16 +53,16 @@ iomgr::drive_type HomeBlocksImpl::data_drive_type() const { } } -HomeBlocksStats HomeBlocksImpl::get_stats() const { +home_blocks_stats HomeBlocksImpl::get_stats() const { auto const stats = homestore::hs()->repl_service().get_cap_stats(); return {stats.total_capacity, stats.used_capacity}; } -folly::Future< folly::Unit > HomeBlocksImpl::shutdown_start() { +std::future< void > HomeBlocksImpl::shutdown_start() { LOGI("Setting shutdown start flag"); - shutdown_started_ = true; + shutdown_started_.test_and_set(); - auto f = shutdown_promise_.getFuture(); + auto f = shutdown_promise_.get_future(); auto const nsecs = shutdown_timer_nsecs(); LOGI("Setting shutdown timer with {} seconds", nsecs); @@ -129,7 +129,7 @@ void HomeBlocksImpl::do_shutdown() { if (can_shutdown()) { LOGI("No outstanding requests, proceeding with shutdown"); - shutdown_promise_.setValue(); + shutdown_promise_.set_value(); } else { LOGI("Outstanding requests exist, will retry shutdown in {} seconds", shutdown_timer_nsecs()); } @@ -142,7 +142,7 @@ void HomeBlocksImpl::shutdown() { // start timer thread if there are still outstanding jobs; auto f = shutdown_start(); - std::move(f).get(); + f.get(); // stop the timer thread if (shutdown_timer_hdl_ != iomgr::null_timer_handle) { @@ -155,42 +155,28 @@ void HomeBlocksImpl::shutdown() { sb_.write(); homestore::hs()->shutdown(); - homestore::HomeStore::reset_instance(); + homestore::home_store::reset_instance(); iomanager.stop(); reset_instance(); } -HomeBlocksImpl::HomeBlocksImpl(std::weak_ptr< HomeBlocksApplication >&& application) : - _application(std::move(application)), sb_{HB_META_NAME} { - auto exe_type = SISL_OPTIONS["executor"].as< std::string >(); - std::transform(exe_type.begin(), exe_type.end(), exe_type.begin(), ::tolower); - - if ("immediate" == exe_type) [[likely]] - executor_ = &folly::QueuedImmediateExecutor::instance(); - else if ("io" == exe_type) - 
executor_ = folly::getGlobalIOExecutor(); - else if ("cpu" == exe_type) - executor_ = folly::getGlobalCPUExecutor(); - else - RELEASE_ASSERT(false, "Unknown Folly Executor type: [{}]", exe_type); - LOGI("initialized with [executor={}]", exe_type); +HomeBlocksImpl::HomeBlocksImpl(home_blocks_config&& cfg) : config_(std::move(cfg)), sb_{HB_META_NAME} { ordinal_reserver_ = std::make_unique< sisl::IDReserver >(MAX_NUM_VOLUMES); } -DevType HomeBlocksImpl::get_device_type(std::string const& devname) { - const iomgr::drive_type dtype = iomgr::DriveInterface::get_drive_type(devname); - if (dtype == iomgr::drive_type::block_hdd || dtype == iomgr::drive_type::file_on_hdd) { return DevType::HDD; } - if (dtype == iomgr::drive_type::file_on_nvme || dtype == iomgr::drive_type::block_nvme) { return DevType::NVME; } - return DevType::UNSUPPORTED; +dev_type HomeBlocksImpl::get_device_type(std::string const& devname) { + const iomgr::drive_type dtype = iomgr::type_of(devname); + if (dtype == iomgr::drive_type::block_hdd || dtype == iomgr::drive_type::file_on_hdd) { return dev_type::HDD; } + if (dtype == iomgr::drive_type::file_on_nvme || dtype == iomgr::drive_type::block_nvme) { return dev_type::NVME; } + return dev_type::UNSUPPORTED; } // repl application to init homestore -class HBReplApp : public homestore::ReplApplication { +class HBReplApp : public homestore::repl_application { public: - HBReplApp(homestore::repl_impl_type impl_type, bool tl_consistency, HomeBlocksImpl* hb, - std::weak_ptr< HomeBlocksApplication > ho_app) : - impl_type_(impl_type), tl_consistency_(tl_consistency), hb_(hb), ho_app_(ho_app) {} + HBReplApp(homestore::repl_impl_type impl_type, bool tl_consistency, HomeBlocksImpl* hb) : + impl_type_(impl_type), tl_consistency_(tl_consistency), hb_(hb) {} // TODO: make this override after the base class in homestore adds a virtual destructor virtual ~HBReplApp() = default; @@ -201,7 +187,7 @@ class HBReplApp : public homestore::ReplApplication { bool 
need_timeline_consistency() const override { return tl_consistency_; } // this will be called by homestore when create_repl_dev is called; - std::shared_ptr< homestore::ReplDevListener > create_repl_dev_listener(homestore::group_id_t group_id) override { + std::shared_ptr< homestore::repl_dev_listener > create_repl_dev_listener(homestore::group_id_t group_id) override { return std::make_shared< HBListener >(hb_); #if 0 std::scoped_lock lock_guard(_repl_sm_map_lock); @@ -231,29 +217,28 @@ class HBReplApp : public homestore::ReplApplication { homestore::repl_impl_type impl_type_; bool tl_consistency_; // indicates whether this application needs timeline consistency; HomeBlocksImpl* hb_; - std::weak_ptr< HomeBlocksApplication > ho_app_; #if 0 std::map< homestore::group_id_t, std::shared_ptr< HBListener> > _repl_sm_map; std::mutex _repl_sm_map_lock; #endif }; -void HomeBlocksImpl::get_dev_info(shared< HomeBlocksApplication > app, std::vector< homestore::dev_info >& dev_info, - bool& has_data_dev, bool& has_fast_dev) { - for (auto const& dev : app->devices()) { +void HomeBlocksImpl::get_dev_info(std::vector< homestore::dev_info >& dev_info, bool& has_data_dev, + bool& has_fast_dev) { + for (auto const& dev : config_.devices) { auto input_dev_type = dev.type; auto detected_type = get_device_type(dev.path.string()); LOGD("Device {} detected as {}", dev.path.string(), detected_type); - auto final_type = (dev.type == DevType::AUTO_DETECT) ? detected_type : input_dev_type; - if (final_type == DevType::UNSUPPORTED) { + auto final_type = (dev.type == dev_type::AUTO_DETECT) ? 
detected_type : input_dev_type; + if (final_type == dev_type::UNSUPPORTED) { LOGW("Device {} is not supported, skipping", dev.path.string()); continue; } - if (input_dev_type != DevType::AUTO_DETECT && detected_type != final_type) { + if (input_dev_type != dev_type::AUTO_DETECT && detected_type != final_type) { LOGW("Device {} detected as {}, but input type is {}, using input type", dev.path.string(), detected_type, input_dev_type); } - auto hs_type = (final_type == DevType::HDD) ? homestore::HSDevType::Data : homestore::HSDevType::Fast; + auto hs_type = (final_type == dev_type::HDD) ? homestore::HSDevType::Data : homestore::HSDevType::Fast; if (hs_type == homestore::HSDevType::Data) { has_data_dev = true; } if (hs_type == homestore::HSDevType::Fast) { has_fast_dev = true; } dev_info.emplace_back(std::filesystem::canonical(dev.path).string(), hs_type); @@ -291,19 +276,15 @@ hs_chunk_size_cfg_t HomeBlocksImpl::get_chunk_size() const { } void HomeBlocksImpl::init_homestore() { - auto app = _application.lock(); - RELEASE_ASSERT(app, "HomeObjectApplication lifetime unexpected!"); - - LOGI("Starting iomgr with {} threads, spdk: {}", app->threads(), false); - ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = app->threads(), .is_spdk = app->spdk_mode()}) - .with_http_server(); + LOGI("Starting iomgr with {} threads", config_.threads); + ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = config_.threads}).with_http_server(); - const uint64_t app_mem_size = app->app_mem_size() * 1024 * 1024 * 1024; + const uint64_t app_mem_size = config_.app_mem_size_mb * 1024 * 1024; LOGI("Initialize and start HomeStore with app_mem_size = {}", app_mem_size); std::vector< homestore::dev_info > device_info; bool has_data_dev{false}, has_fast_dev{false}; - get_dev_info(app, device_info, has_data_dev, has_fast_dev); + get_dev_info(device_info, has_data_dev, has_fast_dev); RELEASE_ASSERT(device_info.size() != 0, "No supported devices found!"); @@ -319,8 +300,7 @@ void 
HomeBlocksImpl::init_homestore() { using namespace homestore; // Note: timeline_consistency doesn't matter as we are using solo repl dev; - auto repl_app = - std::make_shared< HBReplApp >(repl_impl_type::solo, false /*timeline_consistency*/, this, _application); + auto repl_app = std::make_shared< HBReplApp >(repl_impl_type::solo, false /*timeline_consistency*/, this); bool need_format = false; if (fc_on()) { need_format = homestore::hs() @@ -341,9 +321,15 @@ void HomeBlocksImpl::init_homestore() { auto const hs_chunk_sz = get_chunk_size(); if (need_format) { - auto ret = app->discover_svc_id(std::nullopt); - DEBUG_ASSERT(ret.has_value(), "UUID should be generated by application."); - our_uuid_ = ret.value(); + // Cold boot: ask the consumer's hook for our svc id (e.g. a gRPC to the OM), driven off-reactor here on + // the init thread. Empty hook -> generate one. + if (config_.on_svc_id) { + auto ret = detail::sync_get(config_.on_svc_id()); + RELEASE_ASSERT(ret.has_value(), "on_svc_id hook failed to provide a svc id"); + our_uuid_ = ret.value(); + } else { + our_uuid_ = boost::uuids::random_generator()(); + } LOGINFO("We are starting for the first time on svc_id: [{}]. Formatting HomeStore. ", boost::uuids::to_string(our_uuid())); if (has_data_dev && has_fast_dev) { @@ -394,8 +380,8 @@ void HomeBlocksImpl::init_homestore() { } else { // we are starting on an existing system; DEBUG_ASSERT(our_uuid() != boost::uuids::nil_uuid(), "UUID should be recovered from HB superblock!"); - // now callback to application to nofity the uuid so that we are treated as an existing system; - app->discover_svc_id(our_uuid()); + // Reboot: svc id recovered from the superblock; on_svc_id is cold-boot-only, so nothing to call here. A + // consumer that needs to re-announce on restart reads home_blocks::our_uuid() after init. 
LOGINFO("We are starting on [{}].", boost::uuids::to_string(our_uuid_)); } @@ -414,7 +400,7 @@ void HomeBlocksImpl::superblk_init() { } void HomeBlocksImpl::on_hb_meta_blk_found(sisl::byte_view const& buf, void* cookie) { - sb_.load(buf, cookie); + sb_.load(buf, static_cast< homestore::meta_blk* >(cookie)); // sb verification RELEASE_ASSERT_EQ(sb_->version, HB_SB_VER); RELEASE_ASSERT_EQ(sb_->magic, HB_SB_MAGIC); @@ -447,7 +433,7 @@ void HomeBlocksImpl::register_metablk_cb() { homestore::hs()->meta_service().register_handler( HB_META_NAME, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { - on_hb_meta_blk_found(std::move(buf), voidptr_cast(mblk)); + on_hb_meta_blk_found(std::move(buf), reinterpret_cast< void* >(mblk)); }, nullptr /*recovery_comp_cb*/, true /* do_crc */); } @@ -457,16 +443,16 @@ void HomeBlocksImpl::on_init_complete() { // Add anything that needs to be done here. using namespace homestore; - // Volume SB + // volume SB homestore::hs()->meta_service().register_handler( - Volume::VOL_META_NAME, + volume::VOL_META_NAME, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { - on_vol_meta_blk_found(std::move(buf), voidptr_cast(mblk)); + on_vol_meta_blk_found(std::move(buf), reinterpret_cast< void* >(mblk)); }, nullptr /*recovery_comp_cb*/, true /* do_crc */, std::optional< meta_subtype_vec_t >({homestore::hs()->repl_service().get_meta_blk_name()})); - homestore::hs()->meta_service().read_sub_sb(Volume::VOL_META_NAME); + homestore::hs()->meta_service().read_sub_sb(volume::VOL_META_NAME); } void HomeBlocksImpl::init_cp() {} @@ -492,7 +478,7 @@ void HomeBlocksImpl::start_reaper_thread() { void HomeBlocksImpl::vol_gc() { LOGI("Running volume garbage collection"); // loop through every volume and call remove volume if volume's ref_cnt is zero; - std::vector< VolumePtr > vols_to_remove; + std::vector< volume_handle > vols_to_remove; { auto lg = std::shared_lock(vol_lock_); for (auto& vol_pair : vol_map_) { @@ -511,7 +497,9 @@ 
void HomeBlocksImpl::vol_gc() { for (auto& vol : vols_to_remove) { LOGI("Garbage Collecting removed volume with id: {}", vol->id_str()); - remove_volume(vol->id()); + // remove_volume is a coroutine (async_status) whose actual work is scheduled on a worker; we don't await + // it here, so start it fire-and-forget (an un-awaited lazy task would otherwise never run). + detail::detach(remove_volume(vol->id())); } } @@ -524,9 +512,9 @@ bool HomeBlocksImpl::fc_on() const { return HB_DYNAMIC_CONFIG(fault_containment_on); } -void HomeBlocksImpl::exit_fc(VolumePtr& vol) { vol->state_change(vol_state::ONLINE); } +void HomeBlocksImpl::exit_fc(volume_handle& vol) { vol->state_change(volume_state::ONLINE); } -void HomeBlocksImpl::fault_containment(const VolumePtr vol, const std::string& reason) { +void HomeBlocksImpl::fault_containment(const volume_handle vol, const std::string& reason) { if (vol == nullptr) { // Put entire HB into offline; std::scoped_lock lg(sb_lock_); @@ -537,9 +525,9 @@ void HomeBlocksImpl::fault_containment(const VolumePtr vol, const std::string& r return; } - LOGI("Volume {} is in fault containment due to: {}", vol->id_str(), reason); + LOGI("volume {} is in fault containment due to: {}", vol->id_str(), reason); // if volume is in fault containment, we should not allow any new requests to be issued on it; - vol->state_change(vol_state::OFFLINE); + vol->state_change(volume_state::OFFLINE); } shared< HomeBlocksImpl > HomeBlocksImpl::s_instance_ = nullptr; diff --git a/src/lib/homeblks_impl.hpp b/src/lib/homeblks_impl.hpp index cefeb19..ec6f800 100644 --- a/src/lib/homeblks_impl.hpp +++ b/src/lib/homeblks_impl.hpp @@ -15,32 +15,37 @@ *********************************************************************************/ #pragma once +#include +#include #include #include #include #include #include -#include +#include #include #include #include #include -#include -#include -#include +#include "hb_internal.hpp" #include "volume/volume.hpp" #include 
"volume/volume_chunk_selector.hpp" namespace homeblocks { -class Volume; +class volume; struct hs_chunk_size_cfg_t { uint64_t index; uint64_t data; }; -class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enable_shared_from_this< HomeBlocksImpl > { +class HomeBlocksImpl : public home_blocks, public std::enable_shared_from_this< HomeBlocksImpl > { + // The free data-plane functions need the system-level guards (restricted / shutting-down / fake-io delay). + friend async_result< size_t > async_read(volume_handle const&, uint64_t, sisl::sg_list); + friend async_result< size_t > async_write(volume_handle const&, uint64_t, sisl::sg_list); + friend async_status async_unmap(volume_handle const&, uint64_t, uint64_t); + struct homeblks_sb_t { uint64_t magic; uint32_t version; @@ -65,13 +70,12 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab static constexpr uint64_t MAX_VOL_IO_SIZE = 1 * Mi; // 1 MiB private: - /// Our SvcId retrieval and SvcId->IP mapping - std::weak_ptr< HomeBlocksApplication > _application; - folly::Executor::KeepAlive<> executor_; + /// bring-up configuration (devices, threads, app_mem_size, on_svc_id hook) + home_blocks_config config_; - /// Volume management + /// volume management mutable std::shared_mutex vol_lock_; - std::map< volume_id_t, VolumePtr > vol_map_; + std::map< volume_id_t, volume_handle > vol_map_; // index table map which only used during recovery; mutable std::shared_mutex index_lock_; @@ -87,10 +91,10 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab std::unique_ptr< sisl::IDReserver > ordinal_reserver_; sisl::atomic_counter< uint64_t > outstanding_reqs_{0}; - bool shutdown_started_{false}; + std::atomic_flag shutdown_started_{}; std::atomic< bool > is_restricted_{false}; // avoid taking lock in IO path; - folly::Promise< folly::Unit > shutdown_promise_; + std::promise< void > shutdown_promise_; iomgr::timer_handle_t 
vol_gc_timer_hdl_{iomgr::null_timer_handle}; iomgr::timer_handle_t shutdown_timer_hdl_{iomgr::null_timer_handle}; @@ -99,7 +103,7 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab static shared< HomeBlocksImpl > s_instance_; public: - explicit HomeBlocksImpl(std::weak_ptr< HomeBlocksApplication >&& application); + explicit HomeBlocksImpl(home_blocks_config&& cfg); ~HomeBlocksImpl() override = default; HomeBlocksImpl(const HomeBlocksImpl&) = delete; @@ -107,11 +111,8 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab HomeBlocksImpl& operator=(const HomeBlocksImpl&) = delete; HomeBlocksImpl& operator=(HomeBlocksImpl&&) noexcept = delete; - shared< VolumeManager > volume_manager() final; - - /// HomeBlocks - /// Returns the UUID of this HomeBlocks. - HomeBlocksStats get_stats() const final; + /// home_blocks + home_blocks_stats get_stats() const final; iomgr::drive_type data_drive_type() const final; peer_id_t our_uuid() const final { return our_uuid_; } @@ -120,26 +121,15 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab void shutdown() final; - /// VolumeManager - NullAsyncResult create_volume(VolumeInfo&& vol_info) final; - - NullAsyncResult remove_volume(const volume_id_t& id) final; - - VolumePtr lookup_volume(const volume_id_t& id) final; - - NullAsyncResult write(const VolumePtr& vol, const vol_interface_req_ptr& req) final; + /// volume control plane + [[nodiscard]] async_result< volume_handle > create_volume(volume_info info) final; - NullAsyncResult read(const VolumePtr& vol, const vol_interface_req_ptr& req) final; + [[nodiscard]] async_status remove_volume(const volume_id_t& id) final; - NullAsyncResult unmap(const VolumePtr& vol, const vol_interface_req_ptr& req) final; + [[nodiscard]] result< volume_handle > get_volume(const volume_id_t& id) const final; - // Submit the io batch, which is a mandatory method to be called if read/write are issued - // with 
part_of_batchis set to true. - void submit_io_batch() final; - - // see api comments in base class; - bool get_stats(volume_id_t id, VolumeStats& stats) const final; - void get_volume_ids(std::vector< volume_id_t >& vol_ids) const final; + [[nodiscard]] result< volume_stats > get_stats(volume_id_t id) const final; + std::vector< volume_id_t > volume_ids() const final; // Index shared< hs_index_table_t > recover_index_table(homestore::superblk< homestore::index_table_sb >&& sb); @@ -153,14 +143,15 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab void on_init_complete(); void on_write(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - const std::vector< homestore::MultiBlkId >& blkids, cintrusive< homestore::repl_req_ctx >& ctx); + const std::vector< homestore::multi_blk_id >& blkids, cintrusive< homestore::repl_req_ctx >& ctx); void start_reaper_thread(); - void fault_containment(const VolumePtr vol, const std::string& reason = ""); + void fault_containment(const volume_handle vol, const std::string& reason = ""); bool fc_on() const; - void exit_fc(VolumePtr& vol); + void exit_fc(volume_handle& vol); bool is_restricted() const { return is_restricted_.load(); } + bool is_shutting_down() const { return shutdown_started_.test(); } hs_chunk_size_cfg_t get_chunk_size() const; bool is_graceful_shutdown() const { return gracefully_shutdown_; } @@ -179,11 +170,9 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab void superblk_init(); void register_metablk_cb(); - void get_dev_info(shared< HomeBlocksApplication > app, std::vector< homestore::dev_info >& device_info, - bool& has_data_dev, bool& has_fast_dev); + void get_dev_info(std::vector< homestore::dev_info >& device_info, bool& has_data_dev, bool& has_fast_dev); - DevType get_device_type(std::string const& devname); - auto defer() const { return folly::makeSemiFuture().via(executor_); } + dev_type get_device_type(std::string const& devname); 
void update_vol_sb_cb(uint64_t volume_ordinal, const std::vector< chunk_num_t >& chunk_ids); @@ -193,24 +182,27 @@ class HomeBlocksImpl : public HomeBlocks, public VolumeManager, public std::enab void vol_gc(); + // Coroutine driving the actual volume teardown (co_awaits volume::destroy); launched via detach on a + // worker reactor by remove_volume so the reactor yields during the destroy's CP flush instead of parking. + sisl::async::task< void > do_remove_volume(volume_id_t id); + uint64_t gc_timer_nsecs() const; void inc_ref(uint64_t n = 1) { outstanding_reqs_.increment(n); } void dec_ref(uint64_t n = 1) { outstanding_reqs_.decrement(n); } - bool is_shutting_down() const { return shutdown_started_; } bool can_shutdown() const; bool no_outstanding_vols() const; - folly::Future< folly::Unit > shutdown_start(); + std::future< void > shutdown_start(); void do_shutdown(); uint64_t shutdown_timer_nsecs() const; #ifdef _PRERELEASE // For testing purpose only // If delay flip is not set, false will be returned; - // If delay flip is set, it will delay the IOs for a given VolumePtr - bool delay_fake_io(VolumePtr vol); + // If delay flip is set, it will delay the IOs for a given volume_handle + bool delay_fake_io(volume_handle vol); bool crash_simulated_{false}; #endif }; @@ -247,11 +239,15 @@ class HBFCSvcCB : public homestore::FaultContainmentCallback { } auto vol_id = static_cast< volume_id_t* >(cookie); - auto vol = hb_->lookup_volume(*vol_id); + auto vol = hb_->get_volume(*vol_id); + if (!vol) { + LOGW("Fault containment event for unknown volume {}, ignoring", boost::uuids::to_string(*vol_id)); + return; + } if (event == homestore::FaultContainmentEvent::ENTER) { - hb_->fault_containment(vol, reason); + hb_->fault_containment(*vol, reason); } else if (event == homestore::FaultContainmentEvent::EXIT) { - hb_->exit_fc(vol); + hb_->exit_fc(*vol); } } diff --git a/src/lib/listener.cpp b/src/lib/listener.cpp index b7b51ed..c1f67ea 100644 --- a/src/lib/listener.cpp +++ 
b/src/lib/listener.cpp @@ -18,12 +18,12 @@ namespace homeblocks { void HBListener::on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, - std::vector< homestore::MultiBlkId > const& blkids, + std::vector< homestore::multi_blk_id > const& blkids, cintrusive< homestore::repl_req_ctx >& ctx) { // on_commit called whenever journal has flushed log entries. header contains the msg type and volume // id, key contains the list of checksum, list of old blkids for write case. blkid's are the new blkid's // where data is written. - const MsgHeader* msg_header = r_cast< const MsgHeader* >(header.cbytes()); + const MsgHeader* msg_header = reinterpret_cast< const MsgHeader* >(header.cbytes()); switch (msg_header->msg_type) { case MsgType::WRITE: hb_->on_write(lsn, header, key, blkids, ctx); @@ -44,7 +44,7 @@ bool HBListener::on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl void HBListener::on_error(homestore::ReplServiceError error, const sisl::blob& header, const sisl::blob& key, cintrusive< homestore::repl_req_ctx >& ctx) {} -homestore::ReplResult< homestore::blk_alloc_hints > +homestore::result< homestore::blk_alloc_hints > HBListener::get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) { return homestore::blk_alloc_hints(); diff --git a/src/lib/listener.hpp b/src/lib/listener.hpp index e035749..59e8ca8 100644 --- a/src/lib/listener.hpp +++ b/src/lib/listener.hpp @@ -15,26 +15,21 @@ *********************************************************************************/ #pragma once -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include -#pragma GCC diagnostic pop -#include +#include #include "homeblks_impl.hpp" namespace homeblocks { class HomeBlocksImpl; -class HBListener : public homestore::ReplDevListener { +class HBListener : public homestore::repl_dev_listener { public: explicit 
HBListener(HomeBlocksImpl* hb) : hb_(hb) {} ~HBListener() = default; void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, - std::vector< homestore::MultiBlkId > const& blkids, + std::vector< homestore::multi_blk_id > const& blkids, cintrusive< homestore::repl_req_ctx >& ctx) override; bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, @@ -43,10 +38,10 @@ class HBListener : public homestore::ReplDevListener { void on_error(homestore::ReplServiceError error, const sisl::blob& header, const sisl::blob& key, cintrusive< homestore::repl_req_ctx >& ctx) override; - homestore::ReplResult< homestore::blk_alloc_hints > + homestore::result< homestore::blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override; - // group_id is the uuid generated by HomeBlocks when create_volume->create_repl_dev(gid) is called; + // group_id is the uuid generated by home_blocks when create_volume->create_repl_dev(gid) is called; // when volume is being destroyed, on_destroy is going to be triggered to listener that this volume's gid is // destroyed; void on_destroy(const homestore::group_id_t& group_id) override; @@ -64,8 +59,8 @@ class HBListener : public homestore::ReplDevListener { cintrusive< homestore::repl_req_ctx >& ctx) override {} void on_restart() override { LOGD("HBListener::on_restart()"); } /// @brief Called when the snapshot is being created by nuraft; - homestore::AsyncReplResult<> create_snapshot(std::shared_ptr< homestore::snapshot_context > context) override { - return folly::makeSemiFuture< homestore::ReplResult< folly::Unit > >(folly::Unit{}); + homestore::async_status create_snapshot(std::shared_ptr< homestore::snapshot_context > context) override { + co_return homestore::ok(); } bool apply_snapshot(std::shared_ptr< homestore::snapshot_context > context) override { return true; } std::shared_ptr< homestore::snapshot_context > 
last_snapshot() override { return nullptr; } diff --git a/src/lib/memory_backend/CMakeLists.txt b/src/lib/memory_backend/CMakeLists.txt index 2290273..ea22f0f 100644 --- a/src/lib/memory_backend/CMakeLists.txt +++ b/src/lib/memory_backend/CMakeLists.txt @@ -19,4 +19,4 @@ target_link_libraries(memory_test ${COMMON_TEST_DEPS} -rdynamic ) -add_test(NAME MemoryTestCPU COMMAND memory_test -csv error --executor cpu) +add_test(NAME MemoryTestCPU COMMAND memory_test -csv error) diff --git a/src/lib/memory_backend/mem_homeblks.cpp b/src/lib/memory_backend/mem_homeblks.cpp index 63d3213..ea4f088 100644 --- a/src/lib/memory_backend/mem_homeblks.cpp +++ b/src/lib/memory_backend/mem_homeblks.cpp @@ -3,13 +3,10 @@ namespace homeblocks { /// NOTE: We give ourselves the option to provide a different HR instance here than libhomeblocks.a -extern std::shared_ptr< HomeBlocks > init_homeblocks(std::weak_ptr< HomeBlocksApplication >&& application) { - return std::make_shared< MemoryHomeBlocks >(std::move(application)); +result< shared< home_blocks > > init_homeblocks(home_blocks_config cfg) { + return shared< home_blocks >(std::make_shared< MemoryHomeBlocks >(std::move(cfg))); } -MemoryHomeBlocks::MemoryHomeBlocks(std::weak_ptr< HomeBlocksApplication >&& application) : - HomeBlocksImpl::HomeBlocksImpl(std::move(application)) { - // _our_id = _application.lock()->discover_svcid(std::nullopt); -} +MemoryHomeBlocks::MemoryHomeBlocks(home_blocks_config&& cfg) : HomeBlocksImpl::HomeBlocksImpl(std::move(cfg)) {} } // namespace homeblocks diff --git a/src/lib/memory_backend/mem_homeblks.hpp b/src/lib/memory_backend/mem_homeblks.hpp index 8f8fa97..503c304 100644 --- a/src/lib/memory_backend/mem_homeblks.hpp +++ b/src/lib/memory_backend/mem_homeblks.hpp @@ -3,14 +3,13 @@ #include #include -#include #include "lib/homeblks_impl.hpp" namespace homeblocks { class MemoryHomeBlocks : public HomeBlocksImpl { public: - MemoryHomeBlocks(std::weak_ptr< HomeBlocksApplication >&& application); + 
MemoryHomeBlocks(home_blocks_config&& cfg); ~MemoryHomeBlocks() override = default; }; diff --git a/src/lib/tests/fixture_app.cpp b/src/lib/tests/fixture_app.cpp index dc0296e..f3d93bb 100644 --- a/src/lib/tests/fixture_app.cpp +++ b/src/lib/tests/fixture_app.cpp @@ -1,21 +1,18 @@ #include -#include #include #include -#include +#include "hb_internal.hpp" SISL_LOGGING_INIT(HOMEBLOCKS_LOG_MODS) -SISL_OPTIONS_ENABLE(logging, homeblocks) +SISL_OPTIONS_ENABLE(logging) int main(int argc, char* argv[]) { int parsed_argc = argc; ::testing::InitGoogleTest(&parsed_argc, argv); - SISL_OPTIONS_LOAD(parsed_argc, argv, logging, homeblocks); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging); sisl::logging::SetLogger(std::string(argv[0])); sisl::logging::SetLogPattern("[%D %T%z] [%^%L%$] [%t] %v"); - parsed_argc = 1; - auto f = ::folly::Init(&parsed_argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/src/lib/volume/CMakeLists.txt b/src/lib/volume/CMakeLists.txt index e1a7605..47739ce 100644 --- a/src/lib/volume/CMakeLists.txt +++ b/src/lib/volume/CMakeLists.txt @@ -2,15 +2,16 @@ cmake_minimum_required (VERSION 3.11) add_flags("-Wno-unused-parameter") -add_library("${PROJECT_NAME}_volume") +add_library("${PROJECT_NAME}") -target_sources("${PROJECT_NAME}_volume" PRIVATE +target_sources("${PROJECT_NAME}" PRIVATE volume.cpp volume_chunk_selector.cpp $ + $ ) -target_link_libraries("${PROJECT_NAME}_volume" PUBLIC +target_link_libraries("${PROJECT_NAME}" PUBLIC homestore::homestore ${COMMON_DEPS} ) diff --git a/src/lib/volume/index_fixed_kv.hpp b/src/lib/volume/index_fixed_kv.hpp index 76ad925..86d827b 100644 --- a/src/lib/volume/index_fixed_kv.hpp +++ b/src/lib/volume/index_fixed_kv.hpp @@ -3,10 +3,10 @@ #include #include #include -#include +#include #include -using homestore::BlkId; +using homestore::blk_id; using homestore::BtreeKey; using homestore::BtreeValue; @@ -15,8 +15,8 @@ using lba_t = std::uint64_t; struct BlockInfo { // Checksum calculated on new data and written to 
new_blkid. - homestore::BlkId new_blkid; - homestore::BlkId old_blkid; + homestore::blk_id new_blkid; + homestore::blk_id old_blkid; homestore::csum_t new_checksum; homestore::csum_t old_checksum{0}; }; @@ -33,7 +33,7 @@ class VolumeIndexKey : public homestore::BtreeKey { VolumeIndexKey(const VolumeIndexKey& other) = default; VolumeIndexKey(const BtreeKey& other) : VolumeIndexKey(other.serialize(), true) {} VolumeIndexKey(const sisl::blob& b, bool copy) : homestore::BtreeKey() { - VolumeIndexKey const* other = r_cast< VolumeIndexKey const* >(b.cbytes()); + VolumeIndexKey const* other = reinterpret_cast< VolumeIndexKey const* >(b.cbytes()); m_lba = other->m_lba; } @@ -45,7 +45,7 @@ class VolumeIndexKey : public homestore::BtreeKey { /////////////////// Overriding methods of BtreeKey ///////////////// int compare(homestore::BtreeKey const& o) const override { - VolumeIndexKey const& other = s_cast< VolumeIndexKey const& >(o); + VolumeIndexKey const& other = static_cast< VolumeIndexKey const& >(o); if (m_lba < other.m_lba) { return -1; } else if (m_lba > other.m_lba) { @@ -56,14 +56,15 @@ class VolumeIndexKey : public homestore::BtreeKey { } sisl::blob serialize() const override { - return sisl::blob{uintptr_cast(const_cast< VolumeIndexKey* >(this)), uint32_cast(sizeof(VolumeIndexKey))}; + return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< VolumeIndexKey* >(this)), + static_cast< uint32_t >(sizeof(VolumeIndexKey))}; } uint32_t serialized_size() const override { return sizeof(VolumeIndexKey); } void deserialize(sisl::blob const& b, bool copy) override { assert(b.size() == sizeof(VolumeIndexKey)); - VolumeIndexKey const* other = r_cast< VolumeIndexKey const* >(b.cbytes()); + VolumeIndexKey const* other = reinterpret_cast< VolumeIndexKey const* >(b.cbytes()); m_lba = other->m_lba; } @@ -101,35 +102,35 @@ class VolumeIndexValue : public homestore::BtreeValue { private: #pragma pack(1) // Store blkid and checksum as the value. 
- BlkId m_blkid; + blk_id m_blkid; homestore::csum_t m_checksum; #pragma pack() public: - VolumeIndexValue(const BlkId& base_blkid, homestore::csum_t csum) : + VolumeIndexValue(const blk_id& base_blkid, homestore::csum_t csum) : homestore::BtreeValue(), m_blkid(base_blkid), m_checksum(csum) {} - VolumeIndexValue(const BlkId& base_blkid) : VolumeIndexValue(base_blkid, 0) {} + VolumeIndexValue(const blk_id& base_blkid) : VolumeIndexValue(base_blkid, 0) {} VolumeIndexValue() = default; VolumeIndexValue(const VolumeIndexValue& other) : homestore::BtreeValue(), m_blkid(other.m_blkid), m_checksum(other.m_checksum) {} VolumeIndexValue(const sisl::blob& b, bool copy) : homestore::BtreeValue() { this->deserialize(b, copy); } virtual ~VolumeIndexValue() = default; - homestore::BlkId blkid() const { return m_blkid; } + homestore::blk_id blkid() const { return m_blkid; } homestore::csum_t checksum() const { return m_checksum; } ///////////////////////////// Overriding methods of BtreeValue ////////////////////////// VolumeIndexValue& operator=(const VolumeIndexValue& other) = default; sisl::blob serialize() const override { - sisl::blob b{r_cast< uint8_t const* >(this), sizeof(VolumeIndexValue)}; + sisl::blob b{reinterpret_cast< uint8_t const* >(this), sizeof(VolumeIndexValue)}; return b; } uint32_t serialized_size() const override { return sizeof(VolumeIndexValue); } static uint32_t get_fixed_size() { return sizeof(VolumeIndexValue); } void deserialize(const sisl::blob& b, bool) { - VolumeIndexValue const* other = r_cast< VolumeIndexValue const* >(b.cbytes()); + VolumeIndexValue const* other = reinterpret_cast< VolumeIndexValue const* >(b.cbytes()); m_blkid = other->m_blkid; m_checksum = other->m_checksum; } @@ -146,7 +147,7 @@ class VolumeIndexValue : public homestore::BtreeValue { uint32_t offset; char dummy; is >> base_val >> dummy >> offset; - v = VolumeIndexValue{BlkId{}}; + v = VolumeIndexValue{blk_id{}}; return is; } @@ -154,4 +155,4 @@ class VolumeIndexValue : public 
homestore::BtreeValue { return ((m_blkid == other.m_blkid) && (m_checksum == other.m_checksum)); } }; -} // namespace homeblocks \ No newline at end of file +} // namespace homeblocks diff --git a/src/lib/volume/index_fixed_table.hpp b/src/lib/volume/index_fixed_table.hpp index 34e3f27..ef6a1f3 100644 --- a/src/lib/volume/index_fixed_table.hpp +++ b/src/lib/volume/index_fixed_table.hpp @@ -22,13 +22,12 @@ class VolumeIndexTable { std::shared_ptr< hs_index_table_t > index_table() { return hs_index_table_; } - VolumeManager::Result< folly::Unit > write_to_index(lba_t start_lba, lba_t end_lba, - std::unordered_map< lba_t, BlockInfo >& blocks_info) { + status write_to_index(lba_t start_lba, lba_t end_lba, std::unordered_map< lba_t, BlockInfo >& blocks_info) { // Use filter callback to get the old blkid. homestore::put_filter_cb_t filter_cb = [&blocks_info](BtreeKey const& key, BtreeValue const& existing_value, BtreeValue const& value) { - auto lba = r_cast< const VolumeIndexKey& >(key).lba(); - auto& existing_value_vol_idx = r_cast< const VolumeIndexValue& >(existing_value); + auto lba = reinterpret_cast< const VolumeIndexKey& >(key).lba(); + auto& existing_value_vol_idx = reinterpret_cast< const VolumeIndexValue& >(existing_value); blocks_info[lba].old_blkid = existing_value_vol_idx.blkid(); blocks_info[lba].old_checksum = existing_value_vol_idx.checksum(); return homestore::put_filter_decision::replace; @@ -52,7 +51,7 @@ class VolumeIndexTable { // restore the blkid at this lba to the old value LOGINFO("vol_index_partial_put_failure flip is set, aborting"); value = VolumeIndexValue{blocks_info[lba].old_blkid, blocks_info[lba].old_checksum}; - blocks_info[lba].old_blkid = homestore::BlkId{}; + blocks_info[lba].old_blkid = homestore::blk_id{}; blocks_info[lba].old_checksum = 0; auto req1 = homestore::BtreeSinglePutRequest{&key, &value, homestore::btree_put_type::UPSERT}; if (auto restore_lba_result = hs_index_table_->put(req1); @@ -67,19 +66,19 @@ class 
VolumeIndexTable { LOGERROR("Failed to put to index {}, error={}", lba, result); // rollback the lbas for which we have already written to the index table rollback_write(start_lba, lba - 1, blocks_info); - return std::unexpected(VolumeError::INDEX_ERROR); + return std::unexpected(volume_error::INDEX_ERROR); } } - return folly::Unit(); + return ok(); } - VolumeManager::NullResult read_from_index(const vol_interface_req_ptr& req, index_kv_list_t& index_kvs) { + status read_from_index(lba_t start_lba, lba_t end_lba, index_kv_list_t& index_kvs) { homestore::BtreeQueryRequest< VolumeIndexKey > qreq{ - homestore::BtreeKeyRange< VolumeIndexKey >{VolumeIndexKey{req->lba}, VolumeIndexKey{req->end_lba()}}, + homestore::BtreeKeyRange< VolumeIndexKey >{VolumeIndexKey{start_lba}, VolumeIndexKey{end_lba}}, homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY}; if (auto ret = hs_index_table_->query(qreq, index_kvs); ret != homestore::btree_status_t::success) { - return std::unexpected(VolumeError::INDEX_ERROR); + return std::unexpected(volume_error::INDEX_ERROR); } return {}; } @@ -104,9 +103,11 @@ class VolumeIndexTable { } } - void destroy() { + sisl::async::task< void > destroy() { homestore::hs()->index_service().remove_index_table(hs_index_table_); - hs_index_table_->destroy(); + // IndexTable::destroy() is a coroutine (co_awaits a forced CP flush); co_await it so the caller's + // reactor yields rather than blocking the flush. 
+ co_await hs_index_table_->destroy(); } }; diff --git a/src/lib/volume/index_prefix_kv.hpp b/src/lib/volume/index_prefix_kv.hpp index a3bd279..ca83f83 100644 --- a/src/lib/volume/index_prefix_kv.hpp +++ b/src/lib/volume/index_prefix_kv.hpp @@ -18,10 +18,10 @@ #include #include #include -#include +#include #include -using homestore::BlkId; +using homestore::blk_id; using homestore::BtreeKey; using homestore::BtreeValue; @@ -30,8 +30,8 @@ using lba_t = std::uint64_t; struct BlockInfo { // Checksum calculated on new data and written to new_blkid. - homestore::BlkId new_blkid; - homestore::BlkId old_blkid; + homestore::blk_id new_blkid; + homestore::blk_id old_blkid; homestore::csum_t new_checksum; }; @@ -51,15 +51,15 @@ class VolumeIndexKey : public homestore::BtreeIntervalKey { public: VolumeIndexKey() = default; VolumeIndexKey(lba_t k) { - m_lba_base = uint32_cast(k >> 32); - m_lba_offset = uint32_cast(k & 0xFFFFFFFF); + m_lba_base = static_cast< uint32_t >(k >> 32); + m_lba_offset = static_cast< uint32_t >(k & 0xFFFFFFFF); } VolumeIndexKey(uint32_t b, uint32_t o) : m_lba_base{b}, m_lba_offset{o} {} VolumeIndexKey(const VolumeIndexKey& other) = default; VolumeIndexKey(const BtreeKey& other) : VolumeIndexKey(other.serialize(), true) {} VolumeIndexKey(const sisl::blob& b, bool copy) : homestore::BtreeIntervalKey() { - VolumeIndexKey const* other = r_cast< VolumeIndexKey const* >(b.cbytes()); + VolumeIndexKey const* other = reinterpret_cast< VolumeIndexKey const* >(b.cbytes()); m_lba_base = other->m_lba_base; m_lba_offset = other->m_lba_offset; } @@ -73,7 +73,7 @@ class VolumeIndexKey : public homestore::BtreeIntervalKey { /////////////////// Overriding methods of BtreeKey ///////////////// int compare(homestore::BtreeKey const& o) const override { - VolumeIndexKey const& other = s_cast< VolumeIndexKey const& >(o); + VolumeIndexKey const& other = static_cast< VolumeIndexKey const& >(o); if (m_lba_base < other.m_lba_base) { return -1; } else if (m_lba_base > 
other.m_lba_base) { @@ -88,14 +88,15 @@ class VolumeIndexKey : public homestore::BtreeIntervalKey { } sisl::blob serialize() const override { - return sisl::blob{uintptr_cast(const_cast< VolumeIndexKey* >(this)), uint32_cast(sizeof(VolumeIndexKey))}; + return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< VolumeIndexKey* >(this)), + static_cast< uint32_t >(sizeof(VolumeIndexKey))}; } uint32_t serialized_size() const override { return sizeof(VolumeIndexKey); } void deserialize(sisl::blob const& b, bool copy) override { assert(b.size() == sizeof(VolumeIndexKey)); - VolumeIndexKey const* other = r_cast< VolumeIndexKey const* >(b.cbytes()); + VolumeIndexKey const* other = reinterpret_cast< VolumeIndexKey const* >(b.cbytes()); m_lba_base = other->m_lba_base; m_lba_offset = other->m_lba_offset; } @@ -116,7 +117,7 @@ class VolumeIndexKey : public homestore::BtreeIntervalKey { } int distance(BtreeKey const& f) const override { - VolumeIndexKey const& from = s_cast< VolumeIndexKey const& >(f); + VolumeIndexKey const& from = static_cast< VolumeIndexKey const& >(f); DEBUG_ASSERT_EQ(m_lba_base, from.m_lba_base, "Invalid from key for distance"); DEBUG_ASSERT_GE(m_lba_offset, from.m_lba_offset, "Invalid from key for distance"); return m_lba_offset - from.m_lba_offset; @@ -125,24 +126,26 @@ class VolumeIndexKey : public homestore::BtreeIntervalKey { bool is_interval_key() const override { return true; } sisl::blob serialize_prefix() const override { - return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_lba_base)), uint32_cast(sizeof(uint32_t))}; + return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint32_t* >(&m_lba_base)), + static_cast< uint32_t >(sizeof(uint32_t))}; } sisl::blob serialize_suffix() const override { - return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_lba_offset)), uint32_cast(sizeof(uint32_t))}; + return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint32_t* >(&m_lba_offset)), + static_cast< uint32_t >(sizeof(uint32_t))}; } - 
uint32_t serialized_prefix_size() const override { return uint32_cast(sizeof(uint32_t)); } + uint32_t serialized_prefix_size() const override { return static_cast< uint32_t >(sizeof(uint32_t)); } - uint32_t serialized_suffix_size() const override { return uint32_cast(sizeof(uint32_t)); }; + uint32_t serialized_suffix_size() const override { return static_cast< uint32_t >(sizeof(uint32_t)); }; void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) { DEBUG_ASSERT_EQ(prefix.size(), sizeof(uint32_t), "Invalid prefix size on deserialize"); DEBUG_ASSERT_EQ(suffix.size(), sizeof(uint32_t), "Invalid suffix size on deserialize"); - uint32_t const* other_p = r_cast< uint32_t const* >(prefix.cbytes()); + uint32_t const* other_p = reinterpret_cast< uint32_t const* >(prefix.cbytes()); m_lba_base = *other_p; - uint32_t const* other_s = r_cast< uint32_t const* >(suffix.cbytes()); + uint32_t const* other_s = reinterpret_cast< uint32_t const* >(suffix.cbytes()); m_lba_offset = *other_s; } @@ -150,7 +153,7 @@ class VolumeIndexKey : public homestore::BtreeIntervalKey { bool operator<(const VolumeIndexKey& o) const { return (compare(o) < 0); } bool operator==(const VolumeIndexKey& other) const { return (compare(other) == 0); } - lba_t key() const { return (uint64_cast(m_lba_base) << 32) | m_lba_offset; } + lba_t key() const { return (static_cast< uint64_t >(m_lba_base) << 32) | m_lba_offset; } lba_t lba() const { return key(); } lba_t start_key(const homestore::BtreeKeyRange< VolumeIndexKey >& range) const { const VolumeIndexKey& k = (const VolumeIndexKey&)(range.start_key()); @@ -180,7 +183,7 @@ class VolumeIndexKey : public homestore::BtreeIntervalKey { class VolumeIndexValue : public homestore::BtreeIntervalValue { private: #pragma pack(1) - // Store blkid and checksum as the value. Most significant 32 bits of BlkId contains chunk_num + // Store blkid and checksum as the value. 
Most significant 32 bits of blk_id contains chunk_num // and num_blks which is same in a single blkid and used as the prefix. Checksum and least significant 32 bits which // contains blk num are unique and used as suffix. Ignore the multiblkid bit. uint32_t m_blkid_prefix; @@ -189,12 +192,12 @@ class VolumeIndexValue : public homestore::BtreeIntervalValue { #pragma pack() public: - VolumeIndexValue(const BlkId& base_blkid, homestore::csum_t csum) : homestore::BtreeIntervalValue() { - m_blkid_suffix = uint32_cast(base_blkid.to_integer() & 0xFFFFFFFF) >> 1; - m_blkid_prefix = uint32_cast(base_blkid.to_integer() >> 32); + VolumeIndexValue(const blk_id& base_blkid, homestore::csum_t csum) : homestore::BtreeIntervalValue() { + m_blkid_suffix = static_cast< uint32_t >(base_blkid.to_integer() & 0xFFFFFFFF) >> 1; + m_blkid_prefix = static_cast< uint32_t >(base_blkid.to_integer() >> 32); m_checksum = csum; } - VolumeIndexValue(const BlkId& base_blkid) : VolumeIndexValue(base_blkid, 0) {} + VolumeIndexValue(const blk_id& base_blkid) : VolumeIndexValue(base_blkid, 0) {} VolumeIndexValue() = default; VolumeIndexValue(const VolumeIndexValue& other) : homestore::BtreeIntervalValue(), @@ -204,11 +207,11 @@ class VolumeIndexValue : public homestore::BtreeIntervalValue { VolumeIndexValue(const sisl::blob& b, bool copy) : homestore::BtreeIntervalValue() { this->deserialize(b, copy); } virtual ~VolumeIndexValue() = default; - homestore::BlkId blkid() const { + homestore::blk_id blkid() const { homestore::blk_num_t blk_num = m_blkid_suffix; homestore::chunk_num_t chunk_num = m_blkid_prefix >> 16; homestore::blk_count_t nblks = m_blkid_prefix & 0xFFFF; - return BlkId{blk_num, nblks, chunk_num}; + return blk_id{blk_num, nblks, chunk_num}; } homestore::csum_t checksum() const { return m_checksum; } @@ -216,14 +219,14 @@ class VolumeIndexValue : public homestore::BtreeIntervalValue { ///////////////////////////// Overriding methods of BtreeValue ////////////////////////// 
VolumeIndexValue& operator=(const VolumeIndexValue& other) = default; sisl::blob serialize() const override { - sisl::blob b{r_cast< uint8_t const* >(this), sizeof(VolumeIndexValue)}; + sisl::blob b{reinterpret_cast< uint8_t const* >(this), sizeof(VolumeIndexValue)}; return b; } uint32_t serialized_size() const override { return sizeof(VolumeIndexValue); } static uint32_t get_fixed_size() { return sizeof(VolumeIndexValue); } void deserialize(const sisl::blob& b, bool) { - VolumeIndexValue const* other = r_cast< VolumeIndexValue const* >(b.cbytes()); + VolumeIndexValue const* other = reinterpret_cast< VolumeIndexValue const* >(b.cbytes()); m_blkid_prefix = other->m_blkid_prefix; m_blkid_suffix = other->m_blkid_suffix; m_checksum = other->m_checksum; @@ -241,13 +244,13 @@ class VolumeIndexValue : public homestore::BtreeIntervalValue { uint32_t offset; char dummy; is >> base_val >> dummy >> offset; - v = VolumeIndexValue{BlkId{}}; + v = VolumeIndexValue{blk_id{}}; return is; } ///////////////////////////// Overriding methods of BtreeIntervalValue ////////////////////////// void shift(int n, void* app_ctx) override { - auto ctx = r_cast< IndexValueContext* >(app_ctx); + auto ctx = reinterpret_cast< IndexValueContext* >(app_ctx); DEBUG_ASSERT(ctx, "Context null"); // Get the next blk num and checksum @@ -258,25 +261,26 @@ class VolumeIndexValue : public homestore::BtreeIntervalValue { } sisl::blob serialize_prefix() const override { - return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_blkid_prefix)), uint32_cast(sizeof(uint32_t))}; + return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint32_t* >(&m_blkid_prefix)), + static_cast< uint32_t >(sizeof(uint32_t))}; } sisl::blob serialize_suffix() const override { // Include both m_blkid_suffix and checksum in the suffix. 
- return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_blkid_suffix)), - uint32_cast(sizeof(uint32_t) + sizeof(homestore::csum_t))}; + return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint32_t* >(&m_blkid_suffix)), + static_cast< uint32_t >(sizeof(uint32_t) + sizeof(homestore::csum_t))}; } - uint32_t serialized_prefix_size() const override { return uint32_cast(sizeof(uint32_t)); } + uint32_t serialized_prefix_size() const override { return static_cast< uint32_t >(sizeof(uint32_t)); } uint32_t serialized_suffix_size() const override { - return uint32_cast(sizeof(uint32_t) + sizeof(homestore::csum_t)); + return static_cast< uint32_t >(sizeof(uint32_t) + sizeof(homestore::csum_t)); } void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) override { DEBUG_ASSERT_EQ(prefix.size(), sizeof(uint32_t), "Invalid prefix size on deserialize"); DEBUG_ASSERT_EQ(suffix.size(), sizeof(uint32_t) + sizeof(homestore::csum_t), "Invalid suffix size on deserialize"); - m_blkid_prefix = *(r_cast< uint32_t const* >(prefix.cbytes())); - m_blkid_suffix = *(r_cast< uint32_t const* >(suffix.cbytes())); - m_checksum = *(r_cast< homestore::csum_t const* >(suffix.cbytes() + sizeof(uint32_t))); + m_blkid_prefix = *(reinterpret_cast< uint32_t const* >(prefix.cbytes())); + m_blkid_suffix = *(reinterpret_cast< uint32_t const* >(suffix.cbytes())); + m_checksum = *(reinterpret_cast< homestore::csum_t const* >(suffix.cbytes() + sizeof(uint32_t))); } bool operator==(VolumeIndexValue const& other) const { @@ -284,4 +288,4 @@ class VolumeIndexValue : public homestore::BtreeIntervalValue { (m_checksum == other.m_checksum)); } }; -} // namespace homeblocks \ No newline at end of file +} // namespace homeblocks diff --git a/src/lib/volume/index_prefix_table.hpp b/src/lib/volume/index_prefix_table.hpp index a5c8087..9165fd0 100644 --- a/src/lib/volume/index_prefix_table.hpp +++ b/src/lib/volume/index_prefix_table.hpp @@ -22,51 +22,52 @@ class VolumeIndexTable { 
std::shared_ptr< hs_index_table_t > index_table() { return hs_index_table_; } - VolumeManager::Result< folly::Unit > write_to_index(lba_t start_lba, lba_t end_lba, - std::unordered_map< lba_t, BlockInfo >& blocks_info) { + status write_to_index(lba_t start_lba, lba_t end_lba, std::unordered_map< lba_t, BlockInfo >& blocks_info) { // Use filter callback to get the old blkid. homestore::put_filter_cb_t filter_cb = [&blocks_info](BtreeKey const& key, BtreeValue const& existing_value, BtreeValue const& value) { - auto lba = r_cast< const VolumeIndexKey& >(key).key(); - blocks_info[lba].old_blkid = r_cast< const VolumeIndexValue& >(existing_value).blkid(); + auto lba = reinterpret_cast< const VolumeIndexKey& >(key).key(); + blocks_info[lba].old_blkid = reinterpret_cast< const VolumeIndexValue& >(existing_value).blkid(); return homestore::put_filter_decision::replace; }; // Write to prefix btree with key ranging from start_lba to end_lba. // For value shift() will get the blk_num and checksum for each lba. 
IndexValueContext app_ctx{&blocks_info, start_lba}; - const BlkId& start_blkid = blocks_info[start_lba].new_blkid; + const blk_id& start_blkid = blocks_info[start_lba].new_blkid; VolumeIndexValue value{start_blkid, blocks_info[start_lba].new_checksum}; auto req = homestore::BtreeRangePutRequest< VolumeIndexKey >{ homestore::BtreeKeyRange< VolumeIndexKey >{VolumeIndexKey{start_lba}, true, VolumeIndexKey{end_lba}, true}, homestore::btree_put_type::UPSERT, - r_cast< VolumeIndexValue* >(&value), - r_cast< void* >(&app_ctx), + reinterpret_cast< VolumeIndexValue* >(&value), + reinterpret_cast< void* >(&app_ctx), std::numeric_limits< uint32_t >::max() /* batch_size */, filter_cb}; auto result = hs_index_table_->put(req); if (result != homestore::btree_status_t::success) { LOGERROR("Failed to put to index range=({},{}) error={}", start_lba, end_lba, result); - return std::unexpected(VolumeError::INDEX_ERROR); + return std::unexpected(volume_error::INDEX_ERROR); } - return folly::Unit(); + return ok(); } - VolumeManager::Result< folly::Unit > read_from_index(const vol_interface_req_ptr& req, index_kv_list_t& index_kvs) { + status read_from_index(lba_t start_lba, lba_t end_lba, index_kv_list_t& index_kvs) { homestore::BtreeQueryRequest< VolumeIndexKey > qreq{ - homestore::BtreeKeyRange< VolumeIndexKey >{VolumeIndexKey{req->lba}, VolumeIndexKey{req->end_lba()}}, + homestore::BtreeKeyRange< VolumeIndexKey >{VolumeIndexKey{start_lba}, VolumeIndexKey{end_lba}}, homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY}; if (auto ret = hs_index_table_->query(qreq, index_kvs); ret != homestore::btree_status_t::success) { - return std::unexpected(VolumeError::INDEX_ERROR); + return std::unexpected(volume_error::INDEX_ERROR); } - return folly::Unit(); + return ok(); } - void destroy() { + sisl::async::task< void > destroy() { homestore::hs()->index_service().remove_index_table(hs_index_table_); - hs_index_table_->destroy(); + // IndexTable::destroy() is a coroutine (co_awaits 
a forced CP flush); co_await it so the caller's + // reactor yields rather than blocking the flush. + co_await hs_index_table_->destroy(); } }; diff --git a/src/lib/volume/io_req.hpp b/src/lib/volume/io_req.hpp new file mode 100644 index 0000000..b84f23d --- /dev/null +++ b/src/lib/volume/io_req.hpp @@ -0,0 +1,43 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +#include + +#include // sisl::Clock + +#include "hb_internal.hpp" + +namespace homeblocks { + +using volume_info_ptr = shared< volume_info >; + +// Plain by-value IO request. The free async_read/async_write functions build one on their coroutine frame and +// pass it by reference into volume::read/write -- the frame already provides storage that outlives the awaits, +// so (unlike the pre-coroutine design) there is no heap allocation or refcount here. The volume's in-flight +// count is tracked separately by a vol_io_guard (see volume.hpp). 
+struct io_req { + uint8_t* buffer{nullptr}; + lba_t lba{0}; + lba_count_t nlbas{0}; + sisl::Clock::time_point io_start_time{}; + sisl::Clock::time_point data_svc_start_time{}; + sisl::Clock::time_point index_start_time{}; + sisl::Clock::time_point journal_start_time{}; + + lba_t end_lba() const { return lba + nlbas - 1; } +}; + +} // namespace homeblocks diff --git a/src/lib/volume/tests/CMakeLists.txt b/src/lib/volume/tests/CMakeLists.txt index 5719747..5163b8a 100644 --- a/src/lib/volume/tests/CMakeLists.txt +++ b/src/lib/volume/tests/CMakeLists.txt @@ -6,7 +6,7 @@ target_sources(test_volume PRIVATE ) target_link_libraries(test_volume - ${PROJECT_NAME}_volume + ${PROJECT_NAME} ${COMMON_TEST_DEPS} -rdynamic ) @@ -17,7 +17,7 @@ target_sources(test_volume_io PRIVATE ) target_link_libraries(test_volume_io - ${PROJECT_NAME}_volume + ${PROJECT_NAME} ${COMMON_TEST_DEPS} -rdynamic ) @@ -27,7 +27,7 @@ target_sources(test_volume_chunk_selector PRIVATE test_volume_chunk_selector.cpp ) target_link_libraries(test_volume_chunk_selector - ${PROJECT_NAME}_volume + ${PROJECT_NAME} ${COMMON_TEST_DEPS} -rdynamic ) diff --git a/src/lib/volume/tests/test_common.hpp b/src/lib/volume/tests/test_common.hpp index 0588b5a..2c57318 100644 --- a/src/lib/volume/tests/test_common.hpp +++ b/src/lib/volume/tests/test_common.hpp @@ -13,26 +13,28 @@ * *********************************************************************************/ /* - * HomeBlocks Testing Binaries shared common define, apis and data structure; + * home_blocks Testing Binaries shared common define, apis and data structure; * */ #pragma once #include +#include #include #include #include #include #include #include +#include #include -#include #include #include #include #include #include #include -#include +#include "hb_internal.hpp" #include "lib/homeblks_impl.hpp" +#include "coro_helpers.hpp" SISL_OPTION_GROUP( test_common_setup, @@ -49,7 +51,6 @@ SISL_OPTION_GROUP( (qdepth, "", "qdepth", "Max outstanding operations", 
::cxxopts::value< uint32_t >()->default_value("8"), "number"), (num_io_reactors, "", "num_io_reactors", "number of IO reactors", ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (spdk, "", "spdk", "spdk", ::cxxopts::value< bool >()->default_value("false"), "true or false"), (flip_list, "", "flip_list", "btree flip list", ::cxxopts::value< std::vector< std::string > >(), "flips [...]"), (use_file, "", "use_file", "use file instead of real drive", ::cxxopts::value< bool >()->default_value("false"), "true or false"), @@ -65,19 +66,16 @@ using namespace homeblocks; class test_http_server { public: - void get_prometheus_metrics(const Pistache::Rest::Request&, Pistache::Http::ResponseWriter response) { - response.send(Pistache::Http::Code::Ok, - sisl::MetricsFarm::getInstance().report(sisl::ReportFormat::kTextFormat)); - } - + // sisl's HttpServer moved from Pistache to httplib (handlers are httplib::Request/Response). void start() { auto http_server_ptr = ioenvironment.get_http_server(); - - std::vector< iomgr::http_route > routes = { - {Pistache::Http::Method::Get, "/metrics", - Pistache::Rest::Routes::bind(&test_http_server::get_prometheus_metrics, this), iomgr::url_t::safe}}; try { - http_server_ptr->setup_routes(routes); + http_server_ptr->setup_routes( + {{sisl::http_method::Get, "/metrics", + [](httplib::Request const&, httplib::Response& res) { + res.set_content(sisl::MetricsFarm::getInstance().report(sisl::TEXT_FORMAT), "text/plain"); + }, + sisl::url_type::safe}}); LOGINFO("Started http server "); } catch (std::runtime_error const& e) { LOGERROR("setup routes failed, {}", e.what()) } @@ -89,7 +87,8 @@ class test_http_server { namespace test_common { struct io_fiber_pool { - std::vector< iomgr::io_fiber_t > io_fibers_; + // iomgr v13 exposes reactors (IOReactor*) rather than fibers (io_fiber_t); run_on_forget takes an IOReactor*. 
+ std::vector< iomgr::IOReactor* > io_fibers_; io_fiber_pool(uint32_t num_io_reactors) { struct Context { std::condition_variable cv; @@ -98,12 +97,12 @@ struct io_fiber_pool { }; auto ctx = std::make_shared< Context >(); for (uint32_t i{0}; i < num_io_reactors; ++i) { - iomanager.create_reactor("homeblks_long_running_io" + std::to_string(i), iomgr::INTERRUPT_LOOP, 1u, + iomanager.create_reactor("homeblks_long_running_io" + std::to_string(i), iomgr::INTERRUPT_LOOP, [this, ctx](bool is_started) { if (is_started) { { std::unique_lock< std::mutex > lk{ctx->mtx}; - io_fibers_.push_back(iomanager.iofiber_self()); + io_fibers_.push_back(iomanager.this_reactor()); ++(ctx->thread_cnt); } ctx->cv.notify_one(); @@ -128,7 +127,7 @@ struct Runner { std::atomic< uint64_t > issued_tasks_{0}; std::atomic< uint64_t > completed_tasks_{0}; std::function< void(void) > task_; - folly::Promise< folly::Unit > comp_promise_; + std::promise< void > comp_promise_; std::shared_ptr< io_fiber_pool > io_fiber_pool_; Runner(uint64_t num_tasks, uint32_t qd = 8, std::shared_ptr< io_fiber_pool > const& g_io_fiber_pool = nullptr) : @@ -143,15 +142,15 @@ struct Runner { void set_task(std::function< void(void) > f) { issued_tasks_.store(0); completed_tasks_.store(0); - comp_promise_ = folly::Promise< folly::Unit >{}; + comp_promise_ = std::promise< void >{}; task_ = std::move(f); } - folly::Future< folly::Unit > execute() { + std::future< void > execute() { for (uint32_t i{0}; i < qdepth_; ++i) { run_task(); } - return comp_promise_.getFuture(); + return comp_promise_.get_future(); } void next_task() { @@ -159,7 +158,7 @@ struct Runner { if ((issued_tasks_.load() < total_tasks_)) { run_task(); } else if ((ctasks + 1) == total_tasks_) { - comp_promise_.setValue(); + comp_promise_.set_value(); } } @@ -180,56 +179,45 @@ struct Runner { struct Waiter { std::atomic< uint64_t > expected_comp{0}; std::atomic< uint64_t > actual_comp{0}; - folly::Promise< folly::Unit > comp_promise; + std::promise< void > 
comp_promise; Waiter(uint64_t num_op) : expected_comp{num_op} {} Waiter() : Waiter{SISL_OPTIONS["num_io"].as< uint64_t >()} {} Waiter(const Waiter&) = delete; Waiter& operator=(const Waiter&) = delete; - folly::Future< folly::Unit > start(std::function< void(void) > f) { + std::future< void > start(std::function< void(void) > f) { f(); - return comp_promise.getFuture(); + return comp_promise.get_future(); } void one_complete() { - if ((actual_comp.fetch_add(1) + 1) >= expected_comp.load()) { comp_promise.setValue(); } + if ((actual_comp.fetch_add(1) + 1) >= expected_comp.load()) { comp_promise.set_value(); } } }; -class HBTestHelper { - class HBTestApplication : public homeblocks::HomeBlocksApplication { - private: - HBTestHelper& helper_; - - public: - HBTestApplication(HBTestHelper& h) : helper_{h} {} - virtual ~HBTestApplication() = default; - - // implement all the virtual functions in HomeObjectApplication - bool spdk_mode() const override { - // return SISL_OPTIONS["spdk"].as< bool >(); - return false; - } - uint32_t threads() const override { return SISL_OPTIONS["num_threads"].as< uint32_t >(); } - - std::list< device_info_t > devices() const override { - std::list< device_info_t > devs; - for (const auto& dev : helper_.dev_list()) { - devs.emplace_back(dev); - } - return devs; - } - - std::optional< peer_id_t > discover_svc_id(std::optional< peer_id_t > const&) const override { - return helper_.svc_id(); - } +// Block until every future is satisfied. Uses wait() rather than get() so the same vector can be awaited more than once +// and never throws on a value-only (void) completion. +inline void wait_all(std::vector< std::future< void > >& futs) { + for (auto& f : futs) { + if (f.valid()) { f.wait(); } + } +} - uint64_t app_mem_size() const override { - // return SISL_OPTIONS["app_mem_size"].as< uint64_t >(); - return 20; +class HBTestHelper { + // Build the bring-up config from test options. 
on_svc_id is a lazy coroutine that hands homeblocks our + // (cold-boot) svc id; it only fires on first boot -- on restart the id is recovered from the superblock. + homeblocks::home_blocks_config make_config() { + homeblocks::home_blocks_config cfg; + cfg.threads = SISL_OPTIONS["num_threads"].as< uint32_t >(); + cfg.app_mem_size_mb = 20 * 1024; // 20 GiB + for (const auto& dev : dev_list()) { + cfg.devices.emplace_back(dev); } - }; + auto id = svc_id(); + cfg.on_svc_id = [id]() -> homeblocks::async_result< homeblocks::peer_id_t > { co_return id; }; + return cfg; + } public: HBTestHelper(std::string const& name, std::vector< std::string > const& args, char** argv) : @@ -245,23 +233,26 @@ class HBTestHelper { // init device list init_dev_list(true /*init_device*/); - LOGINFO("Starting HomeBlocks"); + LOGINFO("Starting home_blocks"); // homeblocks::HomeBlocksImpl::_hs_chunk_size = SISL_OPTIONS["hs_chunk_size_mb"].as< uint64_t >() * Mi; // set_min_chunk_size(4 * Mi); - app_ = std::make_shared< HBTestApplication >(*this); - hb_ = init_homeblocks(std::weak_ptr< HBTestApplication >(app_)); + auto hb = init_homeblocks(make_config()); + RELEASE_ASSERT(hb.has_value(), "init_homeblocks failed"); + hb_ = hb.value(); } void restart(uint64_t delay_secs = 0) { - LOGINFO("Restart HomeBlocks"); + LOGINFO("Restart home_blocks"); hb_->shutdown(); hb_.reset(); - LOGINFO("Start HomeBlocks after {} secs", delay_secs); + LOGINFO("Start home_blocks after {} secs", delay_secs); if (delay_secs > 0) { std::this_thread::sleep_for(std::chrono::seconds(delay_secs)); } - hb_ = init_homeblocks(std::weak_ptr< HBTestApplication >(app_)); + auto hb = init_homeblocks(make_config()); + RELEASE_ASSERT(hb.has_value(), "init_homeblocks failed on restart"); + hb_ = hb.value(); } - shared< homeblocks::HomeBlocks > inst() { return hb_; } + shared< homeblocks::home_blocks > inst() { return hb_; } void teardown() { LOGINFO("Tearing down test."); @@ -276,14 +267,14 @@ class HBTestHelper { Waiter& waiter() { 
return waiter_; } static void fill_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { - uint64_t* ptr = r_cast< uint64_t* >(buf); + uint64_t* ptr = reinterpret_cast< uint64_t* >(buf); for (uint64_t i = 0ul; i < size / sizeof(uint64_t); ++i) { *(ptr + i) = (pattern == 0) ? i : pattern; } } static void validate_data_buf(uint8_t const* buf, uint64_t size, uint64_t pattern = 0) { - uint64_t const* ptr = r_cast< uint64_t const* >(buf); + uint64_t const* ptr = reinterpret_cast< uint64_t const* >(buf); for (uint64_t i = 0ul; i < size / sizeof(uint64_t); ++i) { RELEASE_ASSERT_EQ(ptr[i], ((pattern == 0) ? i : pattern), "data_buf mismatch at offset={}", i); } @@ -396,8 +387,7 @@ class HBTestHelper { std::vector< std::string > args_; char** argv_; std::vector< std::string > dev_list_; - shared< homeblocks::HomeBlocks > hb_; - shared< HBTestApplication > app_; + shared< homeblocks::home_blocks > hb_; peer_id_t svc_id_; Runner io_runner_; Waiter waiter_; diff --git a/src/lib/volume/tests/test_volume.cpp b/src/lib/volume/tests/test_volume.cpp index 243d7fc..0a0b901 100644 --- a/src/lib/volume/tests/test_volume.cpp +++ b/src/lib/volume/tests/test_volume.cpp @@ -15,13 +15,11 @@ #include -#include #include #include #include #include -#include -#include +#include "hb_internal.hpp" #include "test_common.hpp" SISL_LOGGING_INIT(HOMEBLOCKS_LOG_MODS) @@ -33,7 +31,7 @@ SISL_OPTION_GROUP(test_volume_setup, (shutdown_timer_nsecs, "", "shutdown_timer_nsecs", "shutdown timer in seconds", ::cxxopts::value< uint32_t >()->default_value("2"), "seconds")); -SISL_OPTIONS_ENABLE(logging, test_common_setup, test_volume_setup, homeblocks) +SISL_OPTIONS_ENABLE(logging, test_common_setup, test_volume_setup) SISL_LOGGING_DECL(test_volume) std::unique_ptr< test_common::HBTestHelper > g_helper; @@ -44,12 +42,12 @@ class VolumeTest : public ::testing::Test { public: void SetUp() override {} - VolumeInfo gen_vol_info(uint32_t vol_idx) { - VolumeInfo vol_info; + volume_info gen_vol_info(uint32_t 
vol_idx) { + volume_info vol_info; vol_info.name = "vol_" + std::to_string(vol_idx); vol_info.size_bytes = 1024 * 1024 * 1024; vol_info.page_size = 4096; - vol_info.id = hb_utils::gen_random_uuid(); + vol_info.id = boost::uuids::random_generator()(); return vol_info; } }; @@ -59,7 +57,7 @@ TEST_F(VolumeTest, ShutdownWithOutstandingRemoveVol) { std::vector< volume_id_t > vol_ids; { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; uint32_t delay_sec = 6; g_helper->set_delay_flip("vol_fake_io_delay_simulation", delay_sec * 1000 * 1000 /*delay_usec*/, 2, 100); @@ -70,20 +68,17 @@ TEST_F(VolumeTest, ShutdownWithOutstandingRemoveVol) { auto vinfo = gen_vol_info(i); auto id = vinfo.id; vol_ids.emplace_back(id); - auto ret = vol_mgr->create_volume(std::move(vinfo)).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->create_volume(std::move(vinfo))); ASSERT_TRUE(ret); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is there ASSERT_TRUE(vol_ptr != nullptr); - // fake a write that will be delayed; - vol_interface_req_ptr req1(new vol_interface_req{nullptr, 0, 0, vol_ptr}); - vol_mgr->write(vol_ptr, req1); - - // fake a read that will be delayed; - vol_interface_req_ptr req2(new vol_interface_req{nullptr, 0, 0, vol_ptr}); - vol_mgr->read(vol_ptr, req2); + // fake a write + read that will be delayed (the delay flip keeps them outstanding on the volume) + sisl::sg_list fake_sgs{.size = 0, .iovs = {iovec{nullptr, 0}}}; + homeblocks::detail::detach(homeblocks::async_write(vol_ptr, 0, fake_sgs)); + homeblocks::detail::detach(homeblocks::async_read(vol_ptr, 0, fake_sgs)); } auto const s = hb->get_stats(); @@ -92,7 +87,7 @@ TEST_F(VolumeTest, ShutdownWithOutstandingRemoveVol) { for (uint32_t i = 0; i < num_vols; ++i) { auto id = vol_ids[i]; - auto ret = vol_mgr->remove_volume(id).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->remove_volume(id)); 
ASSERT_TRUE(ret); } } @@ -106,7 +101,7 @@ TEST_F(VolumeTest, ShutdownWithOutstandingIO) { std::vector< volume_id_t > vol_ids; { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; uint32_t delay_sec = 6; g_helper->set_delay_flip("vol_fake_io_delay_simulation", delay_sec * 1000 * 1000 /*delay_usec*/, 2, 100); @@ -116,20 +111,17 @@ TEST_F(VolumeTest, ShutdownWithOutstandingIO) { auto vinfo = gen_vol_info(i); auto id = vinfo.id; vol_ids.emplace_back(id); - auto ret = vol_mgr->create_volume(std::move(vinfo)).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->create_volume(std::move(vinfo))); ASSERT_TRUE(ret); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is there ASSERT_TRUE(vol_ptr != nullptr); - // fake a write that will be delayed; - vol_interface_req_ptr req1(new vol_interface_req{nullptr, 0, 0, vol_ptr}); - vol_mgr->write(vol_ptr, req1); - - // fake a read that will be delayed; - vol_interface_req_ptr req2(new vol_interface_req{nullptr, 0, 0, vol_ptr}); - vol_mgr->read(vol_ptr, req2); + // fake a write + read that will be delayed (the delay flip keeps them outstanding on the volume) + sisl::sg_list fake_sgs{.size = 0, .iovs = {iovec{nullptr, 0}}}; + homeblocks::detail::detach(homeblocks::async_write(vol_ptr, 0, fake_sgs)); + homeblocks::detail::detach(homeblocks::async_read(vol_ptr, 0, fake_sgs)); } } @@ -143,7 +135,7 @@ TEST_F(VolumeTest, CreateDestroyVolumeWithOutstandingIO) { std::vector< volume_id_t > vol_ids; { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; uint32_t delay_sec = 6; g_helper->set_delay_flip("vol_fake_io_delay_simulation", delay_sec * 1000 * 1000 /*delay_usec*/, 2, 100); @@ -154,20 +146,17 @@ TEST_F(VolumeTest, CreateDestroyVolumeWithOutstandingIO) { auto vinfo = gen_vol_info(i); auto id = vinfo.id; vol_ids.emplace_back(id); - auto ret = vol_mgr->create_volume(std::move(vinfo)).get(); 
+ auto ret = homeblocks::detail::sync_get(vol_mgr->create_volume(std::move(vinfo))); ASSERT_TRUE(ret); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is there ASSERT_TRUE(vol_ptr != nullptr); - // fake a write that will be delayed; - vol_interface_req_ptr req1(new vol_interface_req{nullptr, 0, 0, vol_ptr}); - vol_mgr->write(vol_ptr, req1); - - // fake a read that will be delayed; - vol_interface_req_ptr req2(new vol_interface_req{nullptr, 0, 0, vol_ptr}); - vol_mgr->read(vol_ptr, req2); + // fake a write + read that will be delayed (the delay flip keeps them outstanding on the volume) + sisl::sg_list fake_sgs{.size = 0, .iovs = {iovec{nullptr, 0}}}; + homeblocks::detail::detach(homeblocks::async_write(vol_ptr, 0, fake_sgs)); + homeblocks::detail::detach(homeblocks::async_read(vol_ptr, 0, fake_sgs)); } auto const s = hb->get_stats(); @@ -176,15 +165,15 @@ TEST_F(VolumeTest, CreateDestroyVolumeWithOutstandingIO) { for (uint32_t i = 0; i < num_vols; ++i) { auto id = vol_ids[i]; - auto ret = vol_mgr->remove_volume(id).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->remove_volume(id)); ASSERT_TRUE(ret); while (true) { auto delay_secs = 1; - LOGINFO("Remove Volume {} triggered, waiting for {} seconds for IO to complete", + LOGINFO("Remove volume {} triggered, waiting for {} seconds for IO to complete", boost::uuids::to_string(id), delay_secs); // sleep for a while std::this_thread::sleep_for(std::chrono::milliseconds(delay_secs * 1000)); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); if (!vol_ptr) { break; } } } @@ -200,7 +189,7 @@ TEST_F(VolumeTest, CreateDestroyVolume) { std::vector< volume_id_t > vol_ids; { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; auto num_vols = SISL_OPTIONS["num_vols"].as< uint32_t >(); @@ -208,10 +197,10 @@ TEST_F(VolumeTest, CreateDestroyVolume) { auto 
vinfo = gen_vol_info(i); auto id = vinfo.id; vol_ids.emplace_back(id); - auto ret = vol_mgr->create_volume(std::move(vinfo)).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->create_volume(std::move(vinfo))); ASSERT_TRUE(ret); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is there ASSERT_TRUE(vol_ptr != nullptr); } @@ -222,11 +211,11 @@ TEST_F(VolumeTest, CreateDestroyVolume) { for (uint32_t i = 0; i < num_vols; ++i) { auto id = vol_ids[i]; - auto ret = vol_mgr->remove_volume(id).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->remove_volume(id)); ASSERT_TRUE(ret); // sleep for a while std::this_thread::sleep_for(std::chrono::milliseconds(2000)); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is not there ASSERT_TRUE(vol_ptr == nullptr); @@ -240,7 +229,7 @@ TEST_F(VolumeTest, CreateVolumeThenRecover) { std::vector< volume_id_t > vol_ids; { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; auto num_vols = SISL_OPTIONS["num_vols"].as< uint32_t >(); @@ -248,10 +237,10 @@ TEST_F(VolumeTest, CreateVolumeThenRecover) { auto vinfo = gen_vol_info(i); auto id = vinfo.id; vol_ids.emplace_back(id); - auto ret = vol_mgr->create_volume(std::move(vinfo)).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->create_volume(std::move(vinfo))); ASSERT_TRUE(ret); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is there ASSERT_TRUE(vol_ptr != nullptr); } @@ -262,10 +251,10 @@ TEST_F(VolumeTest, CreateVolumeThenRecover) { // verify the volumes are still there { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; for (const auto& id : vol_ids) { - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify 
the volume is there ASSERT_TRUE(vol_ptr != nullptr); } @@ -273,13 +262,13 @@ TEST_F(VolumeTest, CreateVolumeThenRecover) { { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; for (const auto& id : vol_ids) { - auto ret = vol_mgr->remove_volume(id).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->remove_volume(id)); ASSERT_TRUE(ret); // sleep for a while std::this_thread::sleep_for(std::chrono::milliseconds(2000)); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is not there ASSERT_TRUE(vol_ptr == nullptr); } @@ -293,7 +282,7 @@ TEST_F(VolumeTest, DestroyVolumeCrashRecovery) { std::vector< volume_id_t > vol_ids; { auto hb = g_helper->inst(); - auto vol_mgr = hb->volume_manager(); + auto vol_mgr = hb; auto num_vols = SISL_OPTIONS["num_vols"].as< uint32_t >(); @@ -301,17 +290,17 @@ TEST_F(VolumeTest, DestroyVolumeCrashRecovery) { auto vinfo = gen_vol_info(i); auto id = vinfo.id; vol_ids.emplace_back(id); - auto ret = vol_mgr->create_volume(std::move(vinfo)).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->create_volume(std::move(vinfo))); ASSERT_TRUE(ret); - auto vol_ptr = vol_mgr->lookup_volume(id); + auto vol_ptr = vol_mgr->get_volume(id).value_or(nullptr); // verify the volume is there ASSERT_TRUE(vol_ptr != nullptr); } for (uint32_t i = 0; i < num_vols; ++i) { auto id = vol_ids[i]; - auto ret = vol_mgr->remove_volume(id).get(); + auto ret = homeblocks::detail::sync_get(vol_mgr->remove_volume(id)); ASSERT_TRUE(ret); } } @@ -329,10 +318,8 @@ int main(int argc, char* argv[]) { } ::testing::InitGoogleTest(&parsed_argc, argv); - SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_common_setup, test_volume_setup, homeblocks); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_common_setup, test_volume_setup); spdlog::set_pattern("[%D %T%z] [%^%l%$] [%n] [%t] %v"); - parsed_argc = 1; - auto f = ::folly::Init(&parsed_argc, &argv, true); 
g_helper = std::make_unique< test_common::HBTestHelper >("test_volume", args, orig_argv); g_helper->setup(); diff --git a/src/lib/volume/tests/test_volume_chunk_selector.cpp b/src/lib/volume/tests/test_volume_chunk_selector.cpp index bebcf17..9312be9 100644 --- a/src/lib/volume/tests/test_volume_chunk_selector.cpp +++ b/src/lib/volume/tests/test_volume_chunk_selector.cpp @@ -1,12 +1,10 @@ #include #include -#include #include #include #include #include -#include -#include +#include "hb_internal.hpp" #include #include #include "test_common.hpp" @@ -16,7 +14,7 @@ SISL_OPTION_GROUP(test_volume_chunk_selector, (num_vols, "", "num_vols", "number of volumes", ::cxxopts::value< uint32_t >()->default_value("2"), "number")); -SISL_OPTIONS_ENABLE(logging, test_common_setup, test_volume_chunk_selector, homeblocks) +SISL_OPTIONS_ENABLE(logging, test_common_setup, test_volume_chunk_selector) SISL_LOGGING_DECL(test_volume_chunk_selector) using namespace homeblocks; @@ -237,14 +235,12 @@ TEST_F(ChunkSelectorTest, RecoverChunksTest) { int main(int argc, char* argv[]) { int parsed_argc = argc; ::testing::InitGoogleTest(&parsed_argc, argv); - SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_common_setup, test_volume_chunk_selector, homeblocks); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_common_setup, test_volume_chunk_selector); spdlog::set_pattern("[%D %T%z] [%^%l%$] [%n] [%t] %v"); sisl::logging::SetLogger("test_volume_chunk_selector"); sisl::logging::SetLogPattern("[%D %T%z] [%^%L%$] [%n] [%t] %v"); ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = 4}); - parsed_argc = 1; - auto f = ::folly::Init(&parsed_argc, &argv, true); auto ret = RUN_ALL_TESTS(); iomanager.stop(); return ret; diff --git a/src/lib/volume/tests/test_volume_io.cpp b/src/lib/volume/tests/test_volume_io.cpp index 63d6c01..b468878 100644 --- a/src/lib/volume/tests/test_volume_io.cpp +++ b/src/lib/volume/tests/test_volume_io.cpp @@ -16,13 +16,11 @@ #include #include #include -#include #include 
#include #include #include -#include -#include +#include "hb_internal.hpp" #include #include "test_common.hpp" @@ -56,7 +54,7 @@ SISL_OPTION_GROUP( (read_verify, "", "read_verify", "Read and verify all data in long running tests", ::cxxopts::value< bool >()->default_value("false"), "true or false")); -SISL_OPTIONS_ENABLE(logging, test_common_setup, test_volume_io_setup, homeblocks, config) +SISL_OPTIONS_ENABLE(logging, test_common_setup, test_volume_io_setup, config) SISL_LOGGING_DECL(test_volume_io) std::unique_ptr< test_common::HBTestHelper > g_helper; @@ -105,12 +103,12 @@ class VolumeIOImpl { } private: - VolumeInfo gen_vol_info(uint32_t vol_idx) { - VolumeInfo vol_info; + volume_info gen_vol_info(uint32_t vol_idx) { + volume_info vol_info; vol_info.name = "vol_" + std::to_string(vol_idx); vol_info.size_bytes = SISL_OPTIONS["vol_size_gb"].as< uint32_t >() * Gi; vol_info.page_size = g_page_size; - vol_info.id = hb_utils::gen_random_uuid(); + vol_info.id = boost::uuids::random_generator()(); return vol_info; } @@ -120,23 +118,23 @@ class VolumeIOImpl { m_vol_name = vinfo.name; m_vol_id = vinfo.id; - auto vol_mgr = g_helper->inst()->volume_manager(); - auto ret = vol_mgr->create_volume(std::move(vinfo)).get(); + auto vol_mgr = g_helper->inst(); + auto ret = homeblocks::detail::sync_get(vol_mgr->create_volume(std::move(vinfo))); ASSERT_TRUE(ret); - m_vol_ptr = vol_mgr->lookup_volume(m_vol_id); + m_vol_ptr = vol_mgr->get_volume(m_vol_id).value_or(nullptr); ASSERT_TRUE(m_vol_ptr != nullptr); } void remove_volume() { - auto vol_mgr = g_helper->inst()->volume_manager(); - auto ret = vol_mgr->remove_volume(m_vol_id).get(); + auto vol_mgr = g_helper->inst(); + auto ret = homeblocks::detail::sync_get(vol_mgr->remove_volume(m_vol_id)); ASSERT_TRUE(ret); } void reset() { - auto vol_mgr = g_helper->inst()->volume_manager(); - m_vol_ptr = vol_mgr->lookup_volume(m_vol_id); + auto vol_mgr = g_helper->inst(); + m_vol_ptr = vol_mgr->get_volume(m_vol_id).value_or(nullptr); 
ASSERT_TRUE(m_vol_ptr != nullptr); } @@ -197,26 +195,34 @@ class VolumeIOImpl { return data; } + // Build a single-iovec sg_list over `buf` of `nbytes`. + static sisl::sg_list make_sgs(uint8_t* buf, uint64_t nbytes) { + return sisl::sg_list{.size = nbytes, .iovs = {iovec{buf, nbytes}}}; + } + + // Coroutine body for a single write IO; co_returns whether the write succeeded. data is taken by value so its + // buffer outlives the co_await. + sisl::async::task< bool > do_write_io_single(lba_t start_lba, uint32_t nblks, sisl::byte_array data) { + auto const result = co_await homeblocks::async_write(m_vol_ptr, start_lba * g_page_size, + make_sgs(data->bytes(), nblks * g_page_size)); + { + std::lock_guard lock(m_mutex); + m_inflight_ios.erase(boost::icl::interval< int >::closed(start_lba, start_lba + nblks - 1)); + } + co_return result.has_value(); + } + void generate_write_io_single(lba_t start_lba = 0, uint32_t nblks = 0, bool wait = true, bool expect_failure = false) { // Generate a single io with start lba and nblks. - auto latch = std::make_shared< std::latch >(1); auto data = build_random_data(start_lba, nblks, !expect_failure /*store the generated data*/); - vol_interface_req_ptr req(new vol_interface_req{data->bytes(), start_lba, nblks, m_vol_ptr}); - auto vol_mgr = g_helper->inst()->volume_manager(); - vol_mgr->write(m_vol_ptr, req) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, data, req, latch, expect_failure](auto&& result) { - ASSERT_EQ(!result.has_value(), expect_failure); - { - std::lock_guard lock(m_mutex); - m_inflight_ios.erase(boost::icl::interval< int >::closed(req->lba, req->lba + req->nlbas - 1)); - } - - latch->count_down(); - }); - - if (wait) { latch->wait(); } + if (wait) { + // Drive the write coroutine to completion on this (test) thread, then assert. 
+ auto const succeeded = homeblocks::detail::sync_get(do_write_io_single(start_lba, nblks, data)); + ASSERT_EQ(succeeded, !expect_failure); + } else { + homeblocks::detail::detach(do_write_io_single(start_lba, nblks, data)); + } } auto generate_write_io_task(lba_t start_lba = 0, uint32_t nblks = 0) { @@ -237,31 +243,31 @@ class VolumeIOImpl { return m_read_runner->execute(); } + // Coroutine body for one runner-driven write IO; chains the next runner task on completion. + sisl::async::task< void > do_write_io(lba_t start_lba, uint32_t nblks, sisl::byte_array data) { + auto const result = co_await homeblocks::async_write(m_vol_ptr, start_lba * g_page_size, + make_sgs(data->bytes(), nblks * g_page_size)); + RELEASE_ASSERT(result.has_value(), "Write failed with error={}", result.error()); + { + std::lock_guard lock(m_mutex); + m_inflight_ios.erase(boost::icl::interval< int >::closed(start_lba, start_lba + nblks - 1)); + LOGDEBUG("end write io start={} end={}", start_lba, start_lba + nblks - 1); + } + m_write_count++; + m_write_runner->next_task(); + } + void generate_write_io(lba_t start_lba = 0, uint32_t nblks = 0) { auto data = build_random_data(start_lba, nblks); - vol_interface_req_ptr req(new vol_interface_req{data->bytes(), start_lba, nblks, m_vol_ptr}); - auto vol_mgr = g_helper->inst()->volume_manager(); - LOGDEBUG("begin write io start={} end={}", req->lba, req->lba + req->nlbas - 1); - vol_mgr->write(m_vol_ptr, req) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, req, data](auto&& result) { - RELEASE_ASSERT(result.has_value(), "Write failed with error={}", result.error()); - { - std::lock_guard lock(m_mutex); - m_inflight_ios.erase(boost::icl::interval< int >::closed(req->lba, req->lba + req->nlbas - 1)); - LOGDEBUG("end write io start={} end={}", req->lba, req->lba + req->nlbas - 1); - } - m_write_count++; - m_write_runner->next_task(); - }); + LOGDEBUG("begin write io start={} end={}", start_lba, start_lba + nblks - 1); + 
homeblocks::detail::detach(do_write_io(start_lba, nblks, data)); } void sync_read(lba_t start_lba, uint32_t nlbas) { auto sz = nlbas * m_vol_ptr->info()->page_size; sisl::io_blob_safe read_blob(sz, 512); - auto buf = read_blob.bytes(); - vol_interface_req_ptr req(new vol_interface_req{buf, start_lba, nlbas, m_vol_ptr}); - auto read_resp = g_helper->inst()->volume_manager()->read(m_vol_ptr, req).get(); + auto read_resp = homeblocks::detail::sync_get( + homeblocks::async_read(m_vol_ptr, start_lba * g_page_size, make_sgs(read_blob.bytes(), sz))); if (!read_resp.has_value()) { LOGERROR("Read failed with error={}", read_resp.error()); } RELEASE_ASSERT(read_resp.has_value(), "Read failed with error={}", read_resp.error()); } @@ -271,8 +277,8 @@ class VolumeIOImpl { auto sz = nlbas * m_vol_ptr->info()->page_size; sisl::io_blob_safe read_blob(sz, 512); auto buf = read_blob.bytes(); - vol_interface_req_ptr req(new vol_interface_req{buf, start_lba, nlbas, m_vol_ptr}); - auto read_resp = g_helper->inst()->volume_manager()->read(m_vol_ptr, req).get(); + auto read_resp = + homeblocks::detail::sync_get(homeblocks::async_read(m_vol_ptr, start_lba * g_page_size, make_sgs(buf, sz))); if (!read_resp.has_value()) { LOGERROR("Read failed with error={}", read_resp.error()); } RELEASE_ASSERT(read_resp.has_value(), "Read failed with error={}", read_resp.error()); auto read_sz = m_vol_ptr->info()->page_size; @@ -286,41 +292,38 @@ class VolumeIOImpl { } LOGDEBUG("Verify data lba={} pattern expected={} actual={}", lba, data_pattern, - *r_cast< uint64_t* >(read_blob.bytes())); + *reinterpret_cast< uint64_t* >(read_blob.bytes())); } } + // Coroutine body for one runner-driven read IO; read_blob is moved into the frame so its buffer (which + // req points into) outlives the co_await. Chains the next runner task on completion. 
+ sisl::async::task< void > do_read_io(lba_t start_lba, uint32_t nlbas, sisl::io_blob_safe read_blob) { + auto const result = co_await homeblocks::async_read(m_vol_ptr, start_lba * g_page_size, + make_sgs(read_blob.bytes(), nlbas * g_page_size)); + RELEASE_ASSERT(result.has_value(), "Read failed with error={}", result.error()); + { + std::lock_guard lock(m_mutex); + m_inflight_ios.erase(boost::icl::interval< int >::closed(start_lba, start_lba + nlbas - 1)); + LOGDEBUG("end read io start={} end={}", start_lba, start_lba + nlbas - 1); + } + m_read_count++; + m_read_runner->next_task(); + } + void generate_read_io(lba_t start_lba = 0, uint32_t nlbas = 0) { auto info = m_vol_ptr->info(); uint64_t page_size = info->page_size; uint64_t max_blks = static_cast< uint64_t >(info->size_bytes) / page_size; get_random_non_overlapping_lba(start_lba, nlbas, max_blks); sisl::io_blob_safe read_blob(nlbas * page_size, 512); - auto buf = read_blob.bytes(); - vol_interface_req_ptr req(new vol_interface_req{buf, start_lba, nlbas, m_vol_ptr}); - LOGDEBUG("begin read io start={} end={}", req->lba, req->lba + req->nlbas - 1); - auto read_resp = - g_helper->inst() - ->volume_manager() - ->read(m_vol_ptr, req) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, read_blob = std::move(read_blob), req](auto&& result) { - RELEASE_ASSERT(result.has_value(), "Read failed with error={}", result.error()); - { - std::lock_guard lock(m_mutex); - m_inflight_ios.erase(boost::icl::interval< int >::closed(req->lba, req->lba + req->nlbas - 1)); - LOGDEBUG("end read io start={} end={}", req->lba, req->lba + req->nlbas - 1); - } - m_read_count++; - m_read_runner->next_task(); - }); + LOGDEBUG("begin read io start={} end={}", start_lba, start_lba + nlbas - 1); + homeblocks::detail::detach(do_read_io(start_lba, nlbas, std::move(read_blob))); } void verify_all_data(uint64_t nlbas_per_io = 1) { - auto start_lba = m_lba_data.begin()->first; - auto end_lba = m_lba_data.rbegin()->first; - 
LOGTRACE("Verifying data for volume {} from lba={} to lba={} with nlbas_per_io={}", m_vol_name, start_lba, - end_lba, nlbas_per_io); + LOGTRACE("Verifying data for volume {} from lba={} to lba={} with nlbas_per_io={}", m_vol_name, + m_lba_data.begin()->first, m_lba_data.rbegin()->first, nlbas_per_io); for (auto& [lba, _] : m_lba_data) { read_and_verify(lba, nlbas_per_io); } @@ -342,7 +345,7 @@ class VolumeIOImpl { private: std::mutex m_mutex; std::string m_vol_name; - VolumePtr m_vol_ptr; + volume_handle m_vol_ptr; volume_id_t m_vol_id; static inline uint32_t m_volume_id_{1}; // Mapping from lba to data patttern. @@ -394,7 +397,7 @@ class VolumeIOTest : public ::testing::Test { auto generate_write_io(shared< VolumeIOImpl > vol = nullptr, lba_t start_lba = 0, uint32_t nblks = 0, bool wait = true) { // Generate write io based on num_io and qdepth with start lba and nblks. - std::vector< folly::Future< folly::Unit > > futs; + std::vector< std::future< void > > futs; if (vol != nullptr) { futs.emplace_back(vol->generate_write_io_task(start_lba, nblks)); } else { @@ -404,7 +407,7 @@ class VolumeIOTest : public ::testing::Test { } if (wait) { - folly::collectAll(futs).get(); + test_common::wait_all(futs); LOGINFO("Write IO completed count={}", get_total_writes()); } @@ -414,7 +417,7 @@ class VolumeIOTest : public ::testing::Test { auto generate_read_io(shared< VolumeIOImpl > vol = nullptr, lba_t start_lba = 0, uint32_t nblks = 0, bool wait = true) { // Generate read io based on num_io and qdepth with start lba and nblks. 
- std::vector< folly::Future< folly::Unit > > futs; + std::vector< std::future< void > > futs; if (vol != nullptr) { futs.emplace_back(vol->generate_read_io_task(start_lba, nblks)); } else { @@ -424,7 +427,7 @@ class VolumeIOTest : public ::testing::Test { } if (wait) { - folly::collectAll(futs).get(); + test_common::wait_all(futs); LOGINFO("Read IO completed count={}", get_total_reads()); } @@ -589,7 +592,7 @@ TEST_F(VolumeIOTest, LongRunningRandomIO) { uint64_t total_reads{0}, total_writes{0}; auto start_time = std::chrono::high_resolution_clock::now(); do { - std::vector< folly::Future< folly::Unit > > futs; + std::vector< std::future< void > > futs; // Generate write's on all volumes with random lba and nblks on all volumes. auto writes = generate_write_io(nullptr /* vol */, 0 /* start_lba */, 0 /* nblks */, false /* wait */); @@ -599,7 +602,7 @@ TEST_F(VolumeIOTest, LongRunningRandomIO) { futs.insert(futs.end(), std::make_move_iterator(writes.begin()), std::make_move_iterator(writes.end())); futs.insert(futs.end(), std::make_move_iterator(reads.begin()), std::make_move_iterator(reads.end())); - folly::collectAll(futs).get(); + test_common::wait_all(futs); total_reads += get_total_reads(); total_writes += get_total_writes(); @@ -645,15 +648,15 @@ TEST_F(VolumeIOTest, LongRunningSequentialIO) { lba_count_t nblks = 100; uint64_t total_reads{0}, total_writes{0}; do { - std::vector< folly::Future< folly::Unit > > futs; + std::vector< std::future< void > > futs; // Generate write's on all volumes with sequential lba and nblks on all volumes. auto writes = generate_write_io(nullptr /* vol */, cur_lba /* start_lba */, nblks, false /* wait */); - folly::collectAll(writes).get(); + test_common::wait_all(writes); // Generate reads on all volumes with sequential lba and nblks on all volumes. 
auto reads = generate_read_io(nullptr /* vol */, cur_lba /* start_lba */, nblks, false /* wait */); - folly::collectAll(reads).get(); + test_common::wait_all(reads); total_reads += get_total_reads(); total_writes += get_total_writes(); @@ -693,12 +696,12 @@ TEST_F(VolumeIOTest, PerfRandomIo) { auto start_time = std::chrono::high_resolution_clock::now(); do { - std::vector< folly::Future< folly::Unit > > futs; + std::vector< std::future< void > > futs; // Generate write's on all volumes with random lba and nblks on all volumes. auto ios = generate_random_io(nullptr /* vol */, 0 /* start_lba */, 0 /* nblks */, false /* wait */); futs.insert(futs.end(), std::make_move_iterator(ios.begin()), std::make_move_iterator(ios.end())); - folly::collectAll(futs).get(); + test_common::wait_all(futs); std::chrono::duration< double > elapsed = std::chrono::high_resolution_clock::now() - start_time; auto elapsed_seconds = static_cast< uint64_t >(elapsed.count()); static uint64_t log_pct = 0; @@ -725,11 +728,11 @@ TEST_F(VolumeIOTest, PerfSequentialIo) { lba_count_t nblks; do { get_random_nblks(nblks); - std::vector< folly::Future< folly::Unit > > futs; + std::vector< std::future< void > > futs; // Generate write's on all volumes with sequential lba and nblks on all volumes. 
auto ios = generate_random_io(nullptr /* vol */, cur_lba /* start_lba */, nblks, false /* wait */); - folly::collectAll(ios).get(); + test_common::wait_all(ios); std::chrono::duration< double > elapsed = std::chrono::high_resolution_clock::now() - start_time; auto elapsed_seconds = static_cast< uint64_t >(elapsed.count()); @@ -822,10 +825,8 @@ int main(int argc, char* argv[]) { } ::testing::InitGoogleTest(&parsed_argc, argv); - SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_common_setup, test_volume_io_setup, homeblocks, config); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_common_setup, test_volume_io_setup, config); spdlog::set_pattern("[%D %T%z] [%^%l%$] [%n] [%t] %v"); - parsed_argc = 1; - auto f = ::folly::Init(&parsed_argc, &argv, true); g_helper = std::make_unique< test_common::HBTestHelper >("test_volume_io", args, orig_argv); g_helper->setup(); diff --git a/src/lib/volume/volume.cpp b/src/lib/volume/volume.cpp index 6ec3f75..fe2a2a6 100644 --- a/src/lib/volume/volume.cpp +++ b/src/lib/volume/volume.cpp @@ -15,18 +15,13 @@ *********************************************************************************/ #include "volume.hpp" #include "lib/homeblks_impl.hpp" +#include "coro_helpers.hpp" #include #include namespace homeblocks { -static VolumeError to_volume_error(std::error_code ec) { - switch (ec.value()) { - default: - return VolumeError::UNKNOWN; - } -} -uint64_t Volume::get_index_size() { +uint64_t volume::get_index_size() { // Get approximate index size based on volume size. additional space for interior nodes. 
const int32_t index_kv_size = 32; uint64_t index_size = (vol_info_->size_bytes / vol_info_->page_size) * index_kv_size * 3; @@ -34,14 +29,14 @@ uint64_t Volume::get_index_size() { } // this API will be called by volume manager after volume sb is recovered and volume is created; -shared< VolumeIndexTable > Volume::init_index_table(bool is_recovery, shared< VolumeIndexTable > tbl) { +shared< VolumeIndexTable > volume::init_index_table(bool is_recovery, shared< VolumeIndexTable > tbl) { if (!is_recovery) { index_cfg_t cfg(homestore::hs()->index_service().node_size()); cfg.m_leaf_node_type = btree_leaf_node_type; cfg.m_int_node_type = btree_int_node_type; // create index table; - auto uuid = hb_utils::gen_random_uuid(); + auto uuid = boost::uuids::random_generator()(); // user_sb_size is not currently enabled in homestore; // parent uuid is used during recovery in homeblks layer; @@ -67,20 +62,20 @@ shared< VolumeIndexTable > Volume::init_index_table(bool is_recovery, shared< Vo return indx_table(); } -Volume::Volume(sisl::byte_view const& buf, void* cookie, shared< VolumeChunkSelector > vol_chunk_sel, +volume::volume(sisl::byte_view const& buf, void* cookie, shared< VolumeChunkSelector > vol_chunk_sel, shared< VolumeChunkSelector > index_chunk_sel) : sb_{VOL_META_NAME}, volume_chunk_selector_{vol_chunk_sel}, index_chunk_selector_{index_chunk_sel} { - sb_.load(buf, cookie); + sb_.load(buf, static_cast< homestore::meta_blk* >(cookie)); // generate volume info from sb; - vol_info_ = std::make_shared< VolumeInfo >(sb_->id, sb_->size, sb_->page_size, sb_->name, sb_->ordinal); + vol_info_ = std::make_shared< volume_info >(sb_->id, sb_->size, sb_->page_size, sb_->name, sb_->ordinal); metrics_ = std::make_unique< VolumeMetrics >(vol_info_->name); m_state_ = sb_->state; - LOGI("Volume superblock loaded from disk, vol_info : {}", vol_info_->to_string()); + LOGI("volume superblock loaded from disk, vol_info : {}", vol_info_->to_string()); } -bool Volume::init(bool is_recovery) { 
+bool volume::init(bool is_recovery) { if (!is_recovery) { - // first time creation of the Volume, let's write the superblock; + // first time creation of the volume, let's write the superblock; // Allocate initial set of chunks for the volume with thin provisioning. uint32_t pdev_id; @@ -100,8 +95,10 @@ bool Volume::init(bool is_recovery) { // 1. create solo repl dev for volume; // members left empty on purpose for solo repl dev LOGI("Creating solo repl dev for volume: {}, uuid: {}", vol_info_->name, boost::uuids::to_string(id())); - auto ret = homestore::hs()->repl_service().create_repl_dev(id(), {} /*members*/).get(); - if (ret.hasError()) { + // create_repl_dev now returns a coroutine (async_result); drive it synchronously here (init is a + // synchronous control-plane call) and inspect the unified result. + auto ret = detail::sync_get(homestore::hs()->repl_service().create_repl_dev(id(), {} /*members*/)); + if (!ret.has_value()) { LOGE("Failed to create solo repl dev for volume: {}, uuid: {}, error: {}", vol_info_->name, boost::uuids::to_string(vol_info_->id), ret.error()); @@ -116,7 +113,7 @@ bool Volume::init(bool is_recovery) { } // 3. mark state as online; - state_change(vol_state::ONLINE); + state_change(volume_state::ONLINE); LOGI("Created volume: {} uuid: {} ordinal: {} size: {} pdev: {} num_chunks: {}", vol_info_->name, boost::uuids::to_string(vol_info_->id), vol_info_->ordinal, vol_info_->size_bytes, pdev_id, @@ -126,8 +123,8 @@ bool Volume::init(bool is_recovery) { LOGI("Getting repl dev for volume: {}, uuid: {}", vol_info_->name, boost::uuids::to_string(id())); auto ret = homestore::hs()->repl_service().get_repl_dev(id()); - if (ret.hasError()) { - LOGI("Volume in destroying state? Failed to get repl dev for volume name: {}, uuid: {}, error: {}", + if (!ret.has_value()) { + LOGI("volume in destroying state? 
Failed to get repl dev for volume name: {}, uuid: {}, error: {}", vol_info_->name, boost::uuids::to_string(vol_info_->id), ret.error()); rd_ = nullptr; // DEBUG_ASSERT(false, "Failed to get repl dev for volume"); @@ -157,14 +154,16 @@ bool Volume::init(bool is_recovery) { return true; } -void Volume::destroy() { +sisl::async::task< void > volume::destroy() { LOGI("Start destroying volume: {}, uuid: {}", vol_info_->name, boost::uuids::to_string(id())); destroy_started_ = true; // 1. destroy the repl dev; if (rd_) { LOGI("Destroying repl dev for volume: {}", vol_info_->name); - homestore::hs()->repl_service().remove_repl_dev(id()).get(); + // remove_repl_dev is a coroutine (async_status); co_await it (rather than a blocking sync_get) so this + // path can run on an iomgr reactor without parking it. Best-effort during destroy, result ignored. + (void)co_await homestore::hs()->repl_service().remove_repl_dev(id()); rd_ = nullptr; } @@ -172,8 +171,8 @@ void Volume::destroy() { if (iomgr_flip::instance()->test_flip("vol_destroy_crash_simulation")) { // this is to simulate crash during volume destroy; // volume should be able to resume destroy on next reboot; - LOGINFO("Volume destroy crash simulation flip is set, aborting"); - return; + LOGINFO("volume destroy crash simulation flip is set, aborting"); + co_return; } #endif @@ -181,7 +180,7 @@ void Volume::destroy() { if (indx_tbl_) { LOGI("Destroying index table for volume: {}, uuid: {}", vol_info_->name, boost::uuids::to_string(id())); // table superblk deletes in destroy(), hence it is safe to release chunks afterwards - indx_tbl_->destroy(); + co_await indx_tbl_->destroy(); index_chunk_selector_->release_chunks(vol_info_->ordinal); indx_tbl_ = nullptr; } @@ -194,7 +193,7 @@ void Volume::destroy() { volume_chunk_selector_->release_chunks(vol_info_->ordinal); } -void Volume::update_vol_sb_cb(const std::vector< chunk_num_t >& chunk_ids) { +void volume::update_vol_sb_cb(const std::vector< chunk_num_t >& chunk_ids) { // 
Update the volume superblk with latest set of chunk id's. uint32_t pdev_id = sb_->pdev_id; sb_.resize(sizeof(vol_sb_t) + (chunk_ids.size() * sizeof(homestore::chunk_num_t))); @@ -203,177 +202,174 @@ void Volume::update_vol_sb_cb(const std::vector< chunk_num_t >& chunk_ids) { sb_.write(); } -VolumeManager::NullAsyncResult Volume::write(const vol_interface_req_ptr& vol_req) { - vol_req->io_start_time = Clock::now(); +async_status volume::write(io_req& vol_req) { + vol_req.io_start_time = sisl::Clock::now(); // Step 1. Allocate new blkids. Homestore might return multiple blkid's pointing // to different contigious memory locations. - auto data_size = vol_req->nlbas * rd()->get_blk_size(); + auto data_size = vol_req.nlbas * rd()->get_blk_size(); homestore::blk_alloc_hints hints; hints.application_hint = vol_info_->ordinal; - std::vector< homestore::MultiBlkId > new_blkids; - auto result = rd()->alloc_blks(data_size, hints, new_blkids); - if (result) { + std::vector< homestore::multi_blk_id > new_blkids; + // alloc_blks now returns homestore::status (std::expected); a value means success (was an error_code where + // truthy meant failure -- hence the flipped check). + if (auto const alloc_res = rd()->alloc_blks(data_size, hints, new_blkids); !alloc_res) { LOGE("Failed to allocate blocks data_size={}", data_size); - return std::unexpected(VolumeError::NO_SPACE_LEFT); + co_return std::unexpected(std::errc::no_space_on_device); } COUNTER_INCREMENT(*metrics_, volume_write_count, 1); // Step 2. Write the data to those allocated blkids. 
- vol_req->data_svc_start_time = Clock::now(); + vol_req.data_svc_start_time = sisl::Clock::now(); sisl::sg_list data_sgs; - data_sgs.iovs.emplace_back(iovec{.iov_base = vol_req->buffer, .iov_len = data_size}); + data_sgs.iovs.emplace_back(iovec{.iov_base = vol_req.buffer, .iov_len = data_size}); data_sgs.size = data_size; - return rd() - ->async_write(new_blkids, data_sgs, vol_req->part_of_batch) - .thenValue([this, vol_req, - new_blkids = std::move(new_blkids)](auto&& result) -> VolumeManager::NullAsyncResult { - if (result) { return std::unexpected(VolumeError::DRIVE_WRITE_ERROR); } - HISTOGRAM_OBSERVE(*metrics_, volume_data_write_latency, get_elapsed_time_us(vol_req->data_svc_start_time)); - vol_req->index_start_time = Clock::now(); - using homestore::BlkId; - std::vector< BlkId > old_blkids; - std::unordered_map< lba_t, BlockInfo > blocks_info; - auto blk_size = rd()->get_blk_size(); - auto data_size = vol_req->nlbas * blk_size; - auto data_buffer = vol_req->buffer; - lba_t start_lba = vol_req->lba; - for (auto& blkid : new_blkids) { - DEBUG_ASSERT_EQ(blkid.num_pieces(), 1, "Multiple blkid pieces"); - LOGT("volume write blkid={} start_lba={}", blkid.to_string(), start_lba); - - // Split the large blkid to individual blkid having only one block because each LBA points - // to a blkid containing single blk which is stored in index value. Calculate the checksum for each - // block which is also stored in index. - for (uint32_t i = 0; i < blkid.blk_count(); i++) { - auto new_bid = BlkId{blkid.blk_num() + i, 1 /* nblks */, blkid.chunk_num()}; - auto csum = crc16_t10dif(init_crc_16, static_cast< unsigned char* >(data_buffer), blk_size); - blocks_info.emplace(start_lba + i, BlockInfo{new_bid, BlkId{}, csum}); - LOGT("volume write blkid={} csum={} start_lba={} lba={}", new_bid.to_string(), - blocks_info[start_lba + i].new_checksum, start_lba, start_lba + i); - data_buffer += blk_size; - } - - // Step 3. 
For range [start_lba, end_lba] in this blkid, write the values to index. - // Should there be any overwritten on existing lbas, old blocks to be freed will be collected - // in blocks_info after write_to_index - lba_t end_lba = start_lba + blkid.blk_count() - 1; - auto status = indx_table()->write_to_index(start_lba, end_lba, blocks_info); - if (!status) { return std::unexpected(VolumeError::INDEX_ERROR); } - - start_lba = end_lba + 1; - } - HISTOGRAM_OBSERVE(*metrics_, volume_map_write_latency, get_elapsed_time_us(vol_req->index_start_time)); - - vol_req->journal_start_time = Clock::now(); - // Collect all old blocks to write to journal. - for (auto& [_, info] : blocks_info) { - if (info.old_blkid.is_valid()) { - LOGT("volume write start_lba={} old blkids={}", vol_req->lba, info.old_blkid.to_string()); - old_blkids.emplace_back(info.old_blkid); - } - } - - auto csum_size = sizeof(homestore::csum_t) * vol_req->nlbas; - auto old_blkids_size = sizeof(BlkId) * old_blkids.size(); - auto key_size = sizeof(VolJournalEntry) + csum_size + old_blkids_size; - - auto req = - repl_result_ctx< VolumeManager::NullResult >::make(sizeof(MsgHeader) /* header size */, key_size); - req->vol_ptr_ = shared_from_this(); - req->header()->msg_type = MsgType::WRITE; - // Store volume id for recovery path (log replay) - req->header()->volume_id = id(); - - // Step 4. Store lba, nlbas, list of checksum of each blk, list of old blkids as key in the journal. - // New blkid's are written to journal by the homestore async_write_journal. After journal flush, - // on_commit will be called where we free the old blkid's and the write iscompleted. 
- VolJournalEntry hb_key{vol_req->lba, vol_req->nlbas, static_cast< uint16_t >(old_blkids.size())}; - auto key_buf = req->key_buf().bytes(); - std::memcpy(key_buf, &hb_key, sizeof(VolJournalEntry)); - key_buf += sizeof(VolJournalEntry); - - auto lba = vol_req->lba; - for (lba_count_t count = 0; count < vol_req->nlbas; count++) { - std::memcpy(key_buf, &blocks_info[lba].new_checksum, sizeof(homestore::csum_t)); - key_buf += sizeof(homestore::csum_t); - lba++; - } - - for (auto& blkid : old_blkids) { - std::memcpy(key_buf, &blkid, sizeof(BlkId)); - key_buf += sizeof(BlkId); - } + // NOTE: v8 io_batch is a reactor-local RAII scope, not the old cross-call part_of_batch accumulator, so we + // issue the op un-batched (see HomeBlocksImpl::submit_io_batch()). + if (auto const wr = co_await rd()->async_write(new_blkids, data_sgs, nullptr); !wr) { + co_return std::unexpected(std::errc::io_error); + } + HISTOGRAM_OBSERVE(*metrics_, volume_data_write_latency, get_elapsed_time_us(vol_req.data_svc_start_time)); + vol_req.index_start_time = sisl::Clock::now(); + using homestore::blk_id; + std::vector< blk_id > old_blkids; + std::unordered_map< lba_t, BlockInfo > blocks_info; + auto blk_size = rd()->get_blk_size(); + auto data_buffer = vol_req.buffer; + lba_t start_lba = vol_req.lba; + for (auto& blkid : new_blkids) { + DEBUG_ASSERT_EQ(blkid.num_pieces(), 1, "Multiple blkid pieces"); + LOGT("volume write blkid={} start_lba={}", blkid.to_string(), start_lba); + + // Split the large blkid to individual blkid having only one block because each LBA points + // to a blkid containing single blk which is stored in index value. Calculate the checksum for each + // block which is also stored in index. 
+ for (uint32_t i = 0; i < blkid.blk_count(); i++) { + auto new_bid = blk_id{blkid.blk_num() + i, 1 /* nblks */, blkid.chunk_num()}; + auto csum = crc16_t10dif(init_crc_16, static_cast< unsigned char* >(data_buffer), blk_size); + blocks_info.emplace(start_lba + i, BlockInfo{new_bid, blk_id{}, csum}); + LOGT("volume write blkid={} csum={} start_lba={} lba={}", new_bid.to_string(), + blocks_info[start_lba + i].new_checksum, start_lba, start_lba + i); + data_buffer += blk_size; + } + + // Step 3. For range [start_lba, end_lba] in this blkid, write the values to index. + // Should there be any overwritten on existing lbas, old blocks to be freed will be collected + // in blocks_info after write_to_index + lba_t end_lba = start_lba + blkid.blk_count() - 1; + if (auto const idx_res = indx_table()->write_to_index(start_lba, end_lba, blocks_info); !idx_res) { + co_return std::unexpected(volume_error::INDEX_ERROR); + } + + start_lba = end_lba + 1; + } + HISTOGRAM_OBSERVE(*metrics_, volume_map_write_latency, get_elapsed_time_us(vol_req.index_start_time)); + + vol_req.journal_start_time = sisl::Clock::now(); + // Collect all old blocks to write to journal. + for (auto& [_, info] : blocks_info) { + if (info.old_blkid.is_valid()) { + LOGT("volume write start_lba={} old blkids={}", vol_req.lba, info.old_blkid.to_string()); + old_blkids.emplace_back(info.old_blkid); + } + } + + auto csum_size = sizeof(homestore::csum_t) * vol_req.nlbas; + auto old_blkids_size = sizeof(blk_id) * old_blkids.size(); + auto key_size = sizeof(VolJournalEntry) + csum_size + old_blkids_size; + + auto req = repl_result_ctx< status >::make(sizeof(MsgHeader) /* header size */, key_size); + req->vol_ptr_ = shared_from_this(); + req->header()->msg_type = MsgType::WRITE; + // Store volume id for recovery path (log replay) + req->header()->volume_id = id(); + + // Step 4. Store lba, nlbas, list of checksum of each blk, list of old blkids as key in the journal. 
+ // New blkid's are written to journal by the homestore async_write_journal. After journal flush, + // on_commit will be called where we free the old blkid's and the write iscompleted. + VolJournalEntry hb_key{vol_req.lba, vol_req.nlbas, static_cast< uint16_t >(old_blkids.size())}; + auto key_buf = req->key_buf().bytes(); + std::memcpy(key_buf, &hb_key, sizeof(VolJournalEntry)); + key_buf += sizeof(VolJournalEntry); + + auto lba = vol_req.lba; + for (lba_count_t count = 0; count < vol_req.nlbas; count++) { + std::memcpy(key_buf, &blocks_info[lba].new_checksum, sizeof(homestore::csum_t)); + key_buf += sizeof(homestore::csum_t); + lba++; + } + + for (auto& blkid : old_blkids) { + std::memcpy(key_buf, &blkid, sizeof(blk_id)); + key_buf += sizeof(blk_id); + } #ifdef _PRERELEASE - if (iomgr_flip::instance()->test_flip("vol_write_crash_after_data_write")) { - // this is to simulate crash during write where data is persisted journal is - // not persisted. After recovery there is no index for. - LOGINFO("Volume write crash simulation flip is set, aborting"); - return VolumeManager::NullResult(); - } + if (iomgr_flip::instance()->test_flip("vol_write_crash_after_data_write")) { + // this is to simulate crash during write where data is persisted journal is + // not persisted. After recovery there is no index for. 
+ LOGINFO("volume write crash simulation flip is set, aborting"); + co_return status(); + } #endif - rd()->async_write_journal(new_blkids, req->cheader_buf(), req->ckey_buf(), data_size, req); - - return req->result() - .via(&folly::InlineExecutor::instance()) - .thenValue([this, vol_req](const auto&& result) -> std::expected< void, VolumeError > { - if (!result.has_value()) { - LOGE("Failed to write to journal for volume: {}, lba: {}, nlbas: {}, error: {}", - vol_info_->name, vol_req->lba, vol_req->nlbas, result.error()); - auto err = result.error(); - return std::unexpected(err); - } - HISTOGRAM_OBSERVE(*metrics_, volume_journal_write_latency, - get_elapsed_time_us(vol_req->journal_start_time)); - auto write_size = vol_req->nlbas * rd()->get_blk_size(); - COUNTER_INCREMENT(*metrics_, volume_write_size_total, write_size); - HISTOGRAM_OBSERVE(*metrics_, volume_write_size_distribution, write_size); - HISTOGRAM_OBSERVE(*metrics_, volume_write_latency, get_elapsed_time_us(vol_req->io_start_time)); - return {}; - }); - }); + rd()->async_write_journal(new_blkids, req->cheader_buf(), req->ckey_buf(), data_size, req); + + // Wait for the journal flush -> on_commit -> HomeBlocksImpl::on_write completion. The result is delivered + // cross-thread via the repl_result_ctx's value_awaitable; we resume inline + // on the commit thread. 
+ auto const jres = co_await req->promise_; + if (!jres.has_value()) { + LOGE("Failed to write to journal for volume: {}, lba: {}, nlbas: {}, error: {}", vol_info_->name, vol_req.lba, + vol_req.nlbas, jres.error()); + co_return std::unexpected(jres.error()); + } + HISTOGRAM_OBSERVE(*metrics_, volume_journal_write_latency, get_elapsed_time_us(vol_req.journal_start_time)); + auto write_size = vol_req.nlbas * rd()->get_blk_size(); + COUNTER_INCREMENT(*metrics_, volume_write_size_total, write_size); + HISTOGRAM_OBSERVE(*metrics_, volume_write_size_distribution, write_size); + HISTOGRAM_OBSERVE(*metrics_, volume_write_latency, get_elapsed_time_us(vol_req.io_start_time)); + co_return homeblocks::ok(); } -VolumeManager::NullAsyncResult Volume::read(const vol_interface_req_ptr& req) { - req->io_start_time = Clock::now(); +async_status volume::read(io_req& req) { + req.io_start_time = sisl::Clock::now(); // Step 1: get the blk ids from index table - vol_read_ctx read_ctx{.vol_req = req, .blk_size = rd()->get_blk_size()}; - if (auto index_resp = indx_table()->read_from_index(req, read_ctx.index_kvs); !index_resp.has_value()) { - LOGE("Failed to read from index table for range=[{}, {}], volume id: {}, error: {}", req->lba, req->end_lba(), + vol_read_ctx read_ctx{.req = &req, .blk_size = rd()->get_blk_size()}; + if (auto index_resp = indx_table()->read_from_index(req.lba, req.end_lba(), read_ctx.index_kvs); + !index_resp.has_value()) { + LOGE("Failed to read from index table for range=[{}, {}], volume id: {}, error: {}", req.lba, req.end_lba(), boost::uuids::to_string(id()), index_resp.error()); - return VolumeManager::NullResult(); + co_return status(); } - HISTOGRAM_OBSERVE(*metrics_, volume_map_read_latency, get_elapsed_time_us(req->io_start_time)); + HISTOGRAM_OBSERVE(*metrics_, volume_map_read_latency, get_elapsed_time_us(req.io_start_time)); COUNTER_INCREMENT(*metrics_, volume_read_count, 1); // Step 2: Consolidate the blocks by merging the contiguous blkids - 
std::vector< folly::Future< std::error_code > > futs; + std::vector< sisl::async::task< iomgr::io_result > > futs; + // async_read takes sisl::sg_list by reference and the returned task is lazy (consumed only when when_all + // starts it). The sg_lists must therefore outlive the co_await below, so we keep them here (stable + // addresses via unique_ptr) rather than in submit_read_to_backend's per-iteration scope. + std::vector< std::unique_ptr< sisl::sg_list > > sgs_keepalive; read_blks_list_t blks_to_read; generate_blkids_to_read(read_ctx.index_kvs, blks_to_read); // Step 3: Submit the read requests to backend - req->data_svc_start_time = Clock::now(); - submit_read_to_backend(blks_to_read, req, futs); - - if (read_ctx.index_kvs.empty()) { return VolumeManager::NullResult(); } - - // Step 4: verify the checksum after all the reads are done - return folly::collectAllUnsafe(futs).thenValue([this, read_ctx](auto&& vf) -> VolumeManager::NullResult { - for (auto const& err_c : vf) { - if (sisl_unlikely(err_c.value())) { - auto ec = err_c.value(); - return std::unexpected(to_volume_error(ec)); - } - } - HISTOGRAM_OBSERVE(*metrics_, volume_data_read_latency, - get_elapsed_time_us(read_ctx.vol_req->data_svc_start_time)); - // verify the checksum and return - return verify_checksum(read_ctx); - }); + req.data_svc_start_time = sisl::Clock::now(); + submit_read_to_backend(blks_to_read, req, futs, sgs_keepalive); + + if (read_ctx.index_kvs.empty()) { co_return status(); } + + // Step 4: wait for all the reads, then verify the checksum. 
+ auto const results = co_await sisl::async::when_all(std::move(futs)); + for (auto const& r : results) { + if (sisl_unlikely(!r.has_value())) { co_return std::unexpected(r.error()); } + } + HISTOGRAM_OBSERVE(*metrics_, volume_data_read_latency, get_elapsed_time_us(read_ctx.req->data_svc_start_time)); + // verify the checksum and return + co_return verify_checksum(read_ctx); } -void Volume::generate_blkids_to_read(const index_kv_list_t& index_kvs, read_blks_list_t& blks_to_read) { +void volume::generate_blkids_to_read(const index_kv_list_t& index_kvs, read_blks_list_t& blks_to_read) { for (uint32_t i = 0, start_idx = 0; i < index_kvs.size(); ++i) { auto const& [key, value] = index_kvs[i]; bool is_contiguous = (i == 0 || @@ -391,19 +387,19 @@ void Volume::generate_blkids_to_read(const index_kv_list_t& index_kvs, read_blks // we need to account for it in the blk_count auto blk_count = is_contiguous ? (i - start_idx + 1) : (i - start_idx); blks_to_read.emplace_back(index_kvs[start_idx].first.lba(), - homestore::MultiBlkId(blk_num, blk_count, chunk_num)); + homestore::multi_blk_id(blk_num, blk_count, chunk_num)); start_idx = i; if (!is_contiguous && i == index_kvs.size() - 1) { // if the last entry is not contiguous, we need to add it as well blks_to_read.emplace_back(key.lba(), - homestore::MultiBlkId(value.blkid().blk_num(), 1, value.blkid().chunk_num())); + homestore::multi_blk_id(value.blkid().blk_num(), 1, value.blkid().chunk_num())); } } } -VolumeManager::NullResult Volume::verify_checksum(vol_read_ctx const& read_ctx) { - auto read_buf = read_ctx.vol_req->buffer; - for (uint64_t cur_lba = read_ctx.vol_req->lba, i = 0; i < read_ctx.index_kvs.size();) { +status volume::verify_checksum(vol_read_ctx const& read_ctx) { + auto read_buf = read_ctx.req->buffer; + for (uint64_t cur_lba = read_ctx.req->lba, i = 0; i < read_ctx.index_kvs.size();) { auto const& [key, value] = read_ctx.index_kvs[i]; // ignore the holes if (cur_lba != key.lba()) { @@ -411,31 +407,31 @@ 
VolumeManager::NullResult Volume::verify_checksum(vol_read_ctx const& read_ctx) cur_lba = key.lba(); continue; } - DEBUG_ASSERT_EQ(read_buf - read_ctx.vol_req->buffer, (cur_lba - read_ctx.vol_req->lba) * read_ctx.blk_size, + DEBUG_ASSERT_EQ(read_buf - read_ctx.req->buffer, (cur_lba - read_ctx.req->lba) * read_ctx.blk_size, "Read buffer size mismatch, expected: {}, actual: {}", - (cur_lba - read_ctx.vol_req->lba) * read_ctx.blk_size, read_buf - read_ctx.vol_req->buffer); + (cur_lba - read_ctx.req->lba) * read_ctx.blk_size, read_buf - read_ctx.req->buffer); auto checksum = crc16_t10dif(init_crc_16, static_cast< unsigned char* >(read_buf), read_ctx.blk_size); if (checksum != value.checksum()) { LOGE("crc mismatch for lba: {} start: {}, end: {} blk id {}, expected: {}, actual: {}", cur_lba, - read_ctx.vol_req->lba, read_ctx.vol_req->end_lba(), value.blkid().to_string(), value.checksum(), - checksum); - return std::unexpected(VolumeError::CRC_MISMATCH); + read_ctx.req->lba, read_ctx.req->end_lba(), value.blkid().to_string(), value.checksum(), checksum); + return std::unexpected(volume_error::CRC_MISMATCH); } read_buf += read_ctx.blk_size; ++i; ++cur_lba; } - auto read_size = read_ctx.vol_req->nlbas * read_ctx.blk_size; + auto read_size = read_ctx.req->nlbas * read_ctx.blk_size; COUNTER_INCREMENT(*metrics_, volume_read_size_total, read_size); HISTOGRAM_OBSERVE(*metrics_, volume_read_size_distribution, read_size); - HISTOGRAM_OBSERVE(*metrics_, volume_read_latency, get_elapsed_time_us(read_ctx.vol_req->io_start_time)); + HISTOGRAM_OBSERVE(*metrics_, volume_read_latency, get_elapsed_time_us(read_ctx.req->io_start_time)); return {}; } -void Volume::submit_read_to_backend(read_blks_list_t const& blks_to_read, const vol_interface_req_ptr& req, - std::vector< folly::Future< std::error_code > >& futs) { - auto* read_buf = req->buffer; +void volume::submit_read_to_backend(read_blks_list_t const& blks_to_read, const io_req& req, + std::vector< sisl::async::task< iomgr::io_result 
> >& futs, + std::vector< std::unique_ptr< sisl::sg_list > >& sgs_keepalive) { + auto* read_buf = req.buffer; auto inst = HomeBlocksImpl::instance(); if (read_buf == nullptr && inst->fc_on()) { @@ -444,7 +440,7 @@ void Volume::submit_read_to_backend(read_blks_list_t const& blks_to_read, const } else { RELEASE_ASSERT(read_buf != nullptr, "Read buffer is null"); } - uint32_t prev_lba = req->lba; + uint32_t prev_lba = req.lba; uint32_t prev_nblks = 0; for (uint32_t i = 0; i < blks_to_read.size(); ++i) { auto const& [start_lba, blkids] = blks_to_read[i]; @@ -456,28 +452,31 @@ void Volume::submit_read_to_backend(read_blks_list_t const& blks_to_read, const std::memset(read_buf, 0, holes_size); read_buf += holes_size; } - DEBUG_ASSERT_EQ(read_buf - req->buffer, (start_lba - req->lba) * rd()->get_blk_size(), + DEBUG_ASSERT_EQ(read_buf - req.buffer, (start_lba - req.lba) * rd()->get_blk_size(), "Read buffer size mismatch, expected: {}, actual: {}", - (start_lba - req->lba) * rd()->get_blk_size(), read_buf - req->buffer); - sisl::sg_list sgs; - sgs.size = blkids.blk_count() * rd()->get_blk_size(); - sgs.iovs.emplace_back(iovec{.iov_base = read_buf, .iov_len = sgs.size}); - read_buf += sgs.size; - futs.emplace_back(rd()->async_read(blkids, sgs, sgs.size, req->part_of_batch)); + (start_lba - req.lba) * rd()->get_blk_size(), read_buf - req.buffer); + auto sgs = std::make_unique< sisl::sg_list >(); + sgs->size = blkids.blk_count() * rd()->get_blk_size(); + sgs->iovs.emplace_back(iovec{.iov_base = read_buf, .iov_len = sgs->size}); + read_buf += sgs->size; + // un-batched read (see volume::write note on v8 io_batch). The sg_list must outlive the lazy task, so + // it is owned by sgs_keepalive (in the caller's coroutine frame), not this loop scope. 
+ futs.emplace_back(rd()->async_read(blkids, *sgs, sgs->size, nullptr)); + sgs_keepalive.emplace_back(std::move(sgs)); prev_lba = start_lba; prev_nblks = blkids.blk_count(); } // if there are any holes at the end, fill them with zeros - if (prev_lba + prev_nblks < req->end_lba() + 1) { - auto holes_size = (req->end_lba() + 1 - (prev_lba + prev_nblks)) * rd()->get_blk_size(); + if (prev_lba + prev_nblks < req.end_lba() + 1) { + auto holes_size = (req.end_lba() + 1 - (prev_lba + prev_nblks)) * rd()->get_blk_size(); if (holes_size > 0) { std::memset(read_buf, 0, holes_size); read_buf += holes_size; } } - DEBUG_ASSERT_EQ(read_buf - req->buffer, req->nlbas * rd()->get_blk_size(), - "Read buffer size mismatch, expected: {}, actual: {}", req->nlbas * rd()->get_blk_size(), - read_buf - req->buffer); + DEBUG_ASSERT_EQ(read_buf - req.buffer, req.nlbas * rd()->get_blk_size(), + "Read buffer size mismatch, expected: {}, actual: {}", req.nlbas * rd()->get_blk_size(), + read_buf - req.buffer); } // Note: Metrics scrapping can happen at any point after volume instance is created and registered with metrics farm; diff --git a/src/lib/volume/volume.hpp b/src/lib/volume/volume.hpp index 2097c1b..206e908 100644 --- a/src/lib/volume/volume.hpp +++ b/src/lib/volume/volume.hpp @@ -15,11 +15,17 @@ *********************************************************************************/ #pragma once -#include +#include "hb_internal.hpp" +#include #include "sisl/utility/enum.hpp" #include #include -#include +#include +#include +#include +#include +#include +#include #if USE_FIXED_INDEX #include "index_fixed_table.hpp" @@ -28,19 +34,19 @@ #endif #include "volume_chunk_selector.hpp" +#include "io_req.hpp" #include "sisl/utility/atomic_counter.hpp" -#include namespace homeblocks { using VolIdxTablePtr = shared< VolumeIndexTable >; -using ReplDevPtr = shared< homestore::ReplDev >; +using ReplDevPtr = shared< homestore::repl_dev >; using index_cfg_t = homestore::BtreeConfig; -using read_blks_list_t = 
std::vector< std::pair< lba_t, homestore::MultiBlkId > >; +using read_blks_list_t = std::vector< std::pair< lba_t, homestore::multi_blk_id > >; struct vol_read_ctx { - vol_interface_req_ptr vol_req; + const io_req* req; uint32_t blk_size; index_kv_list_t index_kvs{}; }; @@ -62,35 +68,35 @@ struct MsgHeader { } }; -class VolumeMetrics : public sisl::MetricsGroupWrapper { +class VolumeMetrics : public sisl::MetricsGroup { public: - explicit VolumeMetrics(const std::string& vol_name) : sisl::MetricsGroupWrapper("Volume", vol_name) { + explicit VolumeMetrics(const std::string& vol_name) : sisl::MetricsGroup("volume", vol_name) { // counters - REGISTER_COUNTER(volume_read_count, "Total Volume read operations", "volume_op_count", {"op", "read"}); - REGISTER_COUNTER(volume_write_count, "Total Volume write operations", "volume_op_count", {"op", "write"}); - REGISTER_COUNTER(volume_write_size_total, "Total Volume data size written", "volume_data_size", + REGISTER_COUNTER(volume_read_count, "Total volume read operations", "volume_op_count", {"op", "read"}); + REGISTER_COUNTER(volume_write_count, "Total volume write operations", "volume_op_count", {"op", "write"}); + REGISTER_COUNTER(volume_write_size_total, "Total volume data size written", "volume_data_size", {"op", "write"}); - REGISTER_COUNTER(volume_read_size_total, "Total Volume data size read", "volume_data_size", {"op", "read"}); + REGISTER_COUNTER(volume_read_size_total, "Total volume data size read", "volume_data_size", {"op", "read"}); // gauges - REGISTER_GAUGE(volume_data_used_size, "Total Volume data used size"); + REGISTER_GAUGE(volume_data_used_size, "Total volume data used size"); // histograms REGISTER_HISTOGRAM(volume_write_size_distribution, "Distribution of volume write sizes", HistogramBucketsType(OpSizeBuckets)); REGISTER_HISTOGRAM(volume_read_size_distribution, "Distribution of volume read sizes", HistogramBucketsType(OpSizeBuckets)); - REGISTER_HISTOGRAM(volume_read_latency, "Volume overall read 
latency", "volume_op_latency", {"op", "read"}, + REGISTER_HISTOGRAM(volume_read_latency, "volume overall read latency", "volume_op_latency", {"op", "read"}, HistogramBucketsType(OpLatecyBuckets)); - REGISTER_HISTOGRAM(volume_write_latency, "Volume overall write latency", "volume_op_latency", {"op", "write"}, + REGISTER_HISTOGRAM(volume_write_latency, "volume overall write latency", "volume_op_latency", {"op", "write"}, HistogramBucketsType(OpLatecyBuckets)); - REGISTER_HISTOGRAM(volume_data_read_latency, "Volume data blocks read latency", "volume_data_op_latency", + REGISTER_HISTOGRAM(volume_data_read_latency, "volume data blocks read latency", "volume_data_op_latency", {"op", "read"}, HistogramBucketsType(OpLatecyBuckets)); - REGISTER_HISTOGRAM(volume_data_write_latency, "Volume data blocks write latency", "volume_data_op_latency", + REGISTER_HISTOGRAM(volume_data_write_latency, "volume data blocks write latency", "volume_data_op_latency", {"op", "write"}, HistogramBucketsType(OpLatecyBuckets)); - REGISTER_HISTOGRAM(volume_map_read_latency, "Volume mapping read latency", "volume_map_op_latency", + REGISTER_HISTOGRAM(volume_map_read_latency, "volume mapping read latency", "volume_map_op_latency", {"op", "read"}, HistogramBucketsType(OpLatecyBuckets)); - REGISTER_HISTOGRAM(volume_map_write_latency, "Volume mapping write latency", "volume_map_op_latency", + REGISTER_HISTOGRAM(volume_map_write_latency, "volume mapping write latency", "volume_map_op_latency", {"op", "write"}, HistogramBucketsType(OpLatecyBuckets)); - REGISTER_HISTOGRAM(volume_journal_write_latency, "Volume journal write latency", "volume_journal_op_latency", + REGISTER_HISTOGRAM(volume_journal_write_latency, "volume journal write latency", "volume_journal_op_latency", {"op", "write"}, HistogramBucketsType(OpLatecyBuckets)); register_me_to_farm(); @@ -106,7 +112,7 @@ class VolumeMetrics : public sisl::MetricsGroupWrapper { void on_gather(); }; -class Volume : public std::enable_shared_from_this< Volume 
> { +class volume : public std::enable_shared_from_this< volume > { public: inline static auto const VOL_META_NAME = std::string("Volume2"); // different from old releae; private: @@ -123,7 +129,7 @@ class Volume : public std::enable_shared_from_this< Volume > { uint64_t size; // privisioned size in bytes of volume; volume_id_t id; char name[VOL_NAME_SIZE]; - vol_state state{vol_state::INIT}; + volume_state state{volume_state::INIT}; uint64_t ordinal; // Id unique to local homeblk instance. uint32_t pdev_id; // All chunks for this volume allocated from this physical dev. uint32_t num_chunks; @@ -152,46 +158,47 @@ class Volume : public std::enable_shared_from_this< Volume > { } homestore::chunk_num_t* get_chunk_ids_mutable() { - return r_cast< homestore::chunk_num_t* >(uintptr_cast(this) + sizeof(vol_sb_t)); + return reinterpret_cast< homestore::chunk_num_t* >(reinterpret_cast< uint8_t* >(this) + sizeof(vol_sb_t)); } const homestore::chunk_num_t* get_chunk_ids() const { - return r_cast< const homestore::chunk_num_t* >(reinterpret_cast< const uint8_t* >(this) + sizeof(vol_sb_t)); + return reinterpret_cast< const homestore::chunk_num_t* >(reinterpret_cast< const uint8_t* >(this) + + sizeof(vol_sb_t)); } }; public: - explicit Volume(VolumeInfo&& info, shared< VolumeChunkSelector > vol_chunk_sel, + explicit volume(volume_info&& info, shared< VolumeChunkSelector > vol_chunk_sel, shared< VolumeChunkSelector > index_chunk_sel) : sb_{VOL_META_NAME}, volume_chunk_selector_{vol_chunk_sel}, index_chunk_selector_{index_chunk_sel} { - vol_info_ = std::make_shared< VolumeInfo >(info.id, info.size_bytes, info.page_size, info.name, info.ordinal); + vol_info_ = std::make_shared< volume_info >(info.id, info.size_bytes, info.page_size, info.name, info.ordinal); metrics_ = std::make_unique< VolumeMetrics >(vol_info_->name); } - explicit Volume(sisl::byte_view const& buf, void* cookie, shared< VolumeChunkSelector > vol_chunk_sel, + explicit volume(sisl::byte_view const& buf, void* 
cookie, shared< VolumeChunkSelector > vol_chunk_sel, shared< VolumeChunkSelector > index_chunk_sel); - Volume(Volume const& volume) = delete; - Volume(Volume&& volume) = default; - Volume& operator=(Volume const& volume) = delete; - Volume& operator=(Volume&& volume) = default; + volume(volume const& volume) = delete; + volume(volume&& volume) = default; + volume& operator=(volume const& volume) = delete; + volume& operator=(volume&& volume) = default; - virtual ~Volume() = default; + virtual ~volume() = default; // static APIs exposed to HomeBlks Implementation Layer; - static VolumePtr make_volume(sisl::byte_view const& buf, void* cookie, - shared< VolumeChunkSelector > volume_chunk_sel, - shared< VolumeChunkSelector > index_chunk_sel) { - auto vol = std::make_shared< Volume >(buf, cookie, volume_chunk_sel, index_chunk_sel); + static volume_handle make_volume(sisl::byte_view const& buf, void* cookie, + shared< VolumeChunkSelector > volume_chunk_sel, + shared< VolumeChunkSelector > index_chunk_sel) { + auto vol = std::make_shared< volume >(buf, cookie, volume_chunk_sel, index_chunk_sel); auto ret = vol->init(true /*is_recovery*/); return ret ? vol : nullptr; } - void get_stats(VolumeStats& stats) const { + void get_stats(volume_stats& stats) const { stats.id = vol_info_->id; stats.state = sb_->state; } - static VolumePtr make_volume(VolumeInfo&& info, shared< VolumeChunkSelector > volume_chunk_sel, - shared< VolumeChunkSelector > index_chunk_sel) { - auto vol = std::make_shared< Volume >(std::move(info), volume_chunk_sel, index_chunk_sel); + static volume_handle make_volume(volume_info&& info, shared< VolumeChunkSelector > volume_chunk_sel, + shared< VolumeChunkSelector > index_chunk_sel) { + auto vol = std::make_shared< volume >(std::move(info), volume_chunk_sel, index_chunk_sel); auto ret = vol->init(false /* is_recovery */); // in failure case, volume shared ptr will be destroyed automatically; return ret ? 
vol : nullptr; @@ -201,9 +208,9 @@ class Volume : public std::enable_shared_from_this< Volume > { volume_id_t id() const { return vol_info_->id; }; uint64_t ordinal() const { return vol_info_->ordinal; } std::string id_str() const { return boost::uuids::to_string(vol_info_->id); }; - ReplDevPtr rd() const { return rd_; } + const ReplDevPtr& rd() const { return rd_; } - VolumeInfoPtr info() const { return vol_info_; } + volume_info_ptr info() const { return vol_info_; } std::string to_string() { return vol_info_->to_string(); } @@ -213,17 +220,17 @@ class Volume : public std::enable_shared_from_this< Volume > { VolIdxTablePtr init_index_table(bool is_recovery, VolIdxTablePtr tbl = nullptr); uint64_t get_index_size(); - bool is_online() const { return m_state_.load() == vol_state::ONLINE; } + bool is_online() const { return m_state_.load() == volume_state::ONLINE; } - void destroy(); - bool is_destroying() const { return m_state_.load() == vol_state::DESTROYING; } + sisl::async::task< void > destroy(); + bool is_destroying() const { return m_state_.load() == volume_state::DESTROYING; } bool is_destroy_started() const { return destroy_started_.load(); } - bool is_offline() const { return m_state_.load() == vol_state::OFFLINE; } + bool is_offline() const { return m_state_.load() == volume_state::OFFLINE; } // // This API will be called to set the volume state and persist to disk; // - void state_change(vol_state s) { + void state_change(volume_state s) { if (sb_->state != s) { sb_->state = s; sb_.write(); @@ -231,12 +238,9 @@ class Volume : public std::enable_shared_from_this< Volume > { } } - VolumeManager::NullAsyncResult write(const vol_interface_req_ptr& vol_req); + async_status write(io_req& vol_req); - VolumeManager::Result< folly::Unit > write_to_index(lba_t start_lba, lba_t end_lba, - std::unordered_map< lba_t, BlockInfo >& blocks_info); - - VolumeManager::NullAsyncResult read(const vol_interface_req_ptr& req); + async_status read(io_req& req); // // if 
destroy_started_ is true, it means volume destroy has started and we should not call remove again; @@ -259,36 +263,46 @@ class Volume : public std::enable_shared_from_this< Volume > { // bool init(bool is_recovery); - VolumeManager::NullResult verify_checksum(vol_read_ctx const& read_ctx); + status verify_checksum(vol_read_ctx const& read_ctx); - void submit_read_to_backend(read_blks_list_t const& blks_to_read, const vol_interface_req_ptr& req, - std::vector< folly::Future< std::error_code > >& futs); + void submit_read_to_backend(read_blks_list_t const& blks_to_read, const io_req& req, + std::vector< sisl::async::task< iomgr::io_result > >& futs, + std::vector< std::unique_ptr< sisl::sg_list > >& sgs_keepalive); void generate_blkids_to_read(const index_kv_list_t& index_kvs, read_blks_list_t& blks_to_read); - VolumeManager::NullResult read_from_index(const vol_interface_req_ptr& req, index_kv_list_t& index_kvs); - private: - VolumeInfoPtr vol_info_; // volume info - ReplDevPtr rd_; // replication device for this volume, which provides read/write APIs to the volume; - VolIdxTablePtr indx_tbl_; // index table for this volume - superblk< vol_sb_t > sb_; // meta data of the volume + volume_info_ptr vol_info_; // volume info + ReplDevPtr rd_; // replication device for this volume, which provides read/write APIs to the volume; + VolIdxTablePtr indx_tbl_; // index table for this volume + superblk< vol_sb_t > sb_; // meta data of the volume shared< VolumeChunkSelector > volume_chunk_selector_; // volume chunk selector. shared< VolumeChunkSelector > index_chunk_selector_; // index chunk selector. sisl::atomic_counter< uint64_t > outstanding_reqs_{0}; // number of outstanding requests std::atomic< bool > destroy_started_{ false}; // indicates if volume destroy has started, avoid destroy to be executed more than once. 
- std::atomic< vol_state > m_state_; // in-memory sb state, avoid taking lock in IO path; + std::atomic< volume_state > m_state_; // in-memory sb state, avoid taking lock in IO path; std::unique_ptr< VolumeMetrics > metrics_; }; +// RAII: marks an IO in-flight on its volume for the op's duration -- so destroy()/shutdown() wait for it (and +// the volume stays alive). Replaces the refcount the old heap-allocated request carried; lives in the +// async_read/async_write coroutine frame. +struct vol_io_guard { + volume_handle vol; + explicit vol_io_guard(volume_handle v) : vol(std::move(v)) { vol->inc_ref(); } + ~vol_io_guard() { vol->dec_ref(); } + vol_io_guard(vol_io_guard const&) = delete; + vol_io_guard& operator=(vol_io_guard const&) = delete; +}; + struct vol_repl_ctx : public homestore::repl_req_ctx { sisl::io_blob_safe hdr_buf_; sisl::io_blob_safe key_buf_; vol_repl_ctx(uint32_t hdr_extn_size, uint32_t key_size = 0) : homestore::repl_req_ctx{} { - hdr_buf_ = std::move(sisl::io_blob_safe{uint32_cast(sizeof(MsgHeader) + hdr_extn_size), 0}); + hdr_buf_ = std::move(sisl::io_blob_safe{static_cast< uint32_t >(sizeof(MsgHeader) + hdr_extn_size), 0}); new (hdr_buf_.bytes()) MsgHeader(); if (key_size) { key_buf_ = std::move(sisl::io_blob_safe{key_size, 0}); } @@ -300,10 +314,10 @@ struct vol_repl_ctx : public homestore::repl_req_ctx { template < typename T > T* to() { - return r_cast< T* >(this); + return reinterpret_cast< T* >(this); } - MsgHeader* header() { return r_cast< MsgHeader* >(hdr_buf_.bytes()); } + MsgHeader* header() { return reinterpret_cast< MsgHeader* >(hdr_buf_.bytes()); } uint8_t* header_extn() { return hdr_buf_.bytes() + sizeof(MsgHeader); } sisl::io_blob_safe& header_buf() { return hdr_buf_; } @@ -315,8 +329,12 @@ struct vol_repl_ctx : public homestore::repl_req_ctx { template < typename T > struct repl_result_ctx : public vol_repl_ctx { - folly::Promise< T > promise_; - VolumePtr vol_ptr_{nullptr}; + // Cross-thread single-shot completion for the 
journal write. The producer + // (HomeBlocksImpl::on_write, on the commit thread) calls promise_.complete(value); the consumer + // (volume::write coroutine) co_awaits promise_ directly. This ctx is heap-allocated and kept alive by an + // intrusive_ptr through completion, satisfying value_awaitable's stable-address / outlive-the-await rule. + sisl::async::value_awaitable< T > promise_; + volume_handle vol_ptr_{nullptr}; template < typename... Args > static intrusive< repl_result_ctx< T > > make(Args&&... args) { @@ -324,7 +342,6 @@ struct repl_result_ctx : public vol_repl_ctx { } repl_result_ctx(uint32_t hdr_extn_size, uint32_t key_size = 0) : vol_repl_ctx{hdr_extn_size, key_size} {} - folly::SemiFuture< T > result() { return promise_.getSemiFuture(); } }; } // namespace homeblocks diff --git a/src/lib/volume/volume_chunk_selector.cpp b/src/lib/volume/volume_chunk_selector.cpp index 010048e..ab89d4a 100644 --- a/src/lib/volume/volume_chunk_selector.cpp +++ b/src/lib/volume/volume_chunk_selector.cpp @@ -13,7 +13,7 @@ * *********************************************************************************/ #include "volume_chunk_selector.hpp" -#include +#include "hb_internal.hpp" #include namespace homeblocks { @@ -122,7 +122,7 @@ homestore::cshared< Chunk > VolumeChunkSelector::select_chunk(homestore::blk_cou #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("vol_num_chunks_force_resize_op")) { // this is to simulate no blks available. - LOGI("Volume resize op flip is set."); + LOGI("volume resize op flip is set."); resize_volume_num_chunks(nblks, volc); } #endif @@ -141,10 +141,9 @@ homestore::cshared< Chunk > VolumeChunkSelector::select_chunk(homestore::blk_cou // virtual_dev will call select_chunk again. 
uint64_t num_active_chunks = volc->num_active_chunks; for (uint64_t i = 0; i < num_active_chunks; i++) { - if (*volc->m_next_chunk_index >= num_active_chunks) { *volc->m_next_chunk_index = 0; } - - auto chunk = volc->m_chunks[*volc->m_next_chunk_index]; - *volc->m_next_chunk_index = ((*volc->m_next_chunk_index) + 1); + // Lock-free round-robin over the active chunks (shared atomic cursor). + auto const idx = volc->m_next_chunk_index.fetch_add(1, std::memory_order_relaxed) % num_active_chunks; + auto chunk = volc->m_chunks[idx]; if (chunk && chunk->available_blks() > 0) { return chunk->get_internal_chunk(); } } @@ -177,7 +176,7 @@ void VolumeChunkSelector::resize_volume_num_chunks(homestore::blk_count_t nblks, #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("vol_num_chunks_force_resize_op")) { // this is to simulate no blks available. - LOGI("Volume resize op flip is set."); + LOGI("volume resize op flip is set."); force_resize = true; } #endif @@ -268,7 +267,7 @@ bool VolumeChunkSelector::recover_chunks(uint64_t volume_ordinal, uint32_t pdev, const std::vector< chunk_num_t >& chunk_ids) { std::lock_guard lock(m_chunk_sel_mutex); auto volc = m_volume_chunks[volume_ordinal]; - RELEASE_ASSERT(!volc, "Volume already exists"); + RELEASE_ASSERT(!volc, "volume already exists"); auto chunk_size = m_all_chunks.begin()->second->size(); volc = std::make_shared< VolumeChunksInfo >(); @@ -312,7 +311,7 @@ void VolumeChunkSelector::release_chunks(uint64_t volume_ordinal) { std::string str; uint64_t count = 0; auto volc = m_volume_chunks[volume_ordinal]; - RELEASE_ASSERT(volc, "Volume doesnt exists"); + RELEASE_ASSERT(volc, "volume doesnt exists"); for (auto chunk : volc->m_chunks) { if (chunk) { @@ -384,4 +383,4 @@ std::string VolumeChunkSelector::dump_chunks() const { return str; } -} // namespace homeblocks \ No newline at end of file +} // namespace homeblocks diff --git a/src/lib/volume/volume_chunk_selector.hpp b/src/lib/volume/volume_chunk_selector.hpp index 
282c49f..3ad2c3c 100644 --- a/src/lib/volume/volume_chunk_selector.hpp +++ b/src/lib/volume/volume_chunk_selector.hpp @@ -14,13 +14,13 @@ *********************************************************************************/ #pragma once +#include #include -#include -#include -#include +#include +#include #include #include -#include "homeblks/common.hpp" +#include "hb_internal.hpp" namespace homeblocks { @@ -46,11 +46,12 @@ class VolumeChunkSelector : public homestore::ChunkSelector { // max_num_chunks is total chunks possible for whole volume // size. num_active_chunks is the number of chunks which is - // used for allocation. next_chunk_index is thread local - // which does round robin on the active chunks + // used for allocation. m_next_chunk_index is a shared atomic cursor + // that round-robins (lock-free) over the active chunks; select_chunk() + // runs without the selector mutex, so this must be atomic. uint64_t max_num_chunks; std::atomic< uint64_t > num_active_chunks{0}; - folly::ThreadLocal< uint32_t > m_next_chunk_index; + std::atomic< uint32_t > m_next_chunk_index{0}; uint64_t ordinal; uint32_t pdev; }; diff --git a/src/lib/volume_mgr.cpp b/src/lib/volume_mgr.cpp index 3d43a35..9b2a2d7 100644 --- a/src/lib/volume_mgr.cpp +++ b/src/lib/volume_mgr.cpp @@ -15,15 +15,15 @@ *********************************************************************************/ #include #include -#include +#include #include "volume/volume.hpp" #include "homeblks_impl.hpp" +#include "coro_helpers.hpp" namespace homeblocks { -std::shared_ptr< VolumeManager > HomeBlocksImpl::volume_manager() { return shared_from_this(); } void HomeBlocksImpl::on_vol_meta_blk_found(sisl::byte_view const& buf, void* cookie) { - auto vol_ptr = Volume::make_volume(buf, cookie, volume_chunk_selector_, index_chunk_selector_); + auto vol_ptr = volume::make_volume(buf, cookie, volume_chunk_selector_, index_chunk_selector_); auto id = vol_ptr->id(); { @@ -47,8 +47,9 @@ void 
HomeBlocksImpl::on_vol_meta_blk_found(sisl::byte_view const& buf, void* coo if (vol_ptr->is_destroying()) { // resume volume destroying; - LOGINFO("Volume {} is in destroying state, resume destroy", vol_ptr->id_str()); - remove_volume(id); + LOGINFO("volume {} is in destroying state, resume destroy", vol_ptr->id_str()); + // fire-and-forget: remove_volume is a coroutine whose work is scheduled on a worker; start it detached. + detail::detach(remove_volume(id)); } } @@ -80,10 +81,10 @@ shared< hs_index_table_t > HomeBlocksImpl::recover_index_table(homestore::superb // 3. now we have a race that allow create volume to go through and graceful shutdown also happen in parallel which will // cause crash; // -VolumeManager::NullAsyncResult HomeBlocksImpl::create_volume(VolumeInfo&& vol_info) { +async_result< volume_handle > HomeBlocksImpl::create_volume(volume_info vol_info) { if (is_restricted()) { LOGE("Can't serve volume create, System is in restricted mode."); - return std::unexpected(VolumeError::UNSUPPORTED_OP); + co_return std::unexpected(std::errc::operation_not_supported); } inc_ref(); @@ -96,17 +97,17 @@ VolumeManager::NullAsyncResult HomeBlocksImpl::create_volume(VolumeInfo&& vol_in vol_info.ordinal = ordinal_reserver_->reserve(); if (vol_info.ordinal >= MAX_NUM_VOLUMES) { LOGE("No space to create volume with id: {}", boost::uuids::to_string(id)); - return std::unexpected(VolumeError::INTERNAL_ERROR); + co_return std::unexpected(volume_error::INTERNAL_ERROR); } if (auto it = vol_map_.find(id); it != vol_map_.end()) { LOGW("create_volume with input id: {} already exists,", boost::uuids::to_string(id)); dec_ref(); - return std::unexpected(VolumeError::INVALID_ARG); + co_return std::unexpected(std::errc::invalid_argument); } } - auto vol_ptr = Volume::make_volume(std::move(vol_info), volume_chunk_selector_, index_chunk_selector_); + auto vol_ptr = volume::make_volume(std::move(vol_info), volume_chunk_selector_, index_chunk_selector_); if (vol_ptr) { auto lg = 
std::scoped_lock(vol_lock_); vol_map_.emplace(std::make_pair(id, vol_ptr)); @@ -114,91 +115,95 @@ VolumeManager::NullAsyncResult HomeBlocksImpl::create_volume(VolumeInfo&& vol_in } else { LOGE("failed to create volume with id: {}", boost::uuids::to_string(id)); dec_ref(); - return std::unexpected(VolumeError::INTERNAL_ERROR); + co_return std::unexpected(volume_error::INTERNAL_ERROR); } dec_ref(); - return NullResult(); + co_return vol_ptr; } // // Why we don't need do ref_cnt for remove_volume: // vol in destroying state already indicates an outstanding volume which consumed in no_outstanding_vols() API; // -VolumeManager::NullAsyncResult HomeBlocksImpl::remove_volume(const volume_id_t& id) { - if (is_restricted()) { - LOGE("Can't serve volume remove, System is in restricted mode."); - return std::unexpected(VolumeError::UNSUPPORTED_OP); +// Coroutine that performs the actual volume teardown on a worker reactor. It co_awaits volume::destroy() +// (which co_awaits the repl-dev removal and a forced index CP flush), so the reactor yields during those waits +// instead of parking -- which is what lets the CP flush complete. Launched via detach from remove_volume. +sisl::async::task< void > HomeBlocksImpl::do_remove_volume(volume_id_t id) { + // 1. 
get the volume ptr from the map; + volume_handle vol_ptr = nullptr; + { + auto lg = std::scoped_lock(vol_lock_); + if (auto it = vol_map_.find(id); it != vol_map_.end()) { + vol_ptr = it->second; + } else { + LOGWARN("volume with id {} not found, cannot remove", boost::uuids::to_string(id)); + co_return; + } } - auto vol = lookup_volume(id); - if (vol == nullptr) { - LOGE("Volume with id {} not found, cannot remove", boost::uuids::to_string(id)); - return std::unexpected(VolumeError::INVALID_ARG); - } else if (vol->is_offline()) { - LOGE("Volume {} is offline, cannot remove", vol->id_str()); - return std::unexpected(VolumeError::VOLUME_OFFLINE); - } + vol_ptr->state_change(volume_state::DESTROYING); - LOGINFO("remove_volume with input id: {}", boost::uuids::to_string(id)); - iomanager.run_on_forget(iomgr::reactor_regex::random_worker, [this, id]() { - // 1. get the volume ptr from the map; - VolumePtr vol_ptr = nullptr; + // if vol is already started with destroy or there is any outstanding reqs on the vol, we will not do anything + // on this vol and let reaper thread to handle it + if (vol_ptr->can_remove()) { + // 2. do volume destroy; + co_await vol_ptr->destroy(); +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("vol_destroy_crash_simulation")) { + crash_simulated_ = true; + co_return; + } +#endif + // 3. 
remove volume from vol_map; { auto lg = std::scoped_lock(vol_lock_); - if (auto it = vol_map_.find(id); it != vol_map_.end()) { - vol_ptr = it->second; - } else { - LOGWARN("Volume with id {} not found, cannot remove", boost::uuids::to_string(id)); - return NullResult(); - } + vol_map_.erase(vol_ptr->id()); + ordinal_reserver_->unreserve(vol_ptr->info()->ordinal); } - vol_ptr->state_change(vol_state::DESTROYING); + LOGINFO("volume {} ordinal={} removed successfully", vol_ptr->id_str(), vol_ptr->info()->ordinal); + } else { + LOGD("volume {} is in destroying state or has outstanding requests: {}, backing off and wait for GC to " + "cleanup.", + vol_ptr->id_str(), vol_ptr->num_outstanding_reqs()); + } + // volume Destructor will be called after vol_ptr goes out of scope; +} - // if vol is already started with destroy or there is any outstanding reqs on the vol, we will not do anything - // on this vol and let reaper thread to handle it - if (vol_ptr->can_remove()) { - // 2. do volume destroy; - vol_ptr->destroy(); -#ifdef _PRERELEASE - if (iomgr_flip::instance()->test_flip("vol_destroy_crash_simulation")) { - crash_simulated_ = true; - return NullResult(); - } -#endif - // 3. 
remove volume from vol_map; - { - auto lg = std::scoped_lock(vol_lock_); - vol_map_.erase(vol_ptr->id()); - ordinal_reserver_->unreserve(vol_ptr->info()->ordinal); - } +async_status HomeBlocksImpl::remove_volume(const volume_id_t& id) { + if (is_restricted()) { + LOGE("Can't serve volume remove, System is in restricted mode."); + co_return std::unexpected(std::errc::operation_not_supported); + } - LOGINFO("Volume {} ordinal={} removed successfully", vol_ptr->id_str(), vol_ptr->info()->ordinal); - } else { - if (vol_ptr) { - LOGD("Volume {} is in destroying state or has outstanding requests: {}, backing off and wait for GC to " - "cleanup.", - vol_ptr->id_str(), vol_ptr->num_outstanding_reqs()); - } else { - LOGWARN("Volume with id {} not found, cannot remove", boost::uuids::to_string(id)); - } - } - // Volume Destructor will be called after vol_ptr goes out of scope; - return NullResult(); - }); + auto vol = get_volume(id); + if (!vol) { + LOGE("volume with id {} not found, cannot remove", boost::uuids::to_string(id)); + co_return std::unexpected(std::errc::invalid_argument); + } else if ((*vol)->is_offline()) { + LOGE("volume {} is offline, cannot remove", (*vol)->id_str()); + co_return std::unexpected(volume_error::OFFLINE); + } - return NullResult(); + LOGINFO("remove_volume with input id: {}", boost::uuids::to_string(id)); + // volume::destroy() co_awaits a forced CP flush (IndexTable::destroy) and the repl-dev removal, so it must + // run as a coroutine on the worker reactor (detach) -- a blocking callable would park the reactor and the + // flush it awaits could never run. See do_remove_volume. 
+ iomanager.run_on_forget(iomgr::reactor_regex::random_worker, + [this, id]() { detail::detach(do_remove_volume(id)); }); + + co_return ok(); } -VolumePtr HomeBlocksImpl::lookup_volume(const volume_id_t& id) { +result< volume_handle > HomeBlocksImpl::get_volume(const volume_id_t& id) const { auto lg = std::shared_lock(vol_lock_); if (auto it = vol_map_.find(id); it != vol_map_.end()) { return it->second; } - return nullptr; + return std::unexpected(volume_error::UNKNOWN_VOLUME); } void HomeBlocksImpl::update_vol_sb_cb(uint64_t volume_ordinal, const std::vector< chunk_num_t >& chunk_ids) { - VolumePtr vol_ptr = nullptr; + volume_handle vol_ptr = nullptr; { auto lg = std::shared_lock(vol_lock_); for (auto it = vol_map_.begin(); it != vol_map_.end(); it++) { @@ -209,109 +214,102 @@ void HomeBlocksImpl::update_vol_sb_cb(uint64_t volume_ordinal, const std::vector } } - RELEASE_ASSERT(vol_ptr != nullptr, "Volume not found"); + RELEASE_ASSERT(vol_ptr != nullptr, "volume not found"); vol_ptr->update_vol_sb_cb(chunk_ids); } -bool HomeBlocksImpl::get_stats(volume_id_t id, VolumeStats& stats) const { +result< volume_stats > HomeBlocksImpl::get_stats(volume_id_t id) const { auto lg = std::shared_lock(vol_lock_); auto it = vol_map_.find(id); if (it == vol_map_.end()) { - LOGE("Volume with id {} not found, cannot get stats", boost::uuids::to_string(id)); - return false; + LOGE("volume with id {} not found, cannot get stats", boost::uuids::to_string(id)); + return std::unexpected(volume_error::UNKNOWN_VOLUME); } + volume_stats stats; it->second->get_stats(stats); - return true; + return stats; } -void HomeBlocksImpl::get_volume_ids(std::vector< volume_id_t >& vol_ids) const { +std::vector< volume_id_t > HomeBlocksImpl::volume_ids() const { auto lg = std::shared_lock(vol_lock_); + std::vector< volume_id_t > vol_ids; + vol_ids.reserve(vol_map_.size()); for (const auto& it : vol_map_) { vol_ids.push_back(it.first); } + return vol_ids; } -VolumeManager::NullAsyncResult 
HomeBlocksImpl::write(const VolumePtr& vol, const vol_interface_req_ptr& req) { - if (is_restricted()) { - LOGE("Can't serve write, System is in restricted mode."); - return std::unexpected(VolumeError::UNSUPPORTED_OP); - } else if (vol->is_offline()) { - LOGE("Can't serve write, Volume {} is offline.", vol->id_str()); - return std::unexpected(VolumeError::VOLUME_OFFLINE); +// ======================= public data-plane: free functions over a volume_handle ======================= +// (declared in homeblks/home_blocks.hpp). Each guards against a system/volume that cannot serve IO, converts the +// byte address to the volume's block granularity, builds an internal vol_interface_req, and drives +// volume::read/write. read/write resolve to the byte count transferred (sgs.size); unmap is currently a no-op. + +// Shared guard. Returns the volume's block size on success, or the rejecting volume_error. +static result< uint32_t > io_precheck(volume_handle const& vol, uint64_t addr, uint64_t len, const char* op) { + auto hb = HomeBlocksImpl::instance(); + if (hb->is_restricted()) { + LOGE("Can't serve {}, system is in restricted mode.", op); + return std::unexpected(std::errc::operation_not_supported); } - - if (vol->is_destroying() || is_shutting_down()) { - LOGE( - "Can't serve write, Volume {} is_destroying: {} is either in destroying state or System is shutting down. 
", - vol->id_str(), vol->is_destroying()); - return std::unexpected(VolumeError::UNSUPPORTED_OP); + if (vol->is_offline()) { + LOGE("Can't serve {}, volume {} is offline.", op, vol->id_str()); + return std::unexpected(volume_error::OFFLINE); } + if (vol->is_destroying() || hb->is_shutting_down()) { + LOGE("Can't serve {}, volume {} is destroying or system is shutting down.", op, vol->id_str()); + return std::unexpected(std::errc::operation_not_supported); + } + auto const blk_size = vol->rd()->get_blk_size(); + if (addr % blk_size != 0 || len % blk_size != 0) { + LOGE("Can't serve {} on volume {}: addr {} / len {} not block-aligned ({}B).", op, vol->id_str(), addr, len, + blk_size); + return std::unexpected(std::errc::invalid_argument); + } + return blk_size; +} +async_result< size_t > async_write(volume_handle const& vol, uint64_t addr, sisl::sg_list sgs) { + auto blk_size = io_precheck(vol, addr, sgs.size, "write"); + if (!blk_size) co_return std::unexpected(blk_size.error()); + // TODO: scatter-gather (multi-iov) not yet plumbed through the write pipeline; single contiguous buffer only. + if (sgs.iovs.size() != 1) co_return std::unexpected(std::errc::invalid_argument); #ifdef _PRERELEASE - if (delay_fake_io(vol)) { - // If we are delaying IO, we return immediately without calling vol->write - // and let the delay flip handle the completion later. 
- return NullResult(); - } + if (HomeBlocksImpl::instance()->delay_fake_io(vol)) { co_return sgs.size; } // delayed; completes via flip later #endif - return vol->write(req); + vol_io_guard guard{vol}; // marks this IO in-flight on the volume (and keeps it alive) for the op's duration + io_req req{.buffer = static_cast< uint8_t* >(sgs.iovs[0].iov_base), + .lba = addr / *blk_size, + .nlbas = static_cast< lba_count_t >(sgs.size / *blk_size)}; + if (auto st = co_await vol->write(req); !st) { co_return std::unexpected(st.error()); } + co_return sgs.size; } -VolumeManager::NullAsyncResult HomeBlocksImpl::read(const VolumePtr& vol, const vol_interface_req_ptr& req) { - if (is_restricted()) { - LOGE("Can't serve read, System is in restricted mode."); - return std::unexpected(VolumeError::UNSUPPORTED_OP); - } else if (vol->is_offline()) { - LOGE("Can't serve read, Volume {} is offline.", vol->id_str()); - return std::unexpected(VolumeError::VOLUME_OFFLINE); - } - - if (vol->is_destroying() || is_shutting_down()) { - LOGE("Can't serve read, Volume {} is_destroying: {} is either in destroying state or System is shutting down. ", - vol->id_str(), vol->is_destroying()); - return std::unexpected(VolumeError::UNSUPPORTED_OP); - } - +async_result< size_t > async_read(volume_handle const& vol, uint64_t addr, sisl::sg_list sgs) { + auto blk_size = io_precheck(vol, addr, sgs.size, "read"); + if (!blk_size) co_return std::unexpected(blk_size.error()); + if (sgs.iovs.size() != 1) co_return std::unexpected(std::errc::invalid_argument); #ifdef _PRERELEASE - if (delay_fake_io(vol)) { - // If we are delaying IO, we return immediately without calling vol->read - // and let the delay flip handle the completion later. 
- return NullResult(); - } + if (HomeBlocksImpl::instance()->delay_fake_io(vol)) { co_return sgs.size; } #endif - return vol->read(req); + vol_io_guard guard{vol}; // marks this IO in-flight on the volume (and keeps it alive) for the op's duration + io_req req{.buffer = static_cast< uint8_t* >(sgs.iovs[0].iov_base), + .lba = addr / *blk_size, + .nlbas = static_cast< lba_count_t >(sgs.size / *blk_size)}; + if (auto st = co_await vol->read(req); !st) { co_return std::unexpected(st.error()); } + co_return sgs.size; } -VolumeManager::NullAsyncResult HomeBlocksImpl::unmap(const VolumePtr& vol, const vol_interface_req_ptr& req) { +async_status async_unmap(volume_handle const& vol, uint64_t addr, uint64_t len) { LOGWARN("Unmap to vol: {} not implemented", vol->id_str()); - - if (is_restricted()) { - LOGE("Can't serve unmap, System is in restricted mode."); - return std::unexpected(VolumeError::UNSUPPORTED_OP); - } else if (vol->is_offline()) { - LOGE("Can't serve unmap, Volume {} is offline.", vol->id_str()); - return std::unexpected(VolumeError::VOLUME_OFFLINE); - } - - if (vol->is_destroying() || is_shutting_down()) { - LOGE( - "Can't serve unmap, Volume {} is_destroying: {} is either in destroying state or System is shutting down. ", - vol->id_str(), vol->is_destroying()); - return std::unexpected(VolumeError::UNSUPPORTED_OP); - } - - return NullResult(); + auto blk_size = io_precheck(vol, addr, len, "unmap"); + if (!blk_size) co_return std::unexpected(blk_size.error()); + co_return ok(); } -// -// we have to allow submit_io_batch even though a volume is in destroying state, because destroy relies on outstanding -// IOs to decrease to zero to proceed, e.g. 
submit_io_batch will allow outstanding io to complete; -// -void HomeBlocksImpl::submit_io_batch() { homestore::data_service().submit_io_batch(); } - void HomeBlocksImpl::on_write(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - const std::vector< homestore::MultiBlkId >& new_blkids, + const std::vector< homestore::multi_blk_id >& new_blkids, cintrusive< homestore::repl_req_ctx >& ctx) { // We are not expecting log reply for a graceful restart; @@ -319,17 +317,17 @@ void HomeBlocksImpl::on_write(int64_t lsn, const sisl::blob& header, const sisl: DEBUG_ASSERT(ctx != nullptr || !is_graceful_shutdown(), "repl ctx is null (recovery path) in graceful shutdown scenario, this is not expected!"); - repl_result_ctx< VolumeManager::NullResult >* repl_ctx{nullptr}; - if (ctx) { repl_ctx = boost::static_pointer_cast< repl_result_ctx< VolumeManager::NullResult > >(ctx).get(); } - auto msg_header = r_cast< MsgHeader* >(const_cast< uint8_t* >(header.cbytes())); + repl_result_ctx< status >* repl_ctx{nullptr}; + if (ctx) { repl_ctx = boost::static_pointer_cast< repl_result_ctx< status > >(ctx).get(); } + auto msg_header = reinterpret_cast< MsgHeader* >(const_cast< uint8_t* >(header.cbytes())); // Key contains the list of checksums and old blkids. Before we ack the client // request, we free the old blkid's. Also if its recovery we overwrite the index // with checksum and new blkid's. We need to overwrite index during recovery as all the // index writes may not be flushed to disk during crash. - VolumePtr vol_ptr{nullptr}; - auto journal_entry = r_cast< const VolJournalEntry* >(key.cbytes()); - auto key_buffer = r_cast< const uint8_t* >(journal_entry + 1); + volume_handle vol_ptr{nullptr}; + auto journal_entry = reinterpret_cast< const VolJournalEntry* >(key.cbytes()); + auto key_buffer = reinterpret_cast< const uint8_t* >(journal_entry + 1); if (repl_ctx == nullptr) { // For recovery path repl_ctx and vol_ptr wont be available. 
@@ -343,9 +341,9 @@ void HomeBlocksImpl::on_write(int64_t lsn, const sisl::blob& header, const sisl: lba_t start_lba = journal_entry->start_lba; for (auto& blkid : new_blkids) { for (uint32_t i = 0; i < blkid.blk_count(); i++) { - auto new_bid = BlkId{blkid.blk_num() + i, 1 /* nblks */, blkid.chunk_num()}; - auto csum = *r_cast< const homestore::csum_t* >(key_buffer); - blocks_info.emplace(start_lba + i, BlockInfo{new_bid, BlkId{}, csum}); + auto new_bid = blk_id{blkid.blk_num() + i, 1 /* nblks */, blkid.chunk_num()}; + auto csum = *reinterpret_cast< const homestore::csum_t* >(key_buffer); + blocks_info.emplace(start_lba + i, BlockInfo{new_bid, blk_id{}, csum}); key_buffer += sizeof(homestore::csum_t); } @@ -367,47 +365,41 @@ void HomeBlocksImpl::on_write(int64_t lsn, const sisl::blob& header, const sisl: // because we could have stale log entries which have old blkid's // which may be already freed. for (uint32_t i = 0; i < journal_entry->num_old_blks; i++) { - BlkId old_blkid = *r_cast< const BlkId* >(key_buffer); + blk_id old_blkid = *reinterpret_cast< const blk_id* >(key_buffer); if (repl_ctx == nullptr) { if (homestore::hs()->data_service().is_blk_alloced(old_blkid)) { LOGT("volume write on commit free blk {} start_lba {}", old_blkid, journal_entry->start_lba); - homestore::hs()->data_service().free_blk_now(old_blkid); + // free_blk_now is synchronous and returns a [[nodiscard]] status; this recovery-path free is + // best-effort (matches the prior void-returning behavior), so the result is intentionally ignored. 
+ (void)homestore::hs()->data_service().free_blk_now(old_blkid); } } else { if (homestore::hs()->data_service().is_blk_alloced(old_blkid)) { LOGT("volume write on commit free blk {} start_lba {}", old_blkid, journal_entry->start_lba); - vol_ptr->rd()->async_free_blks(lsn, old_blkid); + // async_free_blks returns a lazy task; on_write is not a coroutine, so fire-and-forget it + // (detach starts it) rather than dropping the un-awaited task (which would never run). + detail::detach(vol_ptr->rd()->async_free_blks(lsn, old_blkid)); } } - key_buffer += sizeof(BlkId); + key_buffer += sizeof(blk_id); } #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("vol_write_crash_after_journal_write")) { // this is to simulate crash during write where both data and journal // is persisted. After recovery log entries are replayed. - LOGINFO("Volume write crash simulation flip is set, aborting"); + LOGINFO("volume write crash simulation flip is set, aborting"); return; } #endif - if (repl_ctx) { repl_ctx->promise_.setValue(NullResult()); } -} - -vol_interface_req::vol_interface_req(uint8_t* const buf, const uint64_t lba, const uint32_t nlbas, VolumePtr vol_ptr) : - buffer(buf), lba(lba), nlbas(nlbas), vol(vol_ptr) { - vol->inc_ref(1); -} - -void intrusive_ptr_release(vol_interface_req* req) { - if (req->refcount.decrement_testz()) { - req->vol->dec_ref(1); - req->free_yourself(); - } + // Deliver the journal-write completion to the awaiting volume::write coroutine; + // value_awaitable::complete resumes it inline on this commit thread. + if (repl_ctx) { repl_ctx->promise_.complete(ok()); } } #ifdef _PRERELEASE -bool HomeBlocksImpl::delay_fake_io(VolumePtr v) { +bool HomeBlocksImpl::delay_fake_io(volume_handle v) { if (iomgr_flip::instance()->delay_flip("vol_fake_io_delay_simulation", [this, v]() mutable { LOGI("Resuming fake IO delay flip is done. 
Do nothing "); v->dec_ref(); diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt new file mode 100644 index 0000000..15def27 --- /dev/null +++ b/src/test/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required (VERSION 3.11) + +# Test-only ublk adapter. Relax pedantic/unused-parameter for the kernel UAPI headers (ublk_cmd.h, +# ublksrv.h) pulled in transitively via ublkpp; keep -Werror/-Wall/-Wextra. +add_flags("-Wno-pedantic -Wno-unused-parameter") + +# ublkpp is a test_requires; if it isn't available (e.g. a tests-disabled build) skip the adapter+CLI. +find_package(UblkPP QUIET) +if (NOT UblkPP_FOUND) + message(STATUS "ublkpp not found; skipping homeblk_ublk adapter/CLI") + return() +endif() + +# Adapter: exposes a homeblocks volume as a ublkpp::ublk_disk. +add_library(homeblk_ublk_disk OBJECT) +target_sources(homeblk_ublk_disk PRIVATE + homeblk_disk.cpp +) +target_link_libraries(homeblk_ublk_disk + ${COMMON_DEPS} + UblkPP::UblkPP +) + +# CLI: brings up homeblocks and exposes a volume as /dev/ublkbN (ublkpp_disk-style). +add_executable(homeblk_ublk) +target_sources(homeblk_ublk PRIVATE + homeblk_ublk_cli.cpp + $ +) +target_link_libraries(homeblk_ublk + ${PROJECT_NAME} + ${COMMON_TEST_DEPS} + UblkPP::UblkPP + -rdynamic +) diff --git a/src/test/homeblk_disk.cpp b/src/test/homeblk_disk.cpp new file mode 100644 index 0000000..6170f2f --- /dev/null +++ b/src/test/homeblk_disk.cpp @@ -0,0 +1,150 @@ +/********************************************************************************* + * Modifications Copyright 2026 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include "homeblk_disk.hpp" + +#include +#include +#include +#include +#include + +#include // iomanager.run_on_forget / iomanager.post_msg_ring / iomgr::reactor_regex +#include +#include + +#include // build_cqe_state_data + sisl::async managed-user-data helpers + +#include "coro_helpers.hpp" // homeblocks::detail::detach (src/lib, on the include path) + +namespace homeblocks::ublk { + +// ublk speaks in 512-byte sectors regardless of the device's logical block size; ublkpp keeps this constant in an +// un-installed internal header, so we redefine the one value we need. +static constexpr uint32_t k_sector_shift = 9; + +// The per-IO worker: started on a worker reactor (so homestore I/O runs on a reactor, as it requires), it +// co_awaits the homeblocks op and -- resuming on the journal-commit reactor when it completes -- hands the result +// straight to the ublk queue's io_uring via IORING_OP_MSG_RING. `state_ud` is the managed-encoded per-IO +// cqe_state pointer; it lands as the target CQE's user_data, and `result` as its res, which is exactly what the +// queue's run_queue_loop reads to resume the per-IO coroutine and call ublksrv_complete_io. Parameters are taken +// by value so they live in the coroutine frame (a worker-reactor closure's captures would dangle across awaits). +static async_status run_hb_io(volume_handle vol, uint64_t addr, sisl::sg_list sgs, bool is_read, int queue_ring_fd, + uint64_t state_ud) { + auto const res = + is_read ? 
co_await async_read(vol, addr, std::move(sgs)) : co_await async_write(vol, addr, std::move(sgs)); + int const result = res.has_value() ? static_cast< int >(res.value()) : -EIO; + + // Post the completion CQE directly onto the queue's ring. We're on the commit reactor (uring-capable), so this + // is one queued SQE that the reactor's poll_once batches out -- no eventfd, no lock, no service loop. + if (auto const r = iomanager.post_msg_ring(queue_ring_fd, state_ud, result); r != 0) { + // Rare: not on a uring reactor (-ENODEV) or SQ full even after a flush (-EAGAIN). The completion MUST + // reach the queue ring or the IO stalls, so retry from a worker reactor (guaranteed uring-capable). + LOGWARN("homeblk_disk: direct msg_ring post returned {}; retrying via a worker reactor", r); + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, [queue_ring_fd, state_ud, result]() { + if (auto const r2 = iomanager.post_msg_ring(queue_ring_fd, state_ud, result); r2 != 0) { + LOGERROR("homeblk_disk: msg_ring post failed on worker reactor ({}); IO {:#x} may stall", r2, + state_ud); + } + }); + } + co_return ok(); +} + +HomeBlkDisk::HomeBlkDisk(std::shared_ptr< home_blocks > instance, volume_handle vol, volume_id_t vol_id, + uint64_t capacity, uint32_t page_size, uint32_t max_tx) : + ublkpp::ublk_disk(), + _instance(std::move(instance)), + _vol(std::move(vol)), + _vol_id(vol_id), + _capacity(capacity), + _page_size(page_size), + _id_str(boost::uuids::to_string(vol_id)) { + if (!_vol) throw std::runtime_error("HomeBlkDisk: null volume handle"); + if (page_size == 0 || (page_size & (page_size - 1)) != 0) + throw std::runtime_error("HomeBlkDisk: page_size must be a power of two"); + if (max_tx < (1u << k_sector_shift)) throw std::runtime_error("HomeBlkDisk: max_tx too small"); + + // homeblocks reads/writes through its own datapath, never the kernel page cache. 
+ _direct_io = true; + + auto const bs_shift = static_cast< uint8_t >(std::countr_zero(page_size)); + auto& p = *params(); + p.basic.logical_bs_shift = bs_shift; + p.basic.physical_bs_shift = bs_shift; + // Leave max_sectors at the base default (DEF_BUF_SIZE >> 9), which tracks the ublk per-tag IO buffer + // (ublkpp's --max_io_size, default 512 KiB): a basic.max_sectors larger than that buffer is rejected by the + // kernel with EINVAL. Only clamp it DOWN if homeblocks' own max transfer (max_tx) is smaller, so we never + // hand homeblocks an IO bigger than it accepts. + p.basic.max_sectors = std::min(p.basic.max_sectors, static_cast< uint32_t >(max_tx >> k_sector_shift)); + p.basic.dev_sectors = capacity >> k_sector_shift; + // The kernel requires the device size to be a whole multiple of max_sectors. + p.basic.dev_sectors -= (p.basic.dev_sectors % p.basic.max_sectors); + + // DISCARD/WRITE_ZEROES map cleanly onto homeblocks async_unmap, but enabling them means getting the + // ublk_param_discard geometry exactly right or the device fails to come up; left disabled for now (the op is + // also rejected defensively in async_iov). + p.types &= ~UBLK_PARAM_TYPE_DISCARD; + + LOGINFO("HomeBlkDisk [vol={}] sectors={} lbs={} max_sectors={}", _id_str, p.basic.dev_sectors, page_size, + p.basic.max_sectors); +} + +HomeBlkDisk::~HomeBlkDisk() = default; + +HomeBlkDisk::prepare_result HomeBlkDisk::prepare(ublksrv_queue const*, int const) { + // No per-queue state: completions arrive via msg_ring straight onto the queue's own ring, dispatched by + // run_queue_loop. We only need one cqe_state pool slot per tag for the per-IO state build_cqe_state_data + // allocates (the state run_queue_loop resumes on completion). 
+ return {.max_sqes_per_io = 1}; +} + +ublkpp::disk_task< int > HomeBlkDisk::async_iov(ublksrv_queue const* q, ublk_io_data const* data, iovec* iovecs, + uint32_t nr_vecs, uint64_t addr) { + auto const op = ublksrv_get_op(data->iod); + if (op == UBLK_IO_OP_FLUSH) co_return 0; // homeblocks IO is durable on completion; nothing to flush + if (op == UBLK_IO_OP_DISCARD || op == UBLK_IO_OP_WRITE_ZEROES) co_return -ENOTSUP; // TODO: async_unmap + + bool const is_read = (op == UBLK_IO_OP_READ); + + // Copy the iovec descriptors into an sg_list (moved into run_hb_io's frame) before the first co_await. iov_base + // points into the kernel-mapped ublk IO buffer and stays valid until ublksrv_complete_io, so homeblocks + // reads/writes it in place -- only the descriptors are copied, not the data. + sisl::sg_list sgs; + sgs.size = 0; + for (uint32_t i = 0; i < nr_vecs; ++i) { + sgs.iovs.push_back(iovecs[i]); + sgs.size += iovecs[i].iov_len; + } + + // Per-IO cqe_state in the tag's pool. `state_ud` is its managed-encoded pointer; the homeblocks completion + // carries it back via msg_ring, and run_queue_loop decodes it to resume `state` (writing state->_result = + // cqe->res, i.e. our IO result) -- all on this queue thread. We touch `state` only here and there: no lock. + auto const sc = ublkpp::build_cqe_state_data(data); + auto* const state = sc.first; + uint64_t const state_ud = sc.second; + int const queue_ring_fd = q->ring_ptr->ring_fd; + + // Launch the homeblocks IO ON a worker reactor -- homestore is reactor-affine; driving it inline on this ublk + // queue thread corrupts the index and loses wakeups. The op runs and completes on reactors, then posts its + // result straight back to THIS queue's ring (see run_hb_io). 
+ iomanager.run_on_forget(iomgr::reactor_regex::random_worker, + [vol = _vol, addr, sgs = std::move(sgs), is_read, queue_ring_fd, state_ud]() { + detail::detach(run_hb_io(vol, addr, sgs, is_read, queue_ring_fd, state_ud)); + }); + + co_return co_await *state; +} + +} // namespace homeblocks::ublk diff --git a/src/test/homeblk_disk.hpp b/src/test/homeblk_disk.hpp new file mode 100644 index 0000000..b29b852 --- /dev/null +++ b/src/test/homeblk_disk.hpp @@ -0,0 +1,70 @@ +/********************************************************************************* + * Modifications Copyright 2026 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +// HomeBlkDisk: a ublkpp leaf disk (ublkpp::ublk_disk) that exposes a single homeblocks volume as a ublk block +// device (/dev/ublkbN). It bridges two async runtimes that live on different threads: +// +// - ublkpp drives each ublk queue on its own pthread with a SINGLE_ISSUER io_uring; a driver's async_iov +// coroutine runs there and its per-IO ublkpp::cqe_state must be resumed ON THE QUEUE THREAD (only it may +// touch the queue's io_uring / call ublksrv_complete_io). +// - homeblocks runs its own iomgr reactors; async_read/async_write must be driven ON a reactor (homestore is +// reactor-affine) and complete on one (the journal-commit "flush" reactor). 
+// +// So each IO is launched on a worker reactor (run_on_forget), and when it completes on a reactor the result is +// handed straight to the ublk queue's io_uring via IORING_OP_MSG_RING (iomanager.post_msg_ring): the kernel posts +// a CQE carrying the per-IO cqe_state pointer + result onto the queue's ring, and the queue thread's +// run_queue_loop reaps it exactly like a native completion and resumes the per-IO coroutine. No eventfd, no +// service loop, no lock -- the per-IO cqe_state is touched only on the queue thread; the kernel does the hop. + +#include +#include +#include + +#include + +#include + +struct iovec; +struct ublk_io_data; +struct ublksrv_queue; + +namespace homeblocks::ublk { + +class HomeBlkDisk : public ublkpp::ublk_disk { +public: + // `instance` keeps homeblocks alive for the disk's lifetime; `vol` is the volume to expose (its geometry -- + // capacity/page_size -- is supplied explicitly because the public API exposes no way to introspect a + // volume_handle). `max_tx` is the largest single transfer (home_blocks::max_vol_io_size()). 
+ HomeBlkDisk(std::shared_ptr< home_blocks > instance, volume_handle vol, volume_id_t vol_id, uint64_t capacity, + uint32_t page_size, uint32_t max_tx); + ~HomeBlkDisk() override; + + std::string id() const noexcept override { return _id_str; } + + prepare_result prepare(ublksrv_queue const* q, int const iouring_device_start) override; + ublkpp::disk_task< int > async_iov(ublksrv_queue const* q, ublk_io_data const* data, iovec* iovecs, + uint32_t nr_vecs, uint64_t addr) override; + +private: + std::shared_ptr< home_blocks > _instance; + volume_handle _vol; + volume_id_t _vol_id; + uint64_t _capacity; + uint32_t _page_size; + std::string _id_str; +}; + +} // namespace homeblocks::ublk diff --git a/src/test/homeblk_ublk_cli.cpp b/src/test/homeblk_ublk_cli.cpp new file mode 100644 index 0000000..a6c7642 --- /dev/null +++ b/src/test/homeblk_ublk_cli.cpp @@ -0,0 +1,206 @@ +/********************************************************************************* + * Modifications Copyright 2026 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +// +// homeblk_ublk: a ublkpp_disk-style CLI that brings up a homeblocks instance, creates (or recovers) a +// volume, and exposes it as a /dev/ublkbN block device via the HomeBlkDisk adapter. Mirrors ublkpp's +// example/ublkpp_disk.cpp -- run it, point fio/dd/mkfs at the printed device path, ^C to tear down. 
+// +// sudo ./homeblk_ublk --device /dev/nvme0n1 --vol_size_mb 4096 +// sudo ./homeblk_ublk --device hb.dev --create_device --dev_size_mb 8192 --data_chunk_size_mb 512 +// +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include "homeblk_disk.hpp" +#include "coro_helpers.hpp" // homeblocks::detail::sync_get + +SISL_OPTION_GROUP( + homeblk_ublk, + (vol_id, "", "vol_id", "Volume UUID to expose (recovered if it exists, else created)", + ::cxxopts::value< std::string >(), ""), + (vol_size_mb, "", "vol_size_mb", "Volume size in MB (when creating)", + ::cxxopts::value< uint64_t >()->default_value("1024"), ""), + (page_size, "", "page_size", "Volume logical block / page size in bytes", + ::cxxopts::value< uint32_t >()->default_value("4096"), ""), + (device, "", "device", "homeblocks backing device(s)", ::cxxopts::value< std::vector< std::string > >(), + "[,...]"), + (create_device, "", "create_device", "Create the backing device(s) as files of --dev_size_mb", + ::cxxopts::value< bool >()->default_value("false"), ""), + (dev_size_mb, "", "dev_size_mb", "Size of each created backing file in MB", + ::cxxopts::value< uint64_t >()->default_value("8192"), ""), + (num_threads, "", "num_threads", "homeblocks iomgr reactor count", + ::cxxopts::value< uint32_t >()->default_value("2"), ""), + (app_mem_size_mb, "", "app_mem_size_mb", "homeblocks memory budget in MB", + ::cxxopts::value< uint64_t >()->default_value("4096"), ""), + (data_chunk_size_mb, "", "data_chunk_size_mb", "homeblocks data chunk size in MB (for small devices)", + ::cxxopts::value< uint32_t >(), ""), + (index_chunk_size_mb, "", "index_chunk_size_mb", "homeblocks index chunk size in MB (for small devices)", + ::cxxopts::value< uint32_t >(), ""), + (device_id, "", "device_id", "ublk device id: -1 to assign, >=0 to recover a preserved device", + ::cxxopts::value< int32_t >()->default_value("-1"), 
"")) + +// `homeblocks` is a logging module (SISL_LOGGING_INIT below), NOT an options group, so it does not +// appear here. homeblocks reads its tunables (data/index_chunk_size_mb) out of the homeblk_ublk group. +// `ublkpp_tgt` is ublkpp's own target option group (nr_hw_queues, feature_zero_copy, ...) -- ublkpp_tgt::run +// reads it, so it must be enabled and loaded here just as ublkpp's own example/ublkpp_disk.cpp does. +#define ENABLED_OPTIONS logging, ublkpp_tgt, homeblk_ublk + +SISL_OPTIONS_ENABLE(ENABLED_OPTIONS) + +SISL_LOGGING_INIT(homeblocks, UBLKPP_LOG_MODS) + +using namespace homeblocks; + +namespace { +constexpr uint64_t Mi = 1024ULL * 1024ULL; + +// Clean shutdown plumbing (same shape as ublkpp_disk.cpp). +std::promise< int > s_stop_code; + +void handle_signal(int sig) { + switch (sig) { + case SIGINT: + [[fallthrough]]; + case SIGTERM: + try { + LOGWARN("SIGNAL: {}", strsignal(sig)); + s_stop_code.set_value(sig); + } catch (std::future_error const& e) { LOGERROR("Failed to set stop code: {}", e.what()); } + break; + default: + LOGERROR("Unhandled SIGNAL: {}", strsignal(sig)); + break; + } +} + +std::vector< std::string > resolve_devices() { + if (0 == SISL_OPTIONS.count("device")) { + LOGERROR("At least one --device is required"); + return {}; + } + auto devices = SISL_OPTIONS["device"].as< std::vector< std::string > >(); + if (SISL_OPTIONS["create_device"].as< bool >()) { + auto const sz = SISL_OPTIONS["dev_size_mb"].as< uint64_t >() * Mi; + for (auto const& d : devices) { + if (std::filesystem::exists(d)) std::filesystem::remove(d); + std::ofstream ofs{d, std::ios::binary | std::ios::out | std::ios::trunc}; + std::filesystem::resize_file(d, sz); + LOGINFO("Created backing file {} ({} MB)", d, sz / Mi); + } + } + return devices; +} + +home_blocks_config make_config(std::vector< std::string > const& devices) { + home_blocks_config cfg; + cfg.threads = SISL_OPTIONS["num_threads"].as< uint32_t >(); + cfg.app_mem_size_mb = 
SISL_OPTIONS["app_mem_size_mb"].as< uint64_t >(); + for (auto const& d : devices) { + cfg.devices.emplace_back(d); + } + auto const id = boost::uuids::random_generator()(); + cfg.on_svc_id = [id]() -> async_result< peer_id_t > { co_return id; }; + return cfg; +} + +// Look up the volume if it already exists, otherwise create it with the requested geometry. +volume_handle open_or_create_volume(std::shared_ptr< home_blocks > const& hb, volume_id_t const& vid, uint64_t capacity, + uint32_t page_size) { + if (auto existing = hb->get_volume(vid); existing) { + LOGINFO("Recovered existing volume {}", boost::uuids::to_string(vid)); + return existing.value(); + } + volume_info info{vid, capacity, page_size, fmt::format("ublk_{}", boost::uuids::to_string(vid).substr(0, 8))}; + auto created = detail::sync_get(hb->create_volume(std::move(info))); + if (!created) { + LOGERROR("create_volume failed: {}", created.error().message()); + return nullptr; + } + LOGINFO("Created volume {} ({} MB, page_size {})", boost::uuids::to_string(vid), capacity / Mi, page_size); + return created.value(); +} +} // namespace + +int main(int argc, char* argv[]) { + SISL_OPTIONS_LOAD(argc, argv, ENABLED_OPTIONS); + sisl::logging::SetLogger(std::string(argv[0])); + spdlog::set_pattern("[%D %T] [%^%l%$] [%n] [%t] %v"); + + signal(SIGINT, handle_signal); + signal(SIGTERM, handle_signal); + auto exit_future = s_stop_code.get_future(); + + auto const devices = resolve_devices(); + if (devices.empty()) { + std::cout << SISL_PARSER.help({}) << std::endl; + return EINVAL; + } + + // Bring up homeblocks (this also starts iomgr internally). + auto hb_res = init_homeblocks(make_config(devices)); + if (!hb_res) { + LOGERROR("init_homeblocks failed: {}", hb_res.error().message()); + return EIO; + } + auto hb = hb_res.value(); + + auto const vid = (0 < SISL_OPTIONS.count("vol_id")) + ? 
boost::uuids::string_generator()(SISL_OPTIONS["vol_id"].as< std::string >()) + : boost::uuids::random_generator()(); + auto const page_size = SISL_OPTIONS["page_size"].as< uint32_t >(); + auto const capacity = SISL_OPTIONS["vol_size_mb"].as< uint64_t >() * Mi; + + int rc = 0; + if (auto vol = open_or_create_volume(hb, vid, capacity, page_size); vol) { + auto disk = std::make_shared< ublk::HomeBlkDisk >(hb, vol, vid, capacity, page_size, + static_cast< uint32_t >(hb->max_vol_io_size())); + auto run = ublkpp::ublkpp_tgt::run(vid, std::move(disk), SISL_OPTIONS["device_id"].as< int32_t >()); + if (run) { + auto target = std::move(run.value()); + LOGINFO("Volume {} is live at {}", boost::uuids::to_string(vid), target->device_path().native()); + std::cout << "homeblocks volume exposed at: " << target->device_path().native() << std::endl; + + rc = exit_future.get(); // block until SIGINT/SIGTERM + LOGINFO("Shutting down ublk target"); + ublkpp::ublkpp_tgt::remove(std::move(target)); + } else { + LOGERROR("ublkpp_tgt::run failed: {}", run.error().message()); + rc = EIO; + } + } else { + rc = EIO; + } + + LOGINFO("Shutting down homeblocks"); + hb->shutdown(); + hb.reset(); + return rc; +} diff --git a/src/test/tsan_suppressions.txt b/src/test/tsan_suppressions.txt new file mode 100644 index 0000000..86d53dc --- /dev/null +++ b/src/test/tsan_suppressions.txt @@ -0,0 +1,90 @@ +# TSAN suppressions for known false positives in homeblocks' dependency stack. +# +# All lock-order-inversions here are the GCC 13 coroutine resume mutex (an internal +# unique_lock inside coroutine_handle<>::resume()) interacting with sisl::when_all's +# nested start_detached fan-out. The same cycle (M0 = global coroutine mutex) appears in +# every report; there is no homeblocks lock involved. +# +# All data races are stdexec / std::future / spdlog / sisl-metrics interactions whose +# synchronization is invisible to TSAN. 
+ +# --- lock-order-inversions --------------------------------------------------- + +# GCC 13 coroutine_handle::resume() internal mutex vs sisl::when_all nested scheduling +deadlock:fan_run_one + +# iomgr/homestore internal mutex ordering during init — TSAN sees the homeblocks +# call site but the mutexes are all iomgr-owned; false positive from test restarts +# creating new HomeBlocksImpl instances (TSAN's lock-order graph persists across +# object lifetimes and confuses mutexes from separate instances). +deadlock:homeblocks::init_homeblocks + +# --- data races -------------------------------------------------------------- + +# GCC 13 std::future/_State_baseV2 state touched concurrently by stdexec completion. +# Entries without wildcards: exact/substring match for short names. +# Entries with wildcards: glob match for template-instantiated long names +# (_M_ptr(), swap, _Sp_counted_base release, etc. all embed __future_base in args). +race:std::__future_base::_State_baseV2 +race:std::promise +race:*__future_base* +race:*_Sp_counted_base* + +# stdexec type-erased sender/scheduler storage (stdexec memory-model-correct, TSAN blind). +# Glob forms are required because TSAN matches against the full demangled name including +# all template arguments (e.g. exec::__any::__immovable_storage::__t). +race:exec::__any::__immovable_storage +race:exec::__any::__rec::__ref +race:exec::__any::__storage +race:*exec::__any* + +# stdexec stop-token (inplace_stop_source/token/callback — memory-model-correct, TSAN blind). +# Glob form covers _Optional_payload_base> and +# __atomic_base::load() triggered by inplace_stop_source destructor. +race:stdexec::inplace_stop +race:*inplace_stop* + +# spdlog / fmtlib log buffer (iomgr synchronizes log-path, TSAN cannot track it). +# basic_memory_buffer and detail::buffer::append are both spdlog formatting paths.
+race:fmt::v12::basic_memory_buffer +race:*fmt::v12* + +# std::basic_string races inside spdlog formatting (same root cause as fmt races above). +race:*basic_string* + +# sisl metrics global registry (sisl's registry mutex is not visible to TSAN) +race:homeblocks::VolumeMetrics::VolumeMetrics + +# iomgr run_on_forget / _run_forget — lambda alloc vs free races across reactor handoff; +# iomgr synchronizes the lifetime but the coroutine + reactor boundary is TSAN-opaque. +# operator new/delete appear at frame #0 when the lambda object straddles reactor handoff. +race:run_on_forget +race:iomgr::IOManager::_run_forget +race:*tsan_new_delete* +race:*_Function_base* + +# iomgr file-descriptor reuse across test restarts: TSAN tracks fd numbers as a single +# resource across their lifetime. When fd N is closed in test 1 and the OS recycles N +# for a different device in test 2, TSAN conflates the two. iomgr joins all threads +# before restarting (confirmed by "All IO threads have stopped" in the logs), so there +# is no concurrent access — just fd-number aliasing between separate test cases. +race:iomgr::IODevice::close +race:iomgr::IOReactorEPoll::remove_iodev_impl + +# homestore BtreeNode concurrent access — node-level write lock is homestore-internal +# and invisible to TSAN; no homeblocks lock involved +race:homestore::BtreeNode + +# homestore log group / replication journal — homestore-internal async dispatch; +# synchronization across log_group.cpp and common.cpp is TSAN-opaque +race:homestore::LogGroup::add_record +race:homestore::repl_req_ctx::create_journal_entry + +# homestore IndexTable node read — same btree locking invisibility as BtreeNode above +race:homestore::IndexTable + +# stdexec exec::__task move / await_transform (memory-model-correct, TSAN blind) +race:exec::__task::basic_task + +# stdexec run_loop scheduler storage construction (memory-model-correct, TSAN blind) +race:stdexec::__loop::run_loop