diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index 7a3ac17b054..5ae19512cc4 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -134,19 +134,6 @@ z_swadm () { pfexec zlogin oxz_switch /opt/oxide/dendrite/bin/swadm $@ } -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script -OPTE_COMMIT="" -if [[ "x$OPTE_COMMIT" != "x" ]]; then - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm - chmod +x opteadm - cp opteadm /tmp/opteadm - pfexec mv opteadm /opt/oxide/opte/bin/opteadm -fi # # XXX work around 14537 (UFS should not allow directories to be unlinked) which @@ -197,6 +184,24 @@ ptime -m tar xvzf /input/package/work/package.tar.gz # shellcheck source=/dev/null source .github/buildomat/ci-env.sh +# Source the OPTE override (if any) from the canonical location and apply it. +# +# When set, download the xde driver and opteadm directly from buildomat and +# swap them in. The deploy target is a ramdisk image without pkg(5), so we +# use rem_drv/add_drv instead of the p5p approach used by install_opte.sh +# and releng. +source tools/opte_version_override +if [[ "x$OPTE_COMMIT" != "x" ]]; then + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde" + pfexec rem_drv xde || true + pfexec mv xde /kernel/drv/amd64/xde + pfexec add_drv xde || true + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm" + chmod +x opteadm + cp opteadm /tmp/opteadm + pfexec mv opteadm /opt/oxide/opte/bin/opteadm +fi + # Ask buildomat for the range of extra addresses that we're allowed to use, and # break them up into the ranges we need. diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index b43b91e9ec4..78df41dc5f6 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -60,5 +60,7 @@ files=( target/release/xtask target/debug/bootstrap tests/* + tools/opte_version + tools/opte_version_override ) ptime -m tar cvzf $WORK/package.tar.gz "${files[@]}" "${packages[@]}" diff --git a/.github/workflows/check-opte-ver.yml b/.github/workflows/check-opte-ver.yml index e516eeacbe6..65a3b23c121 100644 --- a/.github/workflows/check-opte-ver.yml +++ b/.github/workflows/check-opte-ver.yml @@ -1,10 +1,7 @@ name: check-opte-ver on: pull_request: - paths: - - '.github/workflows/check-opte-ver.yml' - - 'Cargo.toml' - - 'tools/opte_version' + branches: [main] jobs: check-opte-ver: runs-on: ubuntu-22.04 @@ -18,3 +15,22 @@ jobs: run: cargo install toml-cli@0.2.3 - name: Check OPTE version and rev match run: ./tools/ci_check_opte_ver.sh + + # Runs on every PR regardless of paths changed, since the override + # file could have been set in an earlier commit and slip through on + # an unrelated PR otherwise. 
+ check-opte-override: + if: github.base_ref == 'main' + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461 + - name: Reject OPTE override on main + run: | + source tools/opte_version_override + if [[ "x$OPTE_COMMIT" != "x" ]]; then + echo "::error::OPTE_COMMIT is set in tools/opte_version_override." + echo "::error::The OPTE override must be cleared before merging to main." + exit 1 + fi diff --git a/Cargo.lock b/Cargo.lock index 8b7a39695cf..5f2f24a9239 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -275,9 +275,9 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.1.2" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c5bcfa8749ac45dd12cb11055aeeb6b27a3895560d60d71e3c23bf979e60514" +checksum = "39bae1d3fa576f7c6519514180a72559268dd7d1fe104070956cb687bc6673bd" dependencies = [ "anstyle", "bstr", @@ -1239,9 +1239,9 @@ dependencies = [ [[package]] name = "cfg-expr" -version = "0.20.6" +version = "0.20.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78cef5b5a1a6827c7322ae2a636368a573006b27cfa76c7ebd53e834daeaab6a" +checksum = "3c6b04e07d8080154ed4ac03546d9a2b303cc2fe1901ba0b35b301516e289368" dependencies = [ "smallvec 1.15.1", "target-lexicon", @@ -1741,10 +1741,11 @@ dependencies = [ [[package]] name = "common" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/dendrite?rev=1ddaa5d6b101fbaa2c29eca847111cbef1a272ad#1ddaa5d6b101fbaa2c29eca847111cbef1a272ad" +source = "git+https://github.com/oxidecomputer/dendrite?rev=cc8e02a0800034c431c8cf96b889ea638da3d194#cc8e02a0800034c431c8cf96b889ea638da3d194" dependencies = [ "anyhow", "chrono", + "oximeter 0.1.0 (git+https://github.com/oxidecomputer/omicron?branch=main)", "oxnet", "rand 0.9.2", "schemars 0.8.22", @@ -1762,11 +1763,10 @@ dependencies = [ [[package]] name = "common" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/dendrite?rev=cc8e02a0800034c431c8cf96b889ea638da3d194#cc8e02a0800034c431c8cf96b889ea638da3d194" +source = "git+https://github.com/oxidecomputer/dendrite?rev=e10e4f5a993fe950ab1b478abb5dcbfa7aa92091#e10e4f5a993fe950ab1b478abb5dcbfa7aa92091" dependencies = [ "anyhow", "chrono", - "oximeter 0.1.0 (git+https://github.com/oxidecomputer/omicron?branch=main)", "oxnet", "rand 0.9.2", "schemars 0.8.22", @@ -2525,7 +2525,7 @@ dependencies = [ [[package]] name = "ddm-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=4d1f20f793da102b29b914569725ebc9fdf746dd#4d1f20f793da102b29b914569725ebc9fdf746dd" +source = "git+https://github.com/oxidecomputer/maghemite?rev=c3c3032f8bdc91d6faf2b36e05b8375a0980765c#c3c3032f8bdc91d6faf2b36e05b8375a0980765c" dependencies = [ "oxnet", "progenitor 0.13.0", @@ -3108,18 +3108,18 @@ dependencies = [ [[package]] name = "dpd-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/dendrite?rev=1ddaa5d6b101fbaa2c29eca847111cbef1a272ad#1ddaa5d6b101fbaa2c29eca847111cbef1a272ad" +source = "git+https://github.com/oxidecomputer/dendrite?rev=cc8e02a0800034c431c8cf96b889ea638da3d194#cc8e02a0800034c431c8cf96b889ea638da3d194" dependencies = [ "async-trait", "chrono", - "common 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=1ddaa5d6b101fbaa2c29eca847111cbef1a272ad)", + "common 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=cc8e02a0800034c431c8cf96b889ea638da3d194)", "crc8", "futures", "http", 
"oxnet", - "progenitor 0.13.0", + "progenitor 0.11.2", "regress", - "reqwest 0.13.2", + "reqwest 0.12.28", "schemars 0.8.22", "serde", "serde_json", @@ -3132,18 +3132,18 @@ dependencies = [ [[package]] name = "dpd-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/dendrite?rev=cc8e02a0800034c431c8cf96b889ea638da3d194#cc8e02a0800034c431c8cf96b889ea638da3d194" +source = "git+https://github.com/oxidecomputer/dendrite?rev=e10e4f5a993fe950ab1b478abb5dcbfa7aa92091#e10e4f5a993fe950ab1b478abb5dcbfa7aa92091" dependencies = [ "async-trait", "chrono", - "common 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=cc8e02a0800034c431c8cf96b889ea638da3d194)", + "common 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=e10e4f5a993fe950ab1b478abb5dcbfa7aa92091)", "crc8", "futures", "http", "oxnet", - "progenitor 0.11.2", + "progenitor 0.13.0", "regress", - "reqwest 0.12.28", + "reqwest 0.13.2", "schemars 0.8.22", "serde", "serde_json", @@ -6200,11 +6200,11 @@ dependencies = [ [[package]] name = "libtest-mimic" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" +checksum = "14e6ba06f0ade6e504aff834d7c34298e5155c6baca353cc6a4aaff2f9fd7f33" dependencies = [ - "anstream 0.6.21", + "anstream 1.0.0", "anstyle", "clap", "escape8259", @@ -6247,18 +6247,18 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" [[package]] name = "linkme" -version = "0.3.35" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e3283ed2d0e50c06dd8602e0ab319bb048b6325d0bba739db64ed8205179898" +checksum = "e83272d46373fb8decca684579ac3e7c8f3d71d4cc3aa693df8759e260ae41cf" dependencies = [ "linkme-impl", ] [[package]] name = "linkme-impl" -version = "0.3.35" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5cec0ec4228b4853bb129c84dbf093a27e6c7a20526da046defc334a1b017f7" +checksum = "32d59e20403c7d08fe62b4376edfe5c7fb2ef1e6b1465379686d0f21c8df444b" dependencies = [ "proc-macro2", "quote", @@ -6503,7 +6503,7 @@ dependencies = [ [[package]] name = "mg-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=4d1f20f793da102b29b914569725ebc9fdf746dd#4d1f20f793da102b29b914569725ebc9fdf746dd" +source = "git+https://github.com/oxidecomputer/maghemite?rev=c3c3032f8bdc91d6faf2b36e05b8375a0980765c#c3c3032f8bdc91d6faf2b36e05b8375a0980765c" dependencies = [ "chrono", "colored 3.1.1", @@ -7648,7 +7648,7 @@ dependencies = [ "chrono", "crucible-agent-client", "dns-service-client", - "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=1ddaa5d6b101fbaa2c29eca847111cbef1a272ad)", + "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=e10e4f5a993fe950ab1b478abb5dcbfa7aa92091)", "dropshot 0.16.7", "futures", "gateway-messages", @@ -8594,7 +8594,7 @@ dependencies = [ "display-error-chain", "dns-server", "dns-service-client", - "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=1ddaa5d6b101fbaa2c29eca847111cbef1a272ad)", + "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=e10e4f5a993fe950ab1b478abb5dcbfa7aa92091)", "dropshot 0.16.7", "ereport-types", "expectorate", @@ -8660,6 +8660,7 @@ dependencies = [ "num-integer", "omicron-cockroach-metrics", "omicron-common", + "omicron-ddm-admin-client", "omicron-passwords", "omicron-rpaths", "omicron-sled-agent", @@ -9231,6 +9232,7 
@@ dependencies = [ "reqwest 0.13.2", "ring", "rustls 0.22.4", + "schemars 0.8.22", "serde", "sha2", "slog", @@ -11698,7 +11700,7 @@ dependencies = [ [[package]] name = "rdb-types" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=4d1f20f793da102b29b914569725ebc9fdf746dd#4d1f20f793da102b29b914569725ebc9fdf746dd" +source = "git+https://github.com/oxidecomputer/maghemite?rev=c3c3032f8bdc91d6faf2b36e05b8375a0980765c#c3c3032f8bdc91d6faf2b36e05b8375a0980765c" dependencies = [ "oxnet", "schemars 0.8.22", @@ -14505,9 +14507,9 @@ checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" [[package]] name = "target-spec" -version = "3.5.7" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c173ce474b6257cfb2a107949e48eb1ab9cae21cecbdf13401ae3be4a411a" +checksum = "b00e973676af5497c2a69cc9787e2205c00f3b6f4f70e7d7b0112e28aa84b501" dependencies = [ "cfg-expr", "guppy-workspace-hack", @@ -16697,7 +16699,7 @@ name = "wicket-common" version = "0.1.0" dependencies = [ "anyhow", - "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=1ddaa5d6b101fbaa2c29eca847111cbef1a272ad)", + "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=e10e4f5a993fe950ab1b478abb5dcbfa7aa92091)", "dropshot 0.16.7", "gateway-client", "gateway-types", @@ -16762,7 +16764,7 @@ dependencies = [ "clap", "debug-ignore", "display-error-chain", - "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=1ddaa5d6b101fbaa2c29eca847111cbef1a272ad)", + "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=e10e4f5a993fe950ab1b478abb5dcbfa7aa92091)", "dropshot 0.16.7", "either", "expectorate", diff --git a/Cargo.toml b/Cargo.toml index fa16fa91492..d4cd540a36a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -493,7 +493,7 @@ digest = "0.10.7" dns-server = { path = "dns-server" } dns-server-api = { path = "dns-server-api" } dns-service-client = { path = "clients/dns-service-client" } -dpd-client = { git = "https://github.com/oxidecomputer/dendrite", rev = "1ddaa5d6b101fbaa2c29eca847111cbef1a272ad" } +dpd-client = { git = "https://github.com/oxidecomputer/dendrite", rev = "e10e4f5a993fe950ab1b478abb5dcbfa7aa92091" } dropshot = { version = "0.16.6", features = [ "usdt-probes" ] } dropshot-api-manager = "0.6.0" dropshot-api-manager-types = "0.6.0" @@ -599,8 +599,8 @@ ntp-admin-api = { path = "ntp-admin/api" } ntp-admin-client = { path = "clients/ntp-admin-client" } ntp-admin-types = { path = "ntp-admin/types" } ntp-admin-types-versions = { path = "ntp-admin/types/versions" } -mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "4d1f20f793da102b29b914569725ebc9fdf746dd" } -ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "4d1f20f793da102b29b914569725ebc9fdf746dd" } +mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "c3c3032f8bdc91d6faf2b36e05b8375a0980765c" } +ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "c3c3032f8bdc91d6faf2b36e05b8375a0980765c" } multimap = "0.10.1" nexus-auth = { path = "nexus/auth" } nexus-background-task-interface = { path = "nexus/background-task-interface" } @@ -737,7 +737,7 @@ rats-corim = { git = "https://github.com/oxidecomputer/rats-corim.git", rev = "f raw-cpuid = { git = "https://github.com/oxidecomputer/rust-cpuid.git", rev = "a4cf01df76f35430ff5d39dc2fe470bcb953503b" } rayon = "1.10" rcgen = "0.12.1" -rdb-types = { git = 
"https://github.com/oxidecomputer/maghemite", rev = "4d1f20f793da102b29b914569725ebc9fdf746dd" } +rdb-types = { git = "https://github.com/oxidecomputer/maghemite", rev = "c3c3032f8bdc91d6faf2b36e05b8375a0980765c" } reconfigurator-cli = { path = "dev-tools/reconfigurator-cli" } reedline = "0.40.0" ref-cast = "1.0" diff --git a/clients/ddm-admin-client/src/lib.rs b/clients/ddm-admin-client/src/lib.rs index 7a8b56d499d..b8815d17473 100644 --- a/clients/ddm-admin-client/src/lib.rs +++ b/clients/ddm-admin-client/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2023 Oxide Computer Company +// Copyright 2026 Oxide Computer Company #![allow(clippy::redundant_closure_call)] #![allow(clippy::needless_lifetimes)] @@ -107,6 +107,40 @@ impl Client { self.inner.enable_stats(request).await.map(|resp| resp.into_inner()) } + /// Returns DDM peer information including interface names. + /// + /// The `if_name` field on each peer provides a live sled-to-port + /// mapping, identifying which switch port a peer sled is connected + /// through (e.g., `"tfportrear0_0"`). + pub async fn get_peers( + &self, + ) -> Result< + std::collections::HashMap, + Error, + > { + self.inner.get_peers().await.map(|resp| resp.into_inner()) + } + + /// Returns multicast routes learned from DDM peers. + /// + /// Each route includes the origin (overlay/underlay mapping), + /// the nexthop peer that advertised it, and the path vector. + pub async fn get_multicast_groups( + &self, + ) -> Result, Error> { + self.inner.get_multicast_groups().await.map(|resp| resp.into_inner()) + } + + /// Returns multicast origins that this DDM instance is advertising. + pub async fn get_originated_multicast_groups( + &self, + ) -> Result, Error> { + self.inner + .get_originated_multicast_groups() + .await + .map(|resp| resp.into_inner()) + } + /// Returns the addresses of connected sleds. /// /// Note: These sleds have not yet been verified. diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index d2c47534b3a..aa2fde293fd 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -2543,6 +2543,8 @@ impl Vni { /// /// This is a low-numbered VNI to avoid colliding with user VNIs. /// However, it is not in the Oxide-reserved range yet. + /// + /// Should match `oxide_vpc::api::DEFAULT_MULTICAST_VNI`. pub const DEFAULT_MULTICAST_VNI: Self = Self(77); /// Oxide reserves a slice of initial VNIs for its own use. 
diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index e2274ddd2d5..7b08511f2e5 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -29,6 +29,7 @@ Crucible Pantry (client: crucible-pantry-client) Maghemite DDM Admin (client: ddm-admin-client) consumed by: installinator (omicron/installinator) via 1 path consumed by: mgd (maghemite/mgd) via 1 path + consumed by: omicron-nexus (omicron/nexus) via 1 path consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path consumed by: wicketd (omicron/wicketd) via 1 path diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 30eeb5158d6..437765d19d2 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -656,7 +656,7 @@ task: "bfd_manager" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - last completion reported error: failed to resolve addresses for Dendrite services: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + last completion reported error: failed to resolve addresses for Dendrite services: proto error: no records found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } task: "blueprint_planner" configured period: every m @@ -1342,7 +1342,7 @@ task: "bfd_manager" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - last completion reported error: failed to resolve addresses for Dendrite services: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + last completion reported error: failed to resolve addresses for Dendrite services: proto error: no records found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } task: "blueprint_planner" configured period: every m diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 874f25e3fe7..2569884a13a 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -270,6 +270,18 @@ async fn main() -> Result<()> { let opte_version = fs::read_to_string(WORKSPACE_DIR.join("tools/opte_version")).await?; + // Parse tools/opte_version_override for OPTE_COMMIT. When set, we + // download the override p5p from buildomat and use it as a package + // source during image build instead of the helios pkg repo version. 
+    let opte_override = parse_opte_version_override(
+        &WORKSPACE_DIR.join("tools/opte_version_override"),
+    )
+    .await?;
+    if let Some(ov) = &opte_override {
+        info!(logger, "OPTE override active: commit={}", ov.commit);
+    }
+    let opte_version = opte_version.trim();
+
     let client = reqwest::ClientBuilder::new()
         .connect_timeout(Duration::from_secs(15))
         .timeout(Duration::from_secs(120))
@@ -617,7 +629,7 @@ async fn main() -> Result<()> {
                 .arg("-o") // output directory for image
                 .arg(args.output_dir.join(format!("os-{}", target)))
                 .arg("-F") // pass extra image builder features
-                .arg(format!("optever={}", opte_version.trim()))
+                .arg(format!("optever={opte_version}"))
                 .arg("-P") // include all files from extra proto area
                 .arg(proto_dir.join("root"))
                 .arg("-N") // image name
@@ -675,11 +687,33 @@ async fn main() -> Result<()> {
                 .arg(format!("helios-dev={HELIOS_PKGREPO}"))
         }
 
-        // helios-build experiment-image
-        jobs.push_command(format!("{}-image", target), image_cmd)
+        // When OPTE_COMMIT is set, download the override p5p from buildomat
+        // and add it as a package source for the image build.
+        if let Some(ov) = &opte_override {
+            let p5p_path = tempdir.path().join(format!("opte-{target}.p5p"));
+            let commit = ov.commit.clone();
+            let dest = p5p_path.clone();
+            let cl = client.clone();
+            let log = logger.clone();
+            jobs.push(
+                format!("{target}-opte-p5p"),
+                download_opte_p5p(log, cl, commit, dest),
+            );
+
+            image_cmd = image_cmd
+                .arg("-p")
+                .arg(format!("helios-dev=file://{p5p_path}"));
+        }
+
+        let image_job = jobs
+            .push_command(format!("{target}-image"), image_cmd)
             .after("helios-setup")
             .after("helios-incorp")
-            .after(format!("{}-proto", target));
+            .after(format!("{target}-proto"));
+
+        if opte_override.is_some() {
+            image_job.after(format!("{target}-opte-p5p"));
+        }
     }
 
     // Build the recovery target after we build the host target. Only one
     // of these will build at a time since Cargo locks its target directory;
@@ -887,6 +921,73 @@ async fn build_proto_area(
     Ok(())
 }
 
+/// Parsed contents of `tools/opte_version_override` when an override is active.
+struct OpteOverride {
+    commit: String,
+}
+
+/// Parse `tools/opte_version_override` for `OPTE_COMMIT`. Returns `None` if
+/// `OPTE_COMMIT` is unset or empty.
+async fn parse_opte_version_override(
+    path: &Utf8PathBuf,
+) -> Result<Option<OpteOverride>> {
+    let contents = fs::read_to_string(path)
+        .await
+        .context("failed to read tools/opte_version_override")?;
+
+    for line in contents.lines() {
+        let line = line.trim();
+        if let Some(val) = line.strip_prefix("OPTE_COMMIT=") {
+            let val = val.trim_matches('"');
+            if !val.is_empty() {
+                return Ok(Some(OpteOverride { commit: val.to_string() }));
+            }
+        }
+    }
+
+    Ok(None)
+}
+
+const OPTE_BUILDOMAT_BASE: &str =
+    "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte";
+
+/// Download the OPTE override p5p archive from buildomat.
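+///
+/// Retried up to `RETRY_ATTEMPTS` times; only the final failure is
+/// propagated, earlier attempts are logged and retried.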
+async fn download_opte_p5p( + logger: Logger, + client: reqwest::Client, + commit: String, + dest: Utf8PathBuf, +) -> Result<()> { + let url = format!("{OPTE_BUILDOMAT_BASE}/repo/{commit}/opte.p5p"); + info!(logger, "downloading OPTE override p5p from {url}"); + for attempt in 1..=RETRY_ATTEMPTS { + let result = async { + let response = client.get(&url).send().await?.error_for_status()?; + let bytes = response.bytes().await?; + fs::write(&dest, &bytes).await?; + Ok::<_, anyhow::Error>(()) + } + .await; + + match result { + Ok(()) => { + info!(logger, "downloaded OPTE p5p to {dest}"); + return Ok(()); + } + Err(err) => { + if attempt == RETRY_ATTEMPTS { + return Err(err).with_context(|| { + format!("failed to download OPTE p5p from {url}") + }); + } + info!(logger, "retrying OPTE p5p download (attempt {attempt})"); + } + } + } + + bail!("failed to download OPTE p5p after {RETRY_ATTEMPTS} attempts") +} + async fn host_add_root_profile(host_proto_root: Utf8PathBuf) -> Result<()> { fs::create_dir_all(&host_proto_root).await?; fs::write( diff --git a/illumos-utils/src/opte/illumos.rs b/illumos-utils/src/opte/illumos.rs index 28ca9f85566..f17adacf52a 100644 --- a/illumos-utils/src/opte/illumos.rs +++ b/illumos-utils/src/opte/illumos.rs @@ -13,6 +13,7 @@ use sled_agent_types::inventory::NetworkInterfaceKind; use slog::Logger; use slog::info; use std::net::IpAddr; +use std::net::Ipv6Addr; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -70,6 +71,11 @@ pub enum Error { "Tried to update attached subnets on non-existent port ({0}, {1:?})" )] AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error( + "address {0} is not within the underlay multicast subnet (ff04::/64)" + )] + InvalidMcastUnderlay(Ipv6Addr), } /// Delete all xde devices on the system. diff --git a/illumos-utils/src/opte/mod.rs b/illumos-utils/src/opte/mod.rs index e9e2546cb0a..780d63f44e9 100644 --- a/illumos-utils/src/opte/mod.rs +++ b/illumos-utils/src/opte/mod.rs @@ -33,14 +33,25 @@ use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; pub use port::Port; -pub use port_manager::MulticastGroupCfg; pub use port_manager::PortCreateParams; pub use port_manager::PortManager; pub use port_manager::PortTicket; +pub use sled_agent_types::multicast::MulticastGroupCfg; use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; +// `oxide_vpc::api::DEFAULT_MULTICAST_VNI` and +// `omicron_common::api::external::Vni::DEFAULT_MULTICAST_VNI` live in sibling +// crates that cannot reference each other's constant. They must stay +// numerically equal: the MRIB, M2P mappings, and OPTE all route on this +// value, so any divergence would black-hole multicast traffic. +const _: () = assert!( + oxide_vpc::api::DEFAULT_MULTICAST_VNI + == omicron_common::api::external::Vni::DEFAULT_MULTICAST_VNI.as_u32(), + "oxide_vpc::api::DEFAULT_MULTICAST_VNI must equal omicron_common Vni::DEFAULT_MULTICAST_VNI", +); + /// Information about the gateway for an OPTE port #[derive(Debug, Clone, Copy)] #[allow(dead_code)] diff --git a/illumos-utils/src/opte/non_illumos.rs b/illumos-utils/src/opte/non_illumos.rs index 42487cde09c..2170f1ace6f 100644 --- a/illumos-utils/src/opte/non_illumos.rs +++ b/illumos-utils/src/opte/non_illumos.rs @@ -2,25 +2,38 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Mock / dummy versions of the OPTE module, for non-illumos platforms +//! Mock / dummy versions of the OPTE module, for non-illumos platforms. 
+//!
+//! Most methods are either `unimplemented!()` or silent no-ops.
+//! Multicast subscribe/unsubscribe are an exception: they maintain real
+//! in-memory state because port manager tests assert on subscription contents.
 
 use crate::addrobj::AddrObject;
 use oxide_vpc::api::AddRouterEntryReq;
+use oxide_vpc::api::ClearMcast2PhysReq;
+use oxide_vpc::api::ClearMcastForwardingReq;
 use oxide_vpc::api::ClearVirt2PhysReq;
 use oxide_vpc::api::DelRouterEntryReq;
 use oxide_vpc::api::DetachSubnetResp;
-use oxide_vpc::api::Direction;
+use oxide_vpc::api::DumpMcast2PhysResp;
+use oxide_vpc::api::DumpMcastForwardingResp;
 use oxide_vpc::api::DumpVirt2PhysResp;
 use oxide_vpc::api::IpCfg;
 use oxide_vpc::api::IpCidr;
 use oxide_vpc::api::ListPortsResp;
+use oxide_vpc::api::McastSubscribeReq;
+use oxide_vpc::api::McastUnsubscribeReq;
+use oxide_vpc::api::MulticastUnderlay;
 use oxide_vpc::api::NoResp;
 use oxide_vpc::api::PortInfo;
 use oxide_vpc::api::RouterClass;
 use oxide_vpc::api::RouterTarget;
 use oxide_vpc::api::SetExternalIpsReq;
 use oxide_vpc::api::SetFwRulesReq;
+use oxide_vpc::api::SetMcast2PhysReq;
+use oxide_vpc::api::SetMcastForwardingReq;
 use oxide_vpc::api::SetVirt2PhysReq;
+use oxide_vpc::api::SourceFilter;
 use oxide_vpc::api::VpcCfg;
 use sled_agent_types::inventory::NetworkInterfaceKind;
 use slog::Logger;
@@ -76,6 +89,11 @@ pub enum Error {
         "Tried to update attached subnets on non-existent port ({0}, {1:?})"
     )]
     AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind),
+
+    #[error(
+        "address {0} is not within the underlay multicast subnet (ff04::/64)"
+    )]
+    InvalidMcastUnderlay(std::net::Ipv6Addr),
 }
 
 pub fn initialize_xde_driver(
@@ -172,12 +190,19 @@ pub(crate) struct PortData {
     pub port: PortInfo,
     /// The routes for this port. This simulates the router layer.
     pub routes: Vec<AddRouterEntryReq>,
+    /// Multicast group subscriptions: group IP → source filter.
+    pub mcast_subscriptions: HashMap<IpAddr, SourceFilter>,
 }
 
 #[derive(Debug)]
 pub(crate) struct State {
     pub ports: HashMap<String, PortData>,
     pub underlay_initialized: bool,
+    /// Multicast-to-physical mappings, keyed on (group, underlay).
+    ///
+    /// Persisted across [`Handle`] lifetimes to simulate xde kernel state
+    /// surviving sled-agent restarts.
+    pub m2p: Vec<(oxide_vpc::api::IpAddr, MulticastUnderlay)>,
 }
 
 const NO_RESPONSE: NoResp = NoResp { unused: 99 };
@@ -185,7 +210,11 @@
 static OPTE_STATE: OnceLock<Mutex<State>> = OnceLock::new();
 fn opte_state() -> &'static Mutex<State> {
     OPTE_STATE.get_or_init(|| {
-        Mutex::new(State { ports: HashMap::new(), underlay_initialized: false })
+        Mutex::new(State {
+            ports: HashMap::new(),
+            underlay_initialized: false,
+            m2p: Vec::new(),
+        })
     })
 }
 
@@ -237,7 +266,11 @@ impl Handle {
                 return Err(OpteError::DuplicatePort(entry.key().to_string()));
             }
             Entry::Vacant(entry) => {
-                entry.insert(PortData { port, routes: Vec::new() });
+                entry.insert(PortData {
+                    port,
+                    routes: Vec::new(),
+                    mcast_subscriptions: HashMap::new(),
+                });
             }
         }
         Ok(NO_RESPONSE)
@@ -270,14 +303,46 @@ impl Handle {
         Ok(NO_RESPONSE)
     }
 
-    /// Allow traffic to / from a CIDR block on a port.
-    pub fn allow_cidr(
+    /// Subscribe a port to a multicast group.
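+    ///
+    /// Unlike most of this mock, this records real state: the (group,
+    /// filter) pair is stored so port manager tests can assert on
+    /// subscription contents.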
+    pub fn mcast_subscribe(
         &self,
-        _: &str,
-        _: IpCidr,
-        _: Direction,
+        req: &McastSubscribeReq,
     ) -> Result<NoResp, OpteError> {
-        unimplemented!("Not yet used in tests")
+        let mut inner = opte_state().lock().unwrap();
+        let Some(port_data) = inner.ports.get_mut(&req.port_name) else {
+            return Err(OpteError::NoPort(req.port_name.clone()));
+        };
+        let group_ip: IpAddr = match req.group {
+            oxide_vpc::api::IpAddr::Ip4(v4) => {
+                std::net::Ipv4Addr::from(v4).into()
+            }
+            oxide_vpc::api::IpAddr::Ip6(v6) => {
+                std::net::Ipv6Addr::from(v6).into()
+            }
+        };
+        port_data.mcast_subscriptions.insert(group_ip, req.filter.clone());
+        Ok(NO_RESPONSE)
+    }
+
+    /// Unsubscribe a port from a multicast group.
+    pub fn mcast_unsubscribe(
+        &self,
+        req: &McastUnsubscribeReq,
+    ) -> Result<NoResp, OpteError> {
+        let mut inner = opte_state().lock().unwrap();
+        let Some(port_data) = inner.ports.get_mut(&req.port_name) else {
+            return Err(OpteError::NoPort(req.port_name.clone()));
+        };
+        let group_ip: IpAddr = match req.group {
+            oxide_vpc::api::IpAddr::Ip4(v4) => {
+                std::net::Ipv4Addr::from(v4).into()
+            }
+            oxide_vpc::api::IpAddr::Ip6(v6) => {
+                std::net::Ipv6Addr::from(v6).into()
+            }
+        };
+        port_data.mcast_subscriptions.remove(&group_ip);
+        Ok(NO_RESPONSE)
     }
 
     /// Delete a router entry from a port.
@@ -323,6 +388,64 @@ impl Handle {
         unimplemented!("Not yet used in tests")
     }
 
+    /// Set a multicast-to-physical mapping.
+    pub fn set_m2p(&self, req: &SetMcast2PhysReq) -> Result<NoResp, OpteError> {
+        let mut state = opte_state().lock().unwrap();
+        // Deduplicate by replacing existing entry for the same group.
+        state.m2p.retain(|(g, _)| *g != req.group);
+        state.m2p.push((req.group, req.underlay));
+        Ok(NO_RESPONSE)
+    }
+
+    /// Clear a multicast-to-physical mapping.
+    pub fn clear_m2p(
+        &self,
+        req: &ClearMcast2PhysReq,
+    ) -> Result<NoResp, OpteError> {
+        let mut state = opte_state().lock().unwrap();
+        state.m2p.retain(|(g, u)| !(*g == req.group && *u == req.underlay));
+        Ok(NO_RESPONSE)
+    }
+
+    /// Set multicast forwarding for a port.
+    pub fn set_mcast_fwd(
+        &self,
+        _: &SetMcastForwardingReq,
+    ) -> Result<NoResp, OpteError> {
+        Ok(NO_RESPONSE)
+    }
+
+    /// Clear multicast forwarding for a port.
+    pub fn clear_mcast_fwd(
+        &self,
+        _: &ClearMcastForwardingReq,
+    ) -> Result<NoResp, OpteError> {
+        Ok(NO_RESPONSE)
+    }
+
+    /// Dump all multicast-to-physical mappings.
+    pub fn dump_m2p(&self) -> Result<DumpMcast2PhysResp, OpteError> {
+        let state = opte_state().lock().unwrap();
+        let mut ip4 = Vec::new();
+        let mut ip6 = Vec::new();
+        for (group, underlay) in &state.m2p {
+            match group {
+                oxide_vpc::api::IpAddr::Ip4(v4) => {
+                    ip4.push((*v4, *underlay));
+                }
+                oxide_vpc::api::IpAddr::Ip6(v6) => {
+                    ip6.push((*v6, *underlay));
+                }
+            }
+        }
+        Ok(DumpMcast2PhysResp { ip4, ip6 })
+    }
+
+    /// Dump all multicast forwarding entries.
+    pub fn dump_mcast_fwd(&self) -> Result<DumpMcastForwardingResp, OpteError> {
+        Ok(DumpMcastForwardingResp { entries: Vec::new() })
+    }
+
     /// List ports on the current system.
     #[allow(dead_code)]
     pub(crate) fn list_ports(&self) -> Result<ListPortsResp, OpteError> {
diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs
index 638dd52de3d..464e1aa3709 100644
--- a/illumos-utils/src/opte/port_manager.rs
+++ b/illumos-utils/src/opte/port_manager.rs
@@ -4,6 +4,7 @@
 //! Manager for all OPTE ports on a Helios system
 
+use crate::addrobj::AddrObject;
 use crate::dladm::OPTE_LINK_PREFIX;
 use crate::opte::AttachedSubnet;
 use crate::opte::EnsureAttachedSubnetResult;
@@ -17,8 +18,6 @@ use crate::opte::port::PortData;
 use ipnetwork::Ipv4Network;
 use ipnetwork::Ipv6Network;
 use macaddr::MacAddr6;
-use omicron_common::address::IPV4_MULTICAST_RANGE;
-use omicron_common::address::IPV6_MULTICAST_RANGE;
 use omicron_common::api::external;
 use omicron_common::api::internal::shared::ExternalIpGatewayMap;
 use omicron_common::api::internal::shared::InternetGatewayRouterTarget;
@@ -35,10 +34,13 @@ use omicron_common::api::internal::shared::RouterVersion;
 use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost;
 use oxide_vpc::api::AddRouterEntryReq;
 use oxide_vpc::api::AttachedSubnetConfig;
+use oxide_vpc::api::ClearMcast2PhysReq;
+use oxide_vpc::api::ClearMcastForwardingReq;
 use oxide_vpc::api::DelRouterEntryReq;
 use oxide_vpc::api::DetachSubnetResp;
 use oxide_vpc::api::DhcpCfg;
 use oxide_vpc::api::ExternalIpCfg;
+use oxide_vpc::api::FilterMode;
 use oxide_vpc::api::IpCfg;
 use oxide_vpc::api::IpCidr;
 use oxide_vpc::api::Ipv4Cfg;
@@ -46,10 +48,16 @@ use oxide_vpc::api::Ipv4Cidr;
 use oxide_vpc::api::Ipv6Cfg;
 use oxide_vpc::api::Ipv6Cidr;
 use oxide_vpc::api::MacAddr;
+use oxide_vpc::api::McastSubscribeReq;
+use oxide_vpc::api::McastUnsubscribeReq;
+use oxide_vpc::api::MulticastUnderlay;
 use oxide_vpc::api::RouterClass;
 use oxide_vpc::api::SNat4Cfg;
 use oxide_vpc::api::SNat6Cfg;
 use oxide_vpc::api::SetExternalIpsReq;
+use oxide_vpc::api::SetMcast2PhysReq;
+use oxide_vpc::api::SetMcastForwardingReq;
+use oxide_vpc::api::SourceFilter;
 use oxide_vpc::api::TransitIpConfig;
 use oxide_vpc::api::VpcCfg;
 use oxnet::IpNet;
@@ -61,6 +69,15 @@ use sled_agent_types::instance::ExternalIpv6Config;
 use sled_agent_types::instance::ResolvedVpcFirewallRule;
 use sled_agent_types::inventory::NetworkInterface;
 use sled_agent_types::inventory::NetworkInterfaceKind;
+use sled_agent_types::multicast::ClearMcast2Phys;
+use sled_agent_types::multicast::ClearMcastForwarding;
+use sled_agent_types::multicast::Mcast2PhysMapping;
+use sled_agent_types::multicast::McastFilterMode;
+use sled_agent_types::multicast::McastForwardingEntry;
+use sled_agent_types::multicast::McastForwardingNextHop;
+use sled_agent_types::multicast::McastReplication;
+use sled_agent_types::multicast::McastSourceFilter;
+use sled_agent_types::multicast::MulticastGroupCfg;
 use slog::Logger;
 use slog::debug;
 use slog::error;
@@ -73,6 +90,7 @@ use std::collections::HashSet;
 use std::net::IpAddr;
 use std::net::Ipv4Addr;
 use std::net::Ipv6Addr;
+use std::net::UdpSocket;
 use std::sync::Arc;
 use std::sync::Mutex;
 use std::sync::atomic::AtomicU64;
@@ -89,20 +107,36 @@ struct RouteSet {
     active_ports: usize,
 }
 
-/// Configuration for multicast groups on an OPTE port.
-///
-/// TODO: This type should be moved to [oxide_vpc::api] when OPTE dependencies
-/// are updated, following the same pattern as other VPC configuration types
-/// like [ExternalIpCfg], [IpCfg], etc.
+/// Mutable per-port state tracked alongside the immutable `Port`.
+#[derive(Debug)]
+struct PortState {
+    port: Port,
+    /// Active multicast subscriptions, mapping group IP to source filter.
+    mcast_subscriptions: HashMap<IpAddr, SourceFilter>,
+}
+
+impl PortState {
+    fn new(port: Port) -> Self {
+        Self { port, mcast_subscriptions: HashMap::new() }
+    }
+}
+
+/// Convert a `MulticastGroupCfg` into OPTE's `SourceFilter`.
 ///
-/// TODO: Eventually remove.
-#[derive(Debug, Clone, PartialEq)]
-pub struct MulticastGroupCfg {
-    /// The multicast group IP address (IPv4 or IPv6).
-    pub group_ip: IpAddr,
-    /// Source addresses for source-filtered multicast (optional for ASM,
-    /// required for SSM).
-    pub sources: Vec<IpAddr>,
+/// Empty sources maps to ASM (EXCLUDE with no entries, accepting all
+/// sources). Non-empty sources maps to SSM (INCLUDE with the listed
+/// sources).
+fn multicast_cfg_to_source_filter(cfg: &MulticastGroupCfg) -> SourceFilter {
+    if cfg.sources.is_empty() {
+        SourceFilter::default()
+    } else {
+        SourceFilter::Include(
+            cfg.sources
+                .iter()
+                .map(|s| oxide_vpc::api::IpAddr::from(*s))
+                .collect(),
+        )
+    }
 }
 
 #[derive(Debug)]
@@ -115,9 +149,10 @@ struct PortManagerInner {
     /// IP address of the hosting sled on the underlay.
     underlay_ip: Ipv6Addr,
 
-    /// Map of all ports, keyed on the interface Uuid and its kind
-    /// (which includes the Uuid of the parent instance or service)
-    ports: Mutex<BTreeMap<(Uuid, NetworkInterfaceKind), Port>>,
+    /// Map of all ports and their mutable state, keyed on the interface
+    /// Uuid and its kind (which includes the Uuid of the parent instance
+    /// or service).
+    ports: Mutex<BTreeMap<(Uuid, NetworkInterfaceKind), PortState>>,
 
     /// Map of all current resolved routes.
     routes: Mutex<HashMap<RouterId, RouteSet>>,
@@ -127,6 +162,32 @@ struct PortManagerInner {
     ///
     /// IGW IDs are specific to the VPC of each NIC.
     eip_gateways: Mutex<HashMap<Uuid, HashMap<IpAddr, HashSet<Uuid>>>>,
+
+    /// Underlay NIC interface names (e.g., "cxgbe0", "cxgbe1").
+    ///
+    /// Used to program NIC multicast MAC filters via
+    /// `UdpSocket::join_multicast_v6`.
+    // Empty in tests where no real underlay NICs exist.
+    underlay_nics: Vec<String>,
+
+    /// UDP sockets held open to maintain NIC multicast MAC filters.
+    ///
+    /// On T6 hardware the NIC will not deliver multicast frames to
+    /// xde unless the corresponding multicast MAC filter is programmed.
+    /// Joining an IPv6 multicast group on a UDP socket causes the
+    /// kernel to call `mac_multicast_add` on the interface, which
+    /// programs the filter. The socket receives no data (xde's
+    /// siphon/flow hook intercepts first) and exists solely to hold
+    /// the filter entry.
+    ///
+    /// Dropping the socket removes the filter.
+    ///
+    /// See <https://github.com/oxidecomputer/opte/issues/908>.
+    //
+    // Leaf lock: this must be the last `PortManagerInner` lock acquired;
+    // taking any other inner lock while holding it would break the
+    // acyclic lock-order graph. The locked region must not call back
+    // into `PortManager`, as `std::sync::Mutex` is non-reentrant.
+    mcast_underlay_sockets: Mutex<HashMap<Ipv6Addr, UdpSocket>>,
 }
 
 impl PortManagerInner {
@@ -147,6 +208,7 @@ pub struct PortCreateParams<'a> {
     pub firewall_rules: &'a [ResolvedVpcFirewallRule],
     pub dhcp_config: DhcpCfg,
     pub attached_subnets: Vec<AttachedSubnet>,
+    pub multicast_groups: &'a [MulticastGroupCfg],
 }
 
 impl<'a> TryFrom<&PortCreateParams<'a>> for IpCfg {
@@ -341,8 +403,17 @@ pub struct PortManager {
 }
 
 impl PortManager {
-    /// Create a new manager, for creating OPTE ports
-    pub fn new(log: Logger, underlay_ip: Ipv6Addr) -> Self {
+    /// Create a new manager, for creating OPTE ports.
+    ///
+    /// When `underlay_nics` is non-empty, the constructor performs kernel
+    /// I/O: one ioctl to list existing M2P mappings, then one
+    /// `setsockopt(IPV6_JOIN_GROUP)` per mapping per NIC to rehydrate
+    /// multicast MAC filters.
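+    ///
+    /// A hypothetical call site (NIC names illustrative):
+    ///
+    /// ```ignore
+    /// let mgr = PortManager::new(log, underlay_ip, &underlay_nics);
+    /// ```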
+    pub fn new(
+        log: Logger,
+        underlay_ip: Ipv6Addr,
+        underlay_nics: &[AddrObject],
+    ) -> Self {
         let inner = Arc::new(PortManagerInner {
             log,
             next_port_id: AtomicU64::new(0),
@@ -350,9 +421,108 @@ impl PortManager {
             ports: Mutex::new(BTreeMap::new()),
             routes: Mutex::new(Default::default()),
             eip_gateways: Mutex::new(Default::default()),
+            underlay_nics: underlay_nics
+                .iter()
+                .map(|n| n.interface().to_string())
+                .collect(),
+            mcast_underlay_sockets: Mutex::new(HashMap::new()),
         });
 
-        Self { inner }
+        let mgr = Self { inner };
+
+        // Rehydrate MAC filter sockets for any M2P mappings that
+        // survived in the xde kernel module across a sled-agent
+        // restart. Without this, the NIC's multicast MAC filters
+        // are lost when the old process exits.
+        //
+        // We rehydrate eagerly here rather than lazily: the Nexus
+        // convergence loop's `converge_m2p` treats an M2P present on both
+        // DB and sled as already converged and never re-applies
+        // `set_mcast_m2p`, so a missing MAC filter would never be healed
+        // by convergence alone.
+        //
+        // Cost: one `dump_m2p` ioctl plus one `setsockopt(IPV6_JOIN_GROUP)`
+        // per surviving group per underlay NIC. Bounded by active groups on
+        // this sled and runs only at sled-agent startup.
+        mgr.rehydrate_underlay_multicast_filters();
+
+        mgr
+    }
+
+    /// Re-open underlay multicast filter sockets for M2P mappings
+    /// that already exist in the xde kernel module.
+    ///
+    /// Called at startup to cover the sled-agent restart case where
+    /// OPTE kernel state persists but userspace socket state is lost.
+    ///
+    /// On a cold boot (no prior xde state), `list_mcast_m2p` returns
+    /// an error or an empty list.
+    fn rehydrate_underlay_multicast_filters(&self) {
+        if self.inner.underlay_nics.is_empty() {
+            return;
+        }
+
+        let mappings = match self.list_mcast_m2p() {
+            Ok(m) => m,
+            Err(e) => {
+                // Expected on cold boot when xde has no prior state.
+                debug!(
+                    self.inner.log,
+                    "No M2P mappings to rehydrate";
+                    "error" => InlineErrorChain::new(&e),
+                );
+                return;
+            }
+        };
+
+        let mut failed: Vec<String> = Vec::new();
+        for mapping in &mappings {
+            if self.join_underlay_multicast_group(mapping.underlay) {
+                continue;
+            }
+            // Clear the surviving xde M2P entry so `converge_m2p` sees
+            // the gap on its next pass and re-issues `set_mcast_m2p`,
+            // which retries the underlay join. Without this, the entry
+            // stays in xde and convergence treats it as already
+            // converged, leaving the group black-holed until cycled
+            // inactive→active.
+ let clear_req = ClearMcast2Phys { + group: mapping.group, + underlay: mapping.underlay, + }; + if let Err(e) = self.clear_mcast_m2p(&clear_req) { + warn!( + self.inner.log, + "Failed to clear M2P after rehydration join failure, \ + group will stay black-holed until convergence retries"; + "group" => %mapping.group, + "underlay" => %mapping.underlay, + "error" => InlineErrorChain::new(&e), + ); + } + failed.push(mapping.underlay.to_string()); + } + + let total = mappings.len(); + let succeeded = total - failed.len(); + if !mappings.is_empty() { + info!( + self.inner.log, + "Rehydrated underlay multicast filter sockets"; + "succeeded" => succeeded, + "total" => total, + ); + } + if !failed.is_empty() { + warn!( + self.inner.log, + "Some underlay multicast filter sockets failed to \ + rehydrate; M2P entries cleared so convergence will \ + reissue on the next pass"; + "failed_count" => failed.len(), + "total" => total, + "failed_underlay_addrs" => ?failed, + ); + } } pub fn underlay_ip(&self) -> &Ipv6Addr { @@ -371,6 +541,7 @@ impl PortManager { firewall_rules, dhcp_config, attached_subnets: _, + multicast_groups, } = params; let is_service = matches!(nic.kind, NetworkInterfaceKind::Service { .. }); @@ -434,7 +605,7 @@ impl PortManager { .ports .lock() .unwrap() - .insert((nic.id, nic.kind), port.clone()); + .insert((nic.id, nic.kind), PortState::new(port.clone())); assert!( old.is_none(), "Duplicate OPTE port detected: interface_id = {}, kind = {:?}", @@ -553,6 +724,12 @@ impl PortManager { } drop(route_map); + // Configure multicast group subscriptions if any were + // provided at instance start. + if !multicast_groups.is_empty() { + self.multicast_groups_ensure(nic.id, nic.kind, multicast_groups)?; + } + info!( self.inner.log, "Created OPTE port"; @@ -620,13 +797,14 @@ impl PortManager { } // Note: We're deliberately holding both locks here - // to prevent several nexuses computng and applying deltas + // to prevent several nexuses computing and applying deltas // out of order. let ports = self.inner.ports.lock().unwrap(); let hdl = Handle::new()?; // Propagate deltas out to all ports. - for port in ports.values() { + for port_state in ports.values() { + let port = &port_state.port; // Fetch deltas for all router keys: system, IPv4 subnet, and IPv6 // subnet. let system_delta = deltas.get(&port.system_router_key()); @@ -714,11 +892,11 @@ impl PortManager { external_ips: &ExternalIpConfig, ) -> Result<(), Error> { let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { + let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::ExternalIpUpdateMissingPort(nic_id, nic_kind) })?; - self.external_ips_ensure_port(port, nic_id, external_ips) + self.external_ips_ensure_port(&port_state.port, nic_id, external_ips) } /// Ensure external IPs for an OPTE port are up to date. @@ -772,73 +950,478 @@ impl PortManager { Ok(()) } - /// Validate multicast group memberships for an OPTE port. - /// - /// This method validates multicast group configurations but does not yet - /// configure OPTE port-level multicast group membership. The actual - /// multicast forwarding is currently handled by the reconciler + DPD - /// at the dataplane switch level. - /// - /// TODO: Once OPTE kernel module supports multicast group APIs, this - /// method should be updated to configure OPTE port-level multicast - /// group membership. Note: multicast groups are fleet-scoped and can span - /// across VPCs. 
+    /// Ensure multicast group subscriptions for an OPTE port match the
+    /// requested set. This diffs current vs new state and issues
+    /// subscribe/unsubscribe ioctls as needed.
     pub fn multicast_groups_ensure(
         &self,
         nic_id: Uuid,
         nic_kind: NetworkInterfaceKind,
         multicast_groups: &[MulticastGroupCfg],
     ) -> Result<(), Error> {
-        let ports = self.inner.ports.lock().unwrap();
-        let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| {
-            Error::MulticastUpdateMissingPort(nic_id, nic_kind)
-        })?;
+        // Validate and build the new subscription set before acquiring locks.
+        let mut new_subs: HashMap<IpAddr, SourceFilter> = HashMap::new();
+        for group in multicast_groups {
+            if !group.group_ip.is_multicast() {
+                return Err(Error::InvalidPortIpConfig(format!(
+                    "not a multicast address: {}",
+                    group.group_ip,
+                )));
+            }
+            new_subs
+                .insert(group.group_ip, multicast_cfg_to_source_filter(group));
+        }
 
-        debug!(
+        let hdl = Handle::new()?;
+
+        let mut ports = self.inner.ports.lock().unwrap();
+        let port_state =
+            ports.get_mut(&(nic_id, nic_kind)).ok_or_else(|| {
+                Error::MulticastUpdateMissingPort(nic_id, nic_kind)
+            })?;
+        let port_name = port_state.port.name().to_string();
+
+        // Unsubscribe groups that are no longer requested.
+        let to_remove: Vec<IpAddr> = port_state
+            .mcast_subscriptions
+            .keys()
+            .filter(|g| !new_subs.contains_key(g))
+            .copied()
+            .collect();
+
+        let removed = to_remove.len();
+        for group_ip in &to_remove {
+            debug!(
+                self.inner.log,
+                "unsubscribing from multicast group";
+                "port" => &port_name,
+                "group" => %group_ip,
+            );
+
+            // Effectively infallible, as the IPs are verified as multicast,
+            // the operation is idempotent, and the port exists.
+            hdl.mcast_unsubscribe(&McastUnsubscribeReq {
+                port_name: port_name.clone(),
+                group: (*group_ip).into(),
+            })?;
+
+            port_state.mcast_subscriptions.remove(group_ip);
+        }
+
+        // Subscribe to new groups or update changed filters.
+        let mut added = 0usize;
+        for (group_ip, filter) in &new_subs {
+            let needs_subscribe =
+                match port_state.mcast_subscriptions.get(group_ip) {
+                    None => true,
+                    Some(current) => current != filter,
+                };
+
+            if needs_subscribe {
+                added += 1;
+                debug!(
+                    self.inner.log,
+                    "subscribing to multicast group";
+                    "port" => &port_name,
+                    "group" => %group_ip,
+                    "filter" => ?filter,
+                );
+
+                // Effectively infallible as the IPs are verified as multicast,
+                // the operation is idempotent, and the port exists.
+                hdl.mcast_subscribe(&McastSubscribeReq {
+                    port_name: port_name.clone(),
+                    group: (*group_ip).into(),
+                    filter: filter.clone(),
+                })?;
+
+                port_state
+                    .mcast_subscriptions
+                    .insert(*group_ip, filter.clone());
+            }
+        }
+
+        if added > 0 || removed > 0 {
+            info!(
+                self.inner.log,
+                "multicast subscriptions updated";
+                "port" => &port_name,
+                "added" => added,
+                "removed" => removed,
+                "active_groups" => port_state.mcast_subscriptions.len(),
+            );
+        } else {
+            debug!(
+                self.inner.log,
+                "multicast subscriptions reconciled, no change";
+                "port" => &port_name,
+                "active_groups" => port_state.mcast_subscriptions.len(),
+            );
+        }
+
+        Ok(())
+    }
+
+    /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE.
+    ///
+    /// This setter also programs the underlay NIC multicast MAC filters by
+    /// joining the underlay IPv6 multicast group on a UDP socket, ensuring the
+    /// NIC delivers frames to xde. See `mcast_underlay_sockets` docs.
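+    ///
+    /// A sketch of a call site (the addresses are hypothetical; the
+    /// underlay address must fall within ff04::/64):
+    ///
+    /// ```ignore
+    /// mgr.set_mcast_m2p(&Mcast2PhysMapping {
+    ///     group: "239.1.2.3".parse().unwrap(),
+    ///     underlay: "ff04::1:2:3".parse().unwrap(),
+    /// })?;
+    /// ```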
+ pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; + + info!( self.inner.log, - "Validating multicast group configuration for OPTE port"; - "port_name" => port.name(), - "nic_id" => ?nic_id, - "groups" => ?multicast_groups, + "Setting multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %addr, ); - // Validate multicast group configurations - for group in multicast_groups { - if !group.group_ip.is_multicast() { - error!( + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.set_m2p(&SetMcast2PhysReq { group: req.group.into(), underlay })?; + + self.join_underlay_multicast_group(addr); + + Ok(()) + } + + /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE. + /// + /// Drops the corresponding underlay MAC filter socket, removing the + /// NIC multicast MAC filter entry. + pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Clearing multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %addr, + ); + + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.clear_m2p(&ClearMcast2PhysReq { + group: req.group.into(), + underlay, + })?; + + self.leave_underlay_multicast_group(addr); + + Ok(()) + } + + /// Join an underlay IPv6 multicast group on all underlay NICs via a + /// UDP socket, programming the NIC's multicast MAC filters. + /// + /// On T6 hardware the NIC drops multicast frames unless the + /// corresponding MAC filter is installed. Joining the group on a + /// socket triggers `mac_multicast_add` in the kernel. The socket + /// receives no data, as xde intercepts first. + /// + /// The cxgbe driver supports [at most 336 multicast filter + /// entries][cxgbe-mcast-limit] per interface. Beyond that, joins + /// will fail and the NIC will not deliver frames for those groups. + /// + /// Failures are logged but not propagated. The M2P mapping in OPTE + /// is the primary requirement, and MAC filter programming is + /// best-effort (e.g., NIC transiently unplumbed at boot, cxgbe + /// multicast filter table exhausted). See [opte#908] for context. + /// + /// [cxgbe-mcast-limit]: https://github.com/oxidecomputer/illumos-gate/blob/c43b3b549678498219f87d7bb5882e9a9a904ade/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c#L759-L765 + /// [opte#908]: https://github.com/oxidecomputer/opte/issues/908 + fn join_underlay_multicast_group(&self, addr: Ipv6Addr) -> bool { + if self.inner.underlay_nics.is_empty() { + return false; + } + + let mut sockets = self.inner.mcast_underlay_sockets.lock().unwrap(); + if sockets.contains_key(&addr) { + return true; + } + + let sock = match UdpSocket::bind("[::]:0") { + Ok(s) => s, + Err(e) => { + warn!( self.inner.log, - "Invalid multicast IP address"; - "group_ip" => %group.group_ip, - "port_name" => port.name(), + "Failed to bind UDP socket for underlay multicast filter"; + "addr" => %addr, + "error" => %e, ); - return Err(Error::InvalidPortIpConfig(String::from( - "invalid multicast IP address", - ))); + return false; } + }; + + // Minimize the receive buffer. This socket exists solely to + // trigger MAC filter programming. xde intercepts packets before + // they reach the socket. The small buffer limits resource waste + // if that invariant is ever violated. 
+        if let Err(e) = sock.set_nonblocking(true) {
+            warn!(
+                self.inner.log,
+                "Failed to set underlay multicast socket non-blocking";
+                "addr" => %addr,
+                "error" => %e,
+            );
+        }
+        // The kernel may round up from 1 to its own minimum.
+        let _ = unsafe {
+            libc::setsockopt(
+                std::os::unix::io::AsRawFd::as_raw_fd(&sock),
+                libc::SOL_SOCKET,
+                libc::SO_RCVBUF,
+                &1i32 as *const i32 as *const libc::c_void,
+                std::mem::size_of::<i32>() as libc::socklen_t,
+            )
+        };
+
+        let joined_any = self
+            .inner
+            .underlay_nics
+            .iter()
+            .filter_map(|nic_name| {
+                let if_index = nix::net::if_::if_nametoindex(nic_name.as_str())
+                    .map_err(|e| {
+                        warn!(
+                            self.inner.log,
+                            "Failed to resolve underlay NIC index";
+                            "nic" => nic_name,
+                            "error" => %e,
+                        );
+                    })
+                    .ok()?;
+
+                sock.join_multicast_v6(&addr, if_index)
+                    .map_err(|e| {
+                        warn!(
+                            self.inner.log,
+                            "Failed to join underlay multicast group on NIC";
+                            "addr" => %addr,
+                            "nic" => nic_name,
+                            "if_index" => if_index,
+                            "error" => %e,
+                        );
+                    })
+                    .ok()?;
+
+                debug!(
+                    self.inner.log,
+                    "Joined underlay multicast group on NIC";
+                    "addr" => %addr,
+                    "nic" => nic_name,
+                    "if_index" => if_index,
+                );
+                Some(())
+            })
+            .count()
+            > 0;
+
+        if joined_any {
+            sockets.insert(addr, sock);
+            true
+        } else {
+            warn!(
+                self.inner.log,
+                "no NIC joins succeeded for underlay multicast group, \
+                 will retry on next call";
+                "addr" => %addr,
+            );
+            false
+        }
+    }
+
+    /// Drop the UDP socket for an underlay multicast address, removing
+    /// the NIC MAC filter entries.
+    fn leave_underlay_multicast_group(&self, addr: Ipv6Addr) {
+        let mut sockets = self.inner.mcast_underlay_sockets.lock().unwrap();
+        if sockets.remove(&addr).is_some() {
+            debug!(
+                self.inner.log,
+                "Removed underlay multicast filter socket";
+                "addr" => %addr,
+            );
         }
     }
 
-        // TODO: Configure firewall rules to allow multicast traffic.
-        // Add exceptions in source/dest MAC/L3 addr checking for multicast
-        // addresses matching known groups, only doing cidr-checking on the
-        // multicasst destination side.
+    /// Set multicast forwarding next hops for an underlay group address.
+    pub fn set_mcast_fwd(
+        &self,
+        req: &McastForwardingEntry,
+    ) -> Result<(), Error> {
+        // Safe to unwrap: 77 is well within the 24-bit VNI range.
+        let mcast_vni =
+            Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap();
+        let addr: Ipv6Addr = req.underlay;
         info!(
             self.inner.log,
-            "OPTE port configured for multicast traffic";
-            "port_name" => port.name(),
-            "ipv4_range" => %IPV4_MULTICAST_RANGE,
-            "ipv6_range" => %IPV6_MULTICAST_RANGE,
-            "multicast_groups" => multicast_groups.len(),
+            "Setting multicast forwarding";
+            "underlay" => %addr,
+            "next_hops" => req.next_hops.len(),
         );
 
-        // TODO: Configure OPTE port for specific multicast group membership
-        // once OPTE kernel module APIs are available. This is distinct from
-        // zone vNIC underlay configuration (see instance.rs
-        // `join_multicast_group_inner`).
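+        // Translate the sled-agent multicast types into their oxide_vpc
+        // counterparts: validate the underlay address (ff04::/64), then
+        // map each next hop's replication mode and source filter.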
+        let underlay = MulticastUnderlay::new(addr.into())
+            .map_err(|_| Error::InvalidMcastUnderlay(addr))?;
+        let next_hops = req
+            .next_hops
+            .iter()
+            .map(|nexthop| oxide_vpc::api::McastForwardingNextHop {
+                next_hop: oxide_vpc::api::NextHopV6 {
+                    addr: nexthop.next_hop.into(),
+                    vni: mcast_vni,
+                },
+                replication: match nexthop.replication {
+                    McastReplication::External => {
+                        oxide_vpc::api::Replication::External
+                    }
+                    McastReplication::Underlay => {
+                        oxide_vpc::api::Replication::Underlay
+                    }
+                    McastReplication::Both => oxide_vpc::api::Replication::Both,
+                },
+                source_filter: match nexthop.filter.mode {
+                    McastFilterMode::Include => SourceFilter::Include(
+                        nexthop
+                            .filter
+                            .sources
+                            .iter()
+                            .copied()
+                            .map(Into::into)
+                            .collect(),
+                    ),
+                    McastFilterMode::Exclude => SourceFilter::Exclude(
+                        nexthop
+                            .filter
+                            .sources
+                            .iter()
+                            .copied()
+                            .map(Into::into)
+                            .collect(),
+                    ),
+                },
+            })
+            .collect();
+        let hdl = Handle::new()?;
+        hdl.set_mcast_fwd(&SetMcastForwardingReq { underlay, next_hops })?;
+        Ok(())
+    }
+
+    /// Remove all multicast forwarding entries for an underlay group address.
+    pub fn clear_mcast_fwd(
+        &self,
+        req: &ClearMcastForwarding,
+    ) -> Result<(), Error> {
+        let addr: Ipv6Addr = req.underlay;
+        info!(
+            self.inner.log,
+            "Clearing multicast forwarding";
+            "underlay" => %addr,
+        );
+
+        let underlay = MulticastUnderlay::new(addr.into())
+            .map_err(|_| Error::InvalidMcastUnderlay(addr))?;
+        let hdl = Handle::new()?;
+        hdl.clear_mcast_fwd(&ClearMcastForwardingReq { underlay })?;
         Ok(())
     }
 
+    /// Dump all multicast overlay-to-underlay (M2P) mappings from OPTE.
+    pub fn list_mcast_m2p(&self) -> Result<Vec<Mcast2PhysMapping>, Error> {
+        let hdl = Handle::new()?;
+        let resp = hdl.dump_m2p()?;
+        let mappings = resp
+            .ip4
+            .into_iter()
+            .map(|(group, underlay)| Mcast2PhysMapping {
+                group: IpAddr::V4(group.into()),
+                underlay: Ipv6Addr::from(underlay.addr()),
+            })
+            .chain(resp.ip6.into_iter().map(|(group, underlay)| {
+                Mcast2PhysMapping {
+                    group: IpAddr::V6(group.into()),
+                    underlay: Ipv6Addr::from(underlay.addr()),
+                }
+            }))
+            .collect();
+        Ok(mappings)
+    }
+
+    /// Dump all multicast forwarding entries from OPTE.
+    pub fn list_mcast_fwd(&self) -> Result<Vec<McastForwardingEntry>, Error> {
+        let hdl = Handle::new()?;
+        let resp = hdl.dump_mcast_fwd()?;
+        resp.entries
+            .into_iter()
+            .map(|entry| {
+                let next_hops = entry
+                    .next_hops
+                    .into_iter()
+                    .filter_map(|nexthop| {
+                        let replication = match nexthop.replication {
+                            oxide_vpc::api::Replication::External => {
+                                McastReplication::External
+                            }
+                            oxide_vpc::api::Replication::Underlay => {
+                                McastReplication::Underlay
+                            }
+                            oxide_vpc::api::Replication::Both => {
+                                McastReplication::Both
+                            }
+                            oxide_vpc::api::Replication::Reserved => {
+                                // Reserved is a 2-bit padding value with
+                                // no valid semantic meaning. Its presence
+                                // in the forwarding table indicates a bug
+                                // or manual opteadm intervention. Skip
+                                // this hop rather than failing the entire
+                                // list so the reconciler can still program
+                                // valid next-hops.
+                                warn!(
+                                    self.inner.log,
+                                    "skipping next hop with Reserved \
+                                     replication mode";
+                                    "next_hop" => %nexthop.next_hop.addr
+                                );
+                                return None;
+                            }
+                        };
+
+                        Some(McastForwardingNextHop {
+                            next_hop: nexthop.next_hop.addr.into(),
+                            replication,
+                            filter: McastSourceFilter {
+                                mode: match nexthop.source_filter.mode() {
+                                    FilterMode::Include => {
+                                        McastFilterMode::Include
+                                    }
+                                    FilterMode::Exclude => {
+                                        McastFilterMode::Exclude
+                                    }
+                                },
+                                sources: nexthop
+                                    .source_filter
+                                    .sources()
+                                    .iter()
+                                    .copied()
+                                    .map(Into::into)
+                                    .collect(),
+                            },
+                        })
+                    })
+                    .collect();
+
+                Ok(McastForwardingEntry {
+                    underlay: Ipv6Addr::from(entry.underlay.addr()),
+                    next_hops,
+                })
+            })
+            .collect()
+    }
+
     pub fn firewall_rules_ensure(
         &self,
         vni: external::Vni,
@@ -856,10 +1439,11 @@ impl PortManager {

         // We update VPC rules as a set so grab only
         // the relevant ports using the VPC's VNI.
-        let vpc_ports = ports
-            .iter()
-            .filter(|((_, _), port)| u32::from(vni) == u32::from(*port.vni()));
-        for ((_, _), port) in vpc_ports {
+        let vpc_ports = ports.iter().filter(|((_, _), port_state)| {
+            u32::from(vni) == u32::from(*port_state.port.vni())
+        });
+        for ((_, _), port_state) in vpc_ports {
+            let port = &port_state.port;
             let rules = opte_firewall_rules(rules, port.vni(), port.mac());
             let port_name = port.name().to_string();
             info!(
@@ -969,7 +1553,7 @@ impl PortManager {
         ensure_added: Vec<AttachedSubnet>,
     ) -> EnsureAttachedSubnetResult {
         let ports = self.inner.ports.lock().unwrap();
-        let Some(port) = ports.get(&(nic_id, nic_kind)) else {
+        let Some(port_state) = ports.get(&(nic_id, nic_kind)) else {
             return EnsureAttachedSubnetResult {
                 diff: Default::default(),
                 error: Some(Error::AttachedSubnetUpdateMissingPort(
@@ -977,7 +1561,11 @@ impl PortManager {
                 )),
             };
         };
-        self.attached_subnets_ensure_port(port, ensure_removed, ensure_added)
+        self.attached_subnets_ensure_port(
+            &port_state.port,
+            ensure_removed,
+            ensure_added,
+        )
     }

     fn attached_subnets_ensure_port(
@@ -1029,10 +1617,10 @@ impl PortManager {
         subnet: AttachedSubnet,
     ) -> Result<(), Error> {
         let ports = self.inner.ports.lock().unwrap();
-        let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| {
+        let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| {
             Error::AttachedSubnetUpdateMissingPort(nic_id, nic_kind)
         })?;
-        self.attach_subnet_port(port, subnet)
+        self.attach_subnet_port(&port_state.port, subnet)
     }

     fn attach_subnet_port(
@@ -1078,10 +1666,10 @@ impl PortManager {
         subnet: IpCidr,
     ) -> Result<(), Error> {
         let ports = self.inner.ports.lock().unwrap();
-        let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| {
+        let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| {
             Error::AttachedSubnetUpdateMissingPort(nic_id, nic_kind)
         })?;
-        self.detach_subnet_port(port, subnet)
+        self.detach_subnet_port(&port_state.port, subnet)
     }

     fn detach_subnet_port(
@@ -1154,7 +1742,7 @@ impl PortTicket {

     fn release_inner(&mut self) -> Result<(), Error> {
         let mut ports = self.manager.ports.lock().unwrap();
-        let Some(port) = ports.remove(&(self.id, self.kind)) else {
+        let Some(port_state) = ports.remove(&(self.id, self.kind)) else {
             error!(
                 self.manager.log,
                 "Tried to release non-existent port";
@@ -1163,6 +1751,7 @@ impl PortTicket {
             );
             return Err(Error::ReleaseMissingPort(self.id, self.kind));
         };
+        let port = &port_state.port;
         drop(ports);

         // Cleanup the set of subnets we want to receive routes for.
@@ -1199,7 +1788,7 @@ impl PortTicket { "Removed OPTE port from manager"; "id" => ?&self.id, "kind" => ?&self.kind, - "port" => ?&port, + "port" => ?&port_state, ); Ok(()) } @@ -1229,6 +1818,9 @@ impl Drop for PortTicket { mod tests { use super::PortCreateParams; use super::PortManager; + #[cfg(target_os = "illumos")] + use crate::addrobj::AddrObject; + use crate::opte::Error; use crate::opte::Handle; use macaddr::MacAddr6; use omicron_common::api::external::{MacAddr, Vni}; @@ -1242,9 +1834,11 @@ mod tests { use omicron_common::api::internal::shared::RouterVersion; use omicron_test_utils::dev::test_setup_log; use oxide_vpc::api::DhcpCfg; + use oxide_vpc::api::FilterMode; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cidr; use oxide_vpc::api::Ipv6Cidr; + use oxide_vpc::api::SourceFilter; use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; @@ -1255,17 +1849,75 @@ mod tests { use sled_agent_types::inventory::NetworkInterfaceKind; use sled_agent_types::inventory::SourceNatConfigV4; use sled_agent_types::inventory::SourceNatConfigV6; + use sled_agent_types::multicast::MulticastGroupCfg; use std::collections::HashSet; + use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; + #[cfg(target_os = "illumos")] + use std::time::Duration; + #[cfg(target_os = "illumos")] + use std::time::Instant; use uuid::Uuid; + // Maximum ephemeral port number for source NAT (14-bit range). + const MAX_PORT: u16 = (1 << 14) - 1; + + /// Loopback interface name on illumos. Tests that verify kernel + /// IPv6 multicast membership are illumos-only because they shell + /// out to illumos's `netstat -g -f inet6`. + #[cfg(target_os = "illumos")] + const LOOPBACK_IF: &str = "lo0"; + + /// Returns `true` iff `netstat -g -f inet6` reports `group` as a + /// membership on `interface`. + /// + /// Used to verify that `join_multicast_v6`/leave on the filter + /// socket actually reached the kernel's IP layer for the named + /// underlay NIC, rather than just updating the in-process + /// `mcast_underlay_sockets` map. + #[cfg(target_os = "illumos")] + fn netstat_v6_has_membership(interface: &str, group: &Ipv6Addr) -> bool { + let out = std::process::Command::new("netstat") + .args(["-g", "-n", "-f", "inet6"]) + .output() + .expect("netstat -g invocation failed"); + let group_str = group.to_string(); + String::from_utf8_lossy(&out.stdout).lines().any(|line| { + let mut fields = line.split_whitespace(); + if let (Some(iface), Some(grp)) = (fields.next(), fields.next()) { + iface == interface && grp == group_str + } else { + false + } + }) + } + + /// Poll `netstat -g` until membership matches `expected`, panicking + /// on timeout. The kernel should update synchronously on the join + /// or leave syscall, but polling absorbs possible transient delay. + #[cfg(target_os = "illumos")] + fn poll_v6_membership(interface: &str, group: &Ipv6Addr, expected: bool) { + let deadline = Instant::now() + Duration::from_secs(5); + while Instant::now() < deadline { + if netstat_v6_has_membership(interface, group) == expected { + return; + } + std::thread::sleep(Duration::from_millis(100)); + } + panic!( + "timeout: membership for {group} on {interface} expected {}", + if expected { "present" } else { "absent" } + ); + } + // Regression for https://github.com/oxidecomputer/omicron/issues/7541. 
#[test] fn multiple_ports_does_not_destroy_default_route() { let logctx = test_setup_log("multiple_ports_does_not_destroy_default_route"); - let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + let manager = + PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST, &[]); let default_ipv4_route = IpNet::V4(Ipv4Net::new(Ipv4Addr::UNSPECIFIED, 0).unwrap()); let default_ipv6_route = @@ -1310,7 +1962,6 @@ mod tests { }), v6: None, }; - const MAX_PORT: u16 = (1 << 14) - 1; let (port0, _ticket0) = manager .create_port(PortCreateParams { nic: &NetworkInterface { @@ -1335,6 +1986,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1514,6 +2166,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1685,6 +2338,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv4(oxide_vpc::api::Ipv4Cfg { vpc_subnet, @@ -1758,6 +2412,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv6(oxide_vpc::api::Ipv6Cfg { vpc_subnet, @@ -1842,6 +2497,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::DualStack { ipv4, ipv6 } = IpCfg::try_from(&prs).unwrap() else { @@ -1932,6 +2588,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv6 and private IPv4", @@ -1978,9 +2635,453 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv4 and private IPv6", ); } + + #[test] + fn multicast_groups_ensure_diffing() { + let logctx = test_setup_log("multicast_groups_ensure_diffing"); + let manager = + PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST, &[]); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = ExternalIpConfig { + v4: Some(ExternalIpv4Config { + source_nat: Some( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ), + ..Default::default() + }), + v6: None, + }; + + // Bindings keep the port registered in the manager for this scope. + let (_port, _ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + .unwrap(); + + let group1: IpAddr = "239.1.1.1".parse().unwrap(); + let group2: IpAddr = "239.1.1.2".parse().unwrap(); + let source_a: IpAddr = "10.0.0.1".parse().unwrap(); + + // Subscribe to two groups: one ASM, one SSM. 
+ manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[ + MulticastGroupCfg { group_ip: group1, sources: vec![] }, + MulticastGroupCfg { + group_ip: group2, + sources: vec![source_a], + }, + ], + ) + .unwrap(); + + // Verify port manager tracking. + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(port_state.mcast_subscriptions.len(), 2); + assert_eq!( + *port_state.mcast_subscriptions.get(&group1).unwrap(), + SourceFilter::default(), + ); + assert_eq!( + port_state.mcast_subscriptions.get(&group2).unwrap().mode(), + FilterMode::Include, + ); + } + + // Verify mock OPTE state matches. + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 2); + assert!(port.mcast_subscriptions.contains_key(&group1)); + assert!(port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove group2, keep group1. + manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group1, sources: vec![] }], + ) + .unwrap(); + + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(port_state.mcast_subscriptions.len(), 1); + assert!(port_state.mcast_subscriptions.contains_key(&group1)); + assert!(!port_state.mcast_subscriptions.contains_key(&group2)); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 1); + assert!(!port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove all groups. + manager.multicast_groups_ensure(nic_id, nic_kind, &[]).unwrap(); + + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert!(port_state.mcast_subscriptions.is_empty()); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert!(port.mcast_subscriptions.is_empty()); + } + + logctx.cleanup_successful(); + } + + #[test] + fn multicast_port_deletion_cleanup() { + let logctx = test_setup_log("multicast_port_deletion_cleanup"); + let manager = + PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST, &[]); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = ExternalIpConfig { + v4: Some(ExternalIpv4Config { + source_nat: Some( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ), + ..Default::default() + }), + v6: None, + }; + + let (_port, ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + 
.unwrap();
+
+        let group1: IpAddr = "239.2.2.1".parse().unwrap();
+
+        // Subscribe to a multicast group.
+        manager
+            .multicast_groups_ensure(
+                nic_id,
+                nic_kind,
+                &[MulticastGroupCfg { group_ip: group1, sources: vec![] }],
+            )
+            .unwrap();
+
+        // Verify subscription tracking exists.
+        {
+            let ports = manager.inner.ports.lock().unwrap();
+            let port_state = ports.get(&(nic_id, nic_kind)).unwrap();
+            assert_eq!(
+                port_state.mcast_subscriptions.len(),
+                1,
+                "subscription tracking should exist before release"
+            );
+        }
+
+        // Release the port ticket, which should clean up the port
+        // and its subscription tracking.
+        ticket.release();
+
+        // Verify port is removed entirely.
+        {
+            let ports = manager.inner.ports.lock().unwrap();
+            assert!(
+                !ports.contains_key(&(nic_id, nic_kind)),
+                "port should be removed after release"
+            );
+        }
+
+        logctx.cleanup_successful();
+    }
+
+    #[test]
+    fn multicast_ensure_missing_port_error() {
+        let logctx = test_setup_log("multicast_ensure_missing_port_error");
+        let manager =
+            PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST, &[]);
+
+        let nic_id = Uuid::new_v4();
+        let nic_kind = NetworkInterfaceKind::Instance { id: Uuid::new_v4() };
+        let group: IpAddr = "239.3.3.1".parse().unwrap();
+
+        let res = manager.multicast_groups_ensure(
+            nic_id,
+            nic_kind,
+            &[MulticastGroupCfg { group_ip: group, sources: vec![] }],
+        );
+
+        match res {
+            Err(Error::MulticastUpdateMissingPort(id, kind)) => {
+                assert_eq!(id, nic_id);
+                assert_eq!(kind, nic_kind);
+            }
+            other => {
+                panic!("expected MulticastUpdateMissingPort, got {other:?}")
+            }
+        }
+
+        logctx.cleanup_successful();
+    }
+
+    /// Verify that `set_mcast_m2p` programs underlay NIC multicast MAC
+    /// filters via UDP socket join and that `clear_mcast_m2p` removes them.
+    ///
+    /// Asserts both the in-process `mcast_underlay_sockets` bookkeeping
+    /// and kernel-level IPv6 group membership on the underlay interface
+    /// (observable via `netstat -g -f inet6`). Kernel-level verification
+    /// is what ensures `join_multicast_v6` actually reached IP and, on
+    /// actual hardware, would drive `mac_multicast_add` to program the
+    /// NIC filter.
+    #[cfg(target_os = "illumos")]
+    #[test]
+    fn underlay_multicast_mac_filter_lifecycle() {
+        let logctx = test_setup_log("underlay_multicast_mac_filter_lifecycle");
+        let nics = vec![AddrObject::new_control(LOOPBACK_IF).unwrap()];
+        let manager =
+            PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST, &nics);
+
+        let handle = Handle::new().unwrap();
+        handle.set_xde_underlay("underlay0", "underlay1").unwrap();
+
+        // ff04::1 is within the underlay multicast subnet.
+        let underlay: Ipv6Addr = "ff04::1".parse().unwrap();
+        let group: IpAddr = "239.10.10.1".parse().unwrap();
+
+        let req =
+            sled_agent_types::multicast::Mcast2PhysMapping { group, underlay };
+
+        // Preflight check: the group must not already be joined on the
+        // underlay interface.
+        assert!(
+            !netstat_v6_has_membership(LOOPBACK_IF, &underlay),
+            "unexpected pre-existing membership {underlay} on {LOOPBACK_IF}",
+        );
+
+        // Set M2P -> socket should be created and kernel should show join.
+        manager.set_mcast_m2p(&req).unwrap();
+        {
+            let sockets = manager.inner.mcast_underlay_sockets.lock().unwrap();
+            assert!(
+                sockets.contains_key(&underlay),
+                "Socket should exist after set_mcast_m2p"
+            );
+        }
+        poll_v6_membership(LOOPBACK_IF, &underlay, true);
+
+        // Setting the same M2P again should be idempotent.
+ manager.set_mcast_m2p(&req).unwrap(); + { + let sockets = manager.inner.mcast_underlay_sockets.lock().unwrap(); + assert_eq!( + sockets.len(), + 1, + "Duplicate set_mcast_m2p should not create extra sockets" + ); + } + assert!( + netstat_v6_has_membership(LOOPBACK_IF, &underlay), + "membership should still be present after idempotent re-set" + ); + + // Clear M2P -> socket should be removed and kernel membership gone. + let clear_req = + sled_agent_types::multicast::ClearMcast2Phys { group, underlay }; + manager.clear_mcast_m2p(&clear_req).unwrap(); + { + let sockets = manager.inner.mcast_underlay_sockets.lock().unwrap(); + assert!( + !sockets.contains_key(&underlay), + "Socket should be removed after clear_mcast_m2p" + ); + } + poll_v6_membership(LOOPBACK_IF, &underlay, false); + + logctx.cleanup_successful(); + } + + /// Verify that rehydration at startup reopens filter sockets for + /// M2P mappings that survived in mock xde state across a + /// PortManager drop (simulating sled-agent restart). + #[cfg(target_os = "illumos")] + #[test] + fn underlay_multicast_mac_filter_rehydration() { + let logctx = + test_setup_log("underlay_multicast_mac_filter_rehydration"); + let nics = vec![AddrObject::new_control(LOOPBACK_IF).unwrap()]; + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + // Use a distinct underlay address to avoid collisions with + // other tests sharing the static OPTE_STATE. + let underlay: Ipv6Addr = "ff04::99".parse().unwrap(); + let group: IpAddr = "239.10.10.99".parse().unwrap(); + + let req = + sled_agent_types::multicast::Mcast2PhysMapping { group, underlay }; + + // Phase 1: first PortManager sets M2P (populates mock xde state). + { + let mgr1 = PortManager::new( + logctx.log.clone(), + Ipv6Addr::LOCALHOST, + &nics, + ); + mgr1.set_mcast_m2p(&req).unwrap(); + { + let sockets = mgr1.inner.mcast_underlay_sockets.lock().unwrap(); + assert!(sockets.contains_key(&underlay)); + } + poll_v6_membership(LOOPBACK_IF, &underlay, true); + } + + // mgr1 dropped: socket closed, kernel membership removed. + poll_v6_membership(LOOPBACK_IF, &underlay, false); + + // Mock xde state (static) still has the M2P entry, simulating + // xde kernel state surviving a sled-agent restart. + { + let hdl = Handle::new().unwrap(); + let dump = hdl.dump_m2p().unwrap(); + assert!( + !dump.ip4.is_empty() || !dump.ip6.is_empty(), + "Mock xde should still hold the M2P mapping after drop" + ); + } + + // Phase 2: new PortManager rehydrates from surviving xde state. + let mgr2 = + PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST, &nics); + { + let sockets = mgr2.inner.mcast_underlay_sockets.lock().unwrap(); + assert!( + sockets.contains_key(&underlay), + "Rehydration should reopen socket for surviving M2P" + ); + } + poll_v6_membership(LOOPBACK_IF, &underlay, true); + + // Cleanup and clear the M2P. + let clear_req = + sled_agent_types::multicast::ClearMcast2Phys { group, underlay }; + mgr2.clear_mcast_m2p(&clear_req).unwrap(); + poll_v6_membership(LOOPBACK_IF, &underlay, false); + + logctx.cleanup_successful(); + } + + /// Verify that no sockets are created when no underlay NICs are + /// configured (test/sim mode). 
+    #[test]
+    fn underlay_multicast_mac_filter_no_nics() {
+        let logctx = test_setup_log("underlay_multicast_mac_filter_no_nics");
+        let manager =
+            PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST, &[]);
+
+        let handle = Handle::new().unwrap();
+        handle.set_xde_underlay("underlay0", "underlay1").unwrap();
+
+        let underlay: Ipv6Addr = "ff04::2".parse().unwrap();
+        let group: IpAddr = "239.10.10.2".parse().unwrap();
+
+        let req =
+            sled_agent_types::multicast::Mcast2PhysMapping { group, underlay };
+
+        manager.set_mcast_m2p(&req).unwrap();
+        {
+            let sockets = manager.inner.mcast_underlay_sockets.lock().unwrap();
+            assert!(
+                sockets.is_empty(),
+                "No sockets should be created without underlay NICs"
+            );
+        }
+
+        logctx.cleanup_successful();
+    }
 }
diff --git a/internal-dns/resolver/src/resolver.rs b/internal-dns/resolver/src/resolver.rs
index 9ce3d6aa48d..a69e48b1cb3 100644
--- a/internal-dns/resolver/src/resolver.rs
+++ b/internal-dns/resolver/src/resolver.rs
@@ -345,6 +345,75 @@ impl Resolver {
         }
     }

+    /// Returns the SRV targets paired with their resolved IPv6 sockets.
+    ///
+    /// Like [`Resolver::lookup_all_socket_v6`], but preserves the SRV
+    /// target name so callers can correlate sockets back to their
+    /// source. Per-target IPv6 lookups are best-effort: failures for a
+    /// given target are logged and the entry is dropped from the result.
+    pub async fn lookup_all_socket_v6_by_target(
+        &self,
+        service: ServiceName,
+    ) -> Result<Vec<(String, SocketAddrV6)>, ResolveError> {
+        let name = service.srv_name();
+        trace!(self.log, "lookup_all_socket_v6_by_target srv"; "dns_name" => &name);
+        let response = self.resolver.srv_lookup(&name).await?;
+        debug!(
+            self.log,
+            "lookup_all_socket_v6_by_target srv";
+            "dns_name" => &name,
+            "response" => ?response
+        );
+
+        let futs = std::iter::repeat((self.log.clone(), self.resolver.clone()))
+            .zip(response.into_iter())
+            .map(|((log, resolver), srv)| async move {
+                let target = srv.target().to_string();
+                let port = srv.port();
+                trace!(
+                    log,
+                    "lookup_all_socket_v6_by_target: looking up SRV target";
+                    "name" => &target,
+                );
+                resolver
+                    .ipv6_lookup(target.clone())
+                    .await
+                    .map(|ips| (target.clone(), ips, port))
+                    .map_err(|err| (target, err))
+            });
+
+        let log = self.log.clone();
+        let pairs: Vec<(String, SocketAddrV6)> = futures::future::join_all(
+            futs,
+        )
+        .await
+        .into_iter()
+        .flat_map(move |res| match res {
+            Ok((target, ips, port)) => ips
+                .into_iter()
+                .map(|ip| {
+                    (target.clone(), SocketAddrV6::new(ip.into(), port, 0, 0))
+                })
+                .collect::<Vec<_>>(),
+            Err((target, err)) => {
+                error!(
+                    log,
+                    "lookup_all_socket_v6_by_target: failed looking up target";
+                    "name" => %target,
+                    "error" => ?err,
+                );
+                Vec::new()
+            }
+        })
+        .collect();
+
+        if pairs.is_empty() {
+            Err(ResolveError::NotFound(service))
+        } else {
+            Ok(pairs)
+        }
+    }
+
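For illustration, a consumer that wants one client per switch zone can key on the returned target name so multiple AAAA records for the same zone collapse to one entry. This is a sketch under assumptions, not code from this change: `ddm_clients` is hypothetical, and the crate paths for `Resolver`/`ServiceName` are assumed from this repo's layout.

    use internal_dns_resolver::Resolver;
    use internal_dns_types::names::ServiceName;
    use std::collections::BTreeMap;
    use std::net::SocketAddrV6;

    async fn ddm_clients(
        resolver: &Resolver,
    ) -> anyhow::Result<BTreeMap<String, SocketAddrV6>> {
        let pairs =
            resolver.lookup_all_socket_v6_by_target(ServiceName::Ddm).await?;
        let mut by_zone = BTreeMap::new();
        for (target, sockaddr) in pairs {
            // `target` is the SRV target hostname (one per switch zone);
            // `sockaddr` is one resolved [IPv6]:port for that target.
            // Keeping the first socket per target dedupes repeated records.
            by_zone.entry(target).or_insert(sockaddr);
        }
        Ok(by_zone)
    }

     // Returns an iterator of SocketAddrs for the specified SRV name.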
// // Acts on a raw string for compatibility with the reqwest::dns::Resolve diff --git a/internal-dns/types/src/config.rs b/internal-dns/types/src/config.rs index d5bef144343..f9d51051366 100644 --- a/internal-dns/types/src/config.rs +++ b/internal-dns/types/src/config.rs @@ -399,6 +399,7 @@ impl DnsConfigBuilder { dendrite_port: u16, mgs_port: u16, mgd_port: u16, + ddm_port: u16, ) -> anyhow::Result<()> { let zone = self.host_dendrite(sled_id, switch_zone_ip)?; self.service_backend_zone(ServiceName::Dendrite, &zone, dendrite_port)?; @@ -407,7 +408,8 @@ impl DnsConfigBuilder { &zone, mgs_port, )?; - self.service_backend_zone(ServiceName::Mgd, &zone, mgd_port) + self.service_backend_zone(ServiceName::Mgd, &zone, mgd_port)?; + self.service_backend_zone(ServiceName::Ddm, &zone, ddm_port) } /// Higher-level shorthand for adding a Nexus zone with both its internal @@ -779,6 +781,8 @@ mod test { "_oximeter-reader._tcp", ); assert_eq!(ServiceName::Dendrite.dns_name(), "_dendrite._tcp",); + assert_eq!(ServiceName::Mgd.dns_name(), "_mgd._tcp",); + assert_eq!(ServiceName::Ddm.dns_name(), "_ddm._tcp",); assert_eq!( ServiceName::CruciblePantry.dns_name(), "_crucible-pantry._tcp", @@ -796,6 +800,33 @@ mod test { ); } + #[test] + fn host_zone_switch_publishes_all_services() { + let sled_uuid: SledUuid = + "001de000-51ed-4000-8000-000000000001".parse().unwrap(); + let switch_zone_ip = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1); + + let mut b = DnsConfigBuilder::new(); + b.host_zone_switch(sled_uuid, switch_zone_ip, 1, 2, 3, 4).unwrap(); + let config = b.build_full_config_for_initial_generation(); + + let services: std::collections::BTreeSet<_> = config + .zones + .iter() + .flat_map(|z| z.records.iter()) + .map(|(name, _)| name.as_str()) + .collect(); + for expected in + ["_dendrite._tcp", "_mgs._tcp", "_mgd._tcp", "_ddm._tcp"] + { + assert!( + services.contains(expected), + "expected {expected} in published switch-zone services; \ + got {services:?}" + ); + } + } + #[test] fn display_hosts() { let sled_uuid = SledUuid::nil(); diff --git a/internal-dns/types/src/names.rs b/internal-dns/types/src/names.rs index 73b2439e48e..105d0222f3c 100644 --- a/internal-dns/types/src/names.rs +++ b/internal-dns/types/src/names.rs @@ -75,6 +75,7 @@ pub enum ServiceName { BoundaryNtp, InternalNtp, Mgd, + Ddm, } impl ServiceName { @@ -116,6 +117,7 @@ impl ServiceName { ServiceName::BoundaryNtp => "boundary-ntp", ServiceName::InternalNtp => "internal-ntp", ServiceName::Mgd => "mgd", + ServiceName::Ddm => "ddm", } } @@ -144,7 +146,8 @@ impl ServiceName { | ServiceName::CruciblePantry | ServiceName::BoundaryNtp | ServiceName::InternalNtp - | ServiceName::Mgd => { + | ServiceName::Mgd + | ServiceName::Ddm => { format!("_{}._tcp", self.service_kind()) } ServiceName::SledAgent(id) => { diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index c877645a239..e3fb881f972 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -923,18 +923,6 @@ pub struct MulticastGroupReconcilerConfig { #[serde_as(as = "DurationSeconds")] pub period_secs: Duration, - /// TTL (in seconds) for the sled-to-switch-port mapping cache. - /// - /// This cache maps sled IDs to their physical switch ports. It changes when - /// sleds are added/removed or inventory is updated. 
- /// - /// Default: 3600 seconds (1 hour) - #[serde( - default = "MulticastGroupReconcilerConfig::default_sled_cache_ttl_secs" - )] - #[serde_as(as = "DurationSeconds")] - pub sled_cache_ttl_secs: Duration, - /// TTL (in seconds) for the backplane hardware topology cache. /// /// This cache stores the hardware platform's port mapping. It effectively @@ -949,10 +937,6 @@ pub struct MulticastGroupReconcilerConfig { } impl MulticastGroupReconcilerConfig { - const fn default_sled_cache_ttl_secs() -> Duration { - Duration::from_secs(3600) // 1 hour - } - const fn default_backplane_cache_ttl_secs() -> Duration { Duration::from_secs(86400) // 24 hours } @@ -962,7 +946,6 @@ impl Default for MulticastGroupReconcilerConfig { fn default() -> Self { Self { period_secs: Duration::from_secs(60), - sled_cache_ttl_secs: Self::default_sled_cache_ttl_secs(), backplane_cache_ttl_secs: Self::default_backplane_cache_ttl_secs(), } } @@ -1585,7 +1568,6 @@ mod test { }, multicast_reconciler: MulticastGroupReconcilerConfig { period_secs: Duration::from_secs(60), - sled_cache_ttl_secs: MulticastGroupReconcilerConfig::default_sled_cache_ttl_secs(), backplane_cache_ttl_secs: MulticastGroupReconcilerConfig::default_backplane_cache_ttl_secs(), }, trust_quorum: TrustQuorumConfig { diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index ba6a93b22bf..688dded79c1 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -150,6 +150,7 @@ nexus-reconfigurator-rendezvous.workspace = true nexus-types.workspace = true nexus-types-versions.workspace = true omicron-common.workspace = true +omicron-ddm-admin-client.workspace = true omicron-passwords.workspace = true oxide-tokio-rt.workspace = true oximeter.workspace = true diff --git a/nexus/db-queries/src/db/datastore/multicast/groups.rs b/nexus/db-queries/src/db/datastore/multicast/groups.rs index 0fb1b6e1e2b..4f78463b849 100644 --- a/nexus/db-queries/src/db/datastore/multicast/groups.rs +++ b/nexus/db-queries/src/db/datastore/multicast/groups.rs @@ -408,7 +408,7 @@ impl DataStore { use nexus_db_schema::schema::multicast_group_member; let now = Utc::now(); - // Atomic: only mark `Deleting` if no active members exist. + // Atomically mark "Deleting" only if no active members exist. 
let rows = diesel::update(multicast_group::table) .filter(multicast_group::id.eq(group_id.into_untyped_uuid())) .filter( diff --git a/nexus/db-queries/src/db/datastore/multicast/members.rs b/nexus/db-queries/src/db/datastore/multicast/members.rs index 1c2d25a703b..e95e924a6cf 100644 --- a/nexus/db-queries/src/db/datastore/multicast/members.rs +++ b/nexus/db-queries/src/db/datastore/multicast/members.rs @@ -2285,7 +2285,7 @@ mod tests { assert_eq!(unchanged_member.state, MulticastGroupMemberState::Joined); assert_eq!(unchanged_member.time_modified, before_modification); - // Test starting instance that has no multicast memberships (should be no-op) + // Test starting instance that has no multicast memberships (should be noop) let non_member_instance = InstanceUuid::new_v4(); datastore .multicast_group_member_set_instance_sled( @@ -2450,7 +2450,7 @@ mod tests { .await .expect("Should handle duplicate mark for removal"); - // Test marking instance with no memberships (should be no-op) + // Test marking instance with no memberships (should be noop) let non_member_instance = InstanceUuid::new_v4(); datastore .multicast_group_members_mark_for_removal( @@ -2668,7 +2668,7 @@ mod tests { .expect("Should list group2 members"); assert_eq!(group2_members.len(), 2); - // Test deleting from group with no members (should be no-op) + // Test deleting from group with no members (should be noop) datastore .multicast_group_members_delete_by_group( &opctx, @@ -2677,7 +2677,7 @@ mod tests { .await .expect("Should handle deleting from empty group"); - // Test deleting from nonexistent group (should be no-op) + // Test deleting from nonexistent group (should be noop) let fake_group_id = Uuid::new_v4(); datastore .multicast_group_members_delete_by_group( diff --git a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs index 254a2485bd7..69474ac8055 100644 --- a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs +++ b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs @@ -137,7 +137,7 @@ impl From for external::Error { /// - **Reactivate**: Member in "Left" (time_deleted=NULL) → transition to /// "Joining", update `sled_id` /// - **Insert new**: Member in "Left" (time_deleted set) → create new row -/// - **Idempotent**: Member already "Joining" or "Joined" → no-op +/// - **Idempotent**: Member already "Joining" or "Joined" → noop /// /// Atomically validates group and instance exist, retrieves instance's current /// sled_id, and performs member upsert. Returns member ID. 
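The upsert described in the doc comment above is effectively a three-way decision over the member's current row. A compact sketch of that dispatch, where `MemberRow`, `MemberState`, and `AttachDecision` are illustrative stand-ins rather than the datastore's actual types:

    use chrono::{DateTime, Utc};

    enum MemberState { Joining, Joined, Left }

    struct MemberRow {
        state: MemberState,
        time_deleted: Option<DateTime<Utc>>,
    }

    enum AttachDecision {
        Reactivate, // "Left" with time_deleted=NULL: back to "Joining", refresh sled_id
        InsertNew,  // no row, or "Left" with time_deleted set (tombstoned)
        Noop,       // already "Joining"/"Joined": attach is idempotent
    }

    fn decide(existing: Option<&MemberRow>) -> AttachDecision {
        match existing {
            None => AttachDecision::InsertNew,
            Some(row) => match (&row.state, row.time_deleted) {
                (MemberState::Left, None) => AttachDecision::Reactivate,
                (MemberState::Left, Some(_)) => AttachDecision::InsertNew,
                _ => AttachDecision::Noop,
            },
        }
    }

In the real query this decision is made atomically inside the database, alongside the group/instance validation, rather than in Rust.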
diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml
index 3c1a1a3700a..11c01c98f1f 100644
--- a/nexus/examples/config-second.toml
+++ b/nexus/examples/config-second.toml
@@ -187,9 +187,6 @@ fm.sitrep_gc_period_secs = 600
 fm.rendezvous_period_secs = 300
 probe_distributor.period_secs = 60
 multicast_reconciler.period_secs = 60
-# TTL for sled-to-backplane-port mapping cache
-# Default: 3600 seconds (1 hour) - detects new sleds and inventory changes
-# multicast_reconciler.sled_cache_ttl_secs = 3600
 # TTL for backplane topology cache (static platform configuration)
 # Default: 86400 seconds (24 hours) - refreshed on-demand when validation fails
 # multicast_reconciler.backplane_cache_ttl_secs = 86400
diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml
index b4026bfb1de..530b4c67f59 100644
--- a/nexus/examples/config.toml
+++ b/nexus/examples/config.toml
@@ -171,9 +171,6 @@ fm.sitrep_gc_period_secs = 600
 fm.rendezvous_period_secs = 300
 probe_distributor.period_secs = 60
 multicast_reconciler.period_secs = 60
-# TTL for sled-to-backplane-port mapping cache
-# Default: 3600 seconds (1 hour) - detects new sleds and inventory changes
-# multicast_reconciler.sled_cache_ttl_secs = 3600
 # TTL for backplane topology cache (static platform configuration)
 # Default: 86400 seconds (24 hours) - refreshed on-demand when validation fails
 # multicast_reconciler.backplane_cache_ttl_secs = 86400
diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs
index f228f68d961..bce88c3eb32 100644
--- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs
+++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs
@@ -224,7 +224,8 @@ mod api_impl {
     use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody;
     use sled_agent_types::instance::InstanceEnsureBody;
     use sled_agent_types::instance::InstanceExternalIpBody;
-    use sled_agent_types::instance::InstanceMulticastBody;
+    use sled_agent_types::instance::InstanceMulticastMembership;
+    use sled_agent_types::instance::InstancePathParam;
     use sled_agent_types::instance::SledVmmState;
     use sled_agent_types::instance::VmmIssueDiskSnapshotRequestBody;
     use sled_agent_types::instance::VmmIssueDiskSnapshotRequestPathParam;
@@ -249,6 +250,10 @@ mod api_impl {
     use sled_agent_types::inventory::SledCpuFamily;
     use sled_agent_types::inventory::SledRole;
     use sled_agent_types::inventory::SvcsEnabledNotOnlineResult;
+    use sled_agent_types::multicast::{
+        ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping,
+        McastForwardingEntry,
+    };
     use sled_agent_types::probes::ProbeSet;
     use sled_agent_types::sled::AddSledRequest;
     use sled_agent_types::support_bundle::RangeRequestHeaders;
@@ -268,6 +273,7 @@ mod api_impl {
     use sled_agent_types::zone_bundle::ZoneBundleMetadata;
     use sled_agent_types::zone_bundle::ZonePathParam;
     use sled_agent_types_versions::v1;
+    use sled_agent_types_versions::v7;
     use sled_agent_types_versions::v20;
     use sled_agent_types_versions::v25;
     use sled_agent_types_versions::v26;
@@ -625,48 +631,36 @@ mod api_impl {
         unimplemented!()
     }

-    async fn vmm_join_multicast_group(
+    async fn instance_join_multicast_group(
+        _rqctx: RequestContext<Self::Context>,
+        _path_params: Path<InstancePathParam>,
+        _body: TypedBody<InstanceMulticastMembership>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
+    }
+
+    async fn instance_leave_multicast_group(
+        _rqctx: RequestContext<Self::Context>,
+        _path_params: Path<InstancePathParam>,
+        _body: TypedBody<InstanceMulticastMembership>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
+    }
+
+    async fn vmm_join_multicast_group_v7(
         _rqctx: RequestContext<Self::Context>,
        _path_params: Path<VmmPathParam>,
-        body: TypedBody<InstanceMulticastBody>,
+        _body: TypedBody<v7::InstanceMulticastBody>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
-        let body_args = body.into_inner();
-        match body_args {
-            InstanceMulticastBody::Join(_) => {
-                // MGS test utility - just return success for test compatibility
-                Ok(HttpResponseUpdatedNoContent())
-            }
-            InstanceMulticastBody::Leave(_) => {
-                // This endpoint is for joining - reject leave operations
-                Err(HttpError::for_bad_request(
-                    None,
-                    "Join endpoint cannot process Leave operations"
-                        .to_string(),
-                ))
-            }
-        }
-    }
-
-    async fn vmm_leave_multicast_group(
+        unimplemented!()
+    }
+
+    async fn vmm_leave_multicast_group_v7(
         _rqctx: RequestContext<Self::Context>,
         _path_params: Path<VmmPathParam>,
-        body: TypedBody<InstanceMulticastBody>,
+        _body: TypedBody<v7::InstanceMulticastBody>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
-        let body_args = body.into_inner();
-        match body_args {
-            InstanceMulticastBody::Leave(_) => {
-                // MGS test utility - just return success for test compatibility
-                Ok(HttpResponseUpdatedNoContent())
-            }
-            InstanceMulticastBody::Join(_) => {
-                // This endpoint is for leaving - reject join operations
-                Err(HttpError::for_bad_request(
-                    None,
-                    "Leave endpoint cannot process Join operations"
-                        .to_string(),
-                ))
-            }
-        }
+        unimplemented!()
     }

     async fn disk_put(
@@ -757,6 +751,47 @@ mod api_impl {
         unimplemented!()
     }

+    async fn set_mcast_m2p(
+        _rqctx: RequestContext<Self::Context>,
+        _body: TypedBody<Mcast2PhysMapping>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
+    }
+
+    async fn clear_mcast_m2p(
+        _rqctx: RequestContext<Self::Context>,
+        _body: TypedBody<ClearMcast2Phys>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
+    }
+
+    async fn set_mcast_fwd(
+        _rqctx: RequestContext<Self::Context>,
+        _body: TypedBody<McastForwardingEntry>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
+    }
+
+    async fn clear_mcast_fwd(
+        _rqctx: RequestContext<Self::Context>,
+        _body: TypedBody<ClearMcastForwarding>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
+    }
+
+    async fn list_mcast_m2p(
+        _rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<Mcast2PhysMapping>>, HttpError> {
+        unimplemented!()
+    }
+
+    async fn list_mcast_fwd(
+        _rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<McastForwardingEntry>>, HttpError>
+    {
+        unimplemented!()
+    }
+
     async fn uplink_ensure(
         _rqctx: RequestContext<Self::Context>,
         _body: TypedBody,
diff --git a/nexus/reconfigurator/execution/src/test_utils.rs b/nexus/reconfigurator/execution/src/test_utils.rs
index cd46adacd0b..fdb17289225 100644
--- a/nexus/reconfigurator/execution/src/test_utils.rs
+++ b/nexus/reconfigurator/execution/src/test_utils.rs
@@ -113,10 +113,12 @@ pub fn overridables_for_test(
         let dendrite_port =
             cptestctx.dendrite.read().unwrap().get(&switch_slot).unwrap().port;
         let mgd_port = cptestctx.mgd.get(&switch_slot).unwrap().port;
+        let ddm_port = cptestctx.ddm.get(&switch_slot).unwrap().port;
         overrides.override_switch_zone_ip(sled_id, ip);
         overrides.override_dendrite_port(sled_id, dendrite_port);
         overrides.override_mgs_port(sled_id, mgs_port);
         overrides.override_mgd_port(sled_id, mgd_port);
+        overrides.override_ddm_port(sled_id, ddm_port);
     }
     overrides
 }
diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs
index d2f5b129258..3f0a3c1e8fa 100644
--- a/nexus/reconfigurator/planning/src/example.rs
+++ b/nexus/reconfigurator/planning/src/example.rs
@@ -1844,7 +1844,8 @@ mod tests {
                 | ServiceName::RepoDepot
                 | ServiceName::ManagementGatewayService
                 | ServiceName::Dendrite
-                | ServiceName::Mgd => {
+                | ServiceName::Mgd
+                | ServiceName::Ddm => {
                     out.insert(service, Ok(()));
                 }
                 // InternalNtp is too large to fit in a single DNS packet and
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index 5f7e9a07da8..58228b81123 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -1095,12 +1095,14 @@ impl BackgroundTasksInitializer {
datastore.clone(), resolver.clone(), sagas.clone(), - inventory_load_watcher.clone(), args.multicast_enabled, - config.multicast_reconciler.sled_cache_ttl_secs, config.multicast_reconciler.backplane_cache_ttl_secs, )), opctx: opctx.child(BTreeMap::new()), + // Wake the reconciler whenever the inventory loader publishes a + // fresh collection so newly-discovered sleds become resolvable + // (DDM-peer fallback / inventory mapping) within the same tick + // instead of waiting for the periodic timer. watchers: vec![Box::new(inventory_load_watcher.clone())], activator: task_multicast_reconciler, }); diff --git a/nexus/src/app/background/tasks/multicast/groups.rs b/nexus/src/app/background/tasks/multicast/groups.rs index db2c51938a5..a8be531d92a 100644 --- a/nexus/src/app/background/tasks/multicast/groups.rs +++ b/nexus/src/app/background/tasks/multicast/groups.rs @@ -19,7 +19,13 @@ //! ## Operations Handled //! - **"Creating" state**: Initiate DPD "ensure" to apply configuration //! - **"Active" state**: Detect DPD drift and sync directly -//! - **"Deleting" state**: Switch cleanup and database removal +//! - **MRIB programming**: For Active groups, reconcile switch MRIB +//! routes against a per-pass snapshot (see [`super::mrib`]) +//! - **"Deleting" state**: Switch cleanup, MRIB route withdrawal, and +//! database removal +//! - **M2P/forwarding propagation**: Convergent per-sled propagation of +//! M2P mappings and forwarding entries via sled-agent after member +//! state changes //! - **Extensible processing**: Support for different group types //! //! # Group State Transition Matrix @@ -75,9 +81,11 @@ use anyhow::Context; use chrono::Utc; +use futures::future::try_join_all; use futures::stream::{self, StreamExt}; use slog::{debug, error, info, trace, warn}; +use dpd_client::types::IpSrc; use nexus_db_model::{MulticastGroup, MulticastGroupState, SqlU8}; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::multicast::EnsureUnderlayResult; @@ -87,12 +95,15 @@ use omicron_common::address::is_ssm_address; use omicron_common::api::external::{self, DataPageParams}; use omicron_uuid_kinds::{GenericUuid, MulticastGroupUuid}; -use super::{ - MulticastGroupReconciler, StateTransition, map_external_to_underlay_ip, -}; +use super::{MulticastGroupReconciler, StateTransition}; use crate::app::multicast::dataplane::{ GroupUpdateParams, MulticastDataplaneClient, }; +use crate::app::multicast::map_external_to_underlay_ip; +use crate::app::multicast::sled::MulticastSledClient; +use crate::app::multicast::switch_zone::{ + MribRouteIndex, MulticastSwitchZoneClient, +}; use crate::app::saga::create_saga_dag; use crate::app::sagas; @@ -100,7 +111,7 @@ use crate::app::sagas; /// /// This grace period avoids racing with in-progress member attachment operations /// that occur immediately after group creation. -const ORPHAN_GROUP_MIN_AGE: chrono::Duration = chrono::Duration::seconds(10); +const ORPHAN_GROUP_MIN_AGE: chrono::TimeDelta = chrono::TimeDelta::seconds(10); /// Check if DPD tag matches the database group's tag. 
/// @@ -130,39 +141,59 @@ fn dpd_state_matches_sources( let dpd_sources = dpd_group.sources.clone(); let group_ip = group.multicast_ip.ip(); - // Expected DPD state based on source filter logic (RFC 4607) - let expected_sources = if is_ssm_address(group_ip) { - Some(&source_filter.specific_sources) + if is_ssm_address(group_ip) { + // SSM: always expect specific sources + match dpd_sources { + None => false, + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } + } } else if source_filter.has_any_source_member { - None + dpd_sources.is_none() } else { - Some(&source_filter.specific_sources) - }; - - match (dpd_sources, expected_sources) { - (None, None) => true, - (Some(_), None) => false, // DPD has sources but shouldn't - (None, Some(_)) => false, // DPD missing sources - (Some(dpd_srcs), Some(expected)) => { - // Extract exact IPs from DPD sources - let mut dpd_ips: Vec<_> = dpd_srcs - .into_iter() - .filter_map(|src| match src { - dpd_client::types::IpSrc::Exact(ip) => Some(ip), - _ => None, - }) - .collect(); - dpd_ips.sort(); - - let mut expected_sorted: Vec<_> = - expected.iter().copied().collect(); - expected_sorted.sort(); - - dpd_ips == expected_sorted + match dpd_sources { + None => source_filter.specific_sources.is_empty(), + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } } } } +/// Switch-side clients threaded through group state processors. +struct GroupReconcileClients<'a> { + dataplane: &'a MulticastDataplaneClient, + sled: &'a MulticastSledClient, + switch_zone: &'a MulticastSwitchZoneClient, +} + /// Trait for processing different types of multicast groups trait GroupStateProcessor { /// Process a group in "Creating" state. @@ -179,7 +210,7 @@ trait GroupStateProcessor { reconciler: &MulticastGroupReconciler, opctx: &OpContext, group: &MulticastGroup, - dataplane_client: &MulticastDataplaneClient, + clients: &GroupReconcileClients<'_>, ) -> Result; /// Process a group in "Active" state (check DPD sync status). @@ -188,7 +219,8 @@ trait GroupStateProcessor { reconciler: &MulticastGroupReconciler, opctx: &OpContext, group: &MulticastGroup, - dataplane_client: &MulticastDataplaneClient, + clients: &GroupReconcileClients<'_>, + mrib_route_index: Option<&MribRouteIndex>, ) -> Result; } @@ -212,23 +244,36 @@ impl GroupStateProcessor for ExternalGroupProcessor { reconciler: &MulticastGroupReconciler, opctx: &OpContext, group: &MulticastGroup, - dataplane_client: &MulticastDataplaneClient, + clients: &GroupReconcileClients<'_>, ) -> Result { reconciler - .handle_deleting_external_group(opctx, group, dataplane_client) + .handle_deleting_external_group( + opctx, + group, + clients.dataplane, + clients.sled, + clients.switch_zone, + ) .await } - /// Handle groups in "Active" state (check DPD sync status). 
async fn process_active( &self, reconciler: &MulticastGroupReconciler, opctx: &OpContext, group: &MulticastGroup, - dataplane_client: &MulticastDataplaneClient, + clients: &GroupReconcileClients<'_>, + mrib_route_index: Option<&MribRouteIndex>, ) -> Result { reconciler - .handle_active_external_group(opctx, group, dataplane_client) + .handle_active_external_group( + opctx, + group, + clients.dataplane, + clients.sled, + clients.switch_zone, + mrib_route_index, + ) .await } } @@ -336,6 +381,8 @@ impl MulticastGroupReconciler { opctx: &OpContext, state: MulticastGroupState, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: Option<&MulticastSledClient>, + switch_zone_client: Option<&MulticastSwitchZoneClient>, ) -> Result { trace!(opctx.log, "searching for multicast groups"; "state" => %state); @@ -359,11 +406,34 @@ impl MulticastGroupReconciler { trace!(opctx.log, "found multicast groups"; "count" => groups.len(), "state" => %state); + let mrib_route_index = match (state, switch_zone_client) { + (MulticastGroupState::Active, Some(client)) => client + .list_routes_indexed() + .await + .inspect_err(|e| { + warn!( + opctx.log, + "failed to build per-pass MRIB route snapshot"; + "error" => %e, + ) + }) + .ok(), + _ => None, + }; + let mrib_route_index = mrib_route_index.as_ref(); + // Process groups concurrently with configurable parallelism - let results = stream::iter(groups) + let group_outcomes = stream::iter(groups) .map(|group| async move { let result = self - .process_group_state(opctx, &group, dataplane_client) + .process_group_state( + opctx, + &group, + dataplane_client, + sled_client, + switch_zone_client, + mrib_route_index, + ) .await; (group, result) }) @@ -373,8 +443,8 @@ impl MulticastGroupReconciler { // Handle results with state-appropriate logging and counting let mut processed = 0; - let total_results = results.len(); - for (group, result) in results { + let total = group_outcomes.len(); + for (group, result) in group_outcomes { match result { Ok(transition) => { // Count successful transitions based on state expectations @@ -404,7 +474,7 @@ impl MulticastGroupReconciler { processed += 1; } - debug!( + trace!( opctx.log, "processed multicast group"; "state" => %state, @@ -424,13 +494,13 @@ impl MulticastGroupReconciler { } } - if total_results > 0 { + if total > 0 { debug!( opctx.log, "group reconciliation completed"; "state" => %state, "processed" => processed, - "total" => total_results + "total" => total ); } @@ -446,6 +516,8 @@ impl MulticastGroupReconciler { opctx, MulticastGroupState::Creating, None, + None, + None, ) .await } @@ -455,11 +527,15 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, + switch_zone_client: &MulticastSwitchZoneClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Deleting, Some(dataplane_client), + Some(sled_client), + Some(switch_zone_client), ) .await } @@ -469,11 +545,15 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, + switch_zone_client: &MulticastSwitchZoneClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Active, Some(dataplane_client), + Some(sled_client), + Some(switch_zone_client), ) .await } @@ -485,6 +565,9 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: 
Option<&MulticastSledClient>, + switch_zone_client: Option<&MulticastSwitchZoneClient>, + mrib_route_index: Option<&MribRouteIndex>, ) -> Result { // Future: Match on group type to select different processors if // we add more nuanced group types @@ -495,17 +578,37 @@ impl MulticastGroupReconciler { processor.process_creating(self, opctx, group).await } MulticastGroupState::Deleting => { - let dataplane_client = dataplane_client - .context("dataplane client required for deleting state")?; - processor - .process_deleting(self, opctx, group, dataplane_client) - .await + let clients = GroupReconcileClients { + dataplane: dataplane_client.context( + "dataplane client required for deleting state", + )?, + sled: sled_client + .context("sled client required for deleting state")?, + switch_zone: switch_zone_client.context( + "switch zone client required for deleting state", + )?, + }; + processor.process_deleting(self, opctx, group, &clients).await } MulticastGroupState::Active => { - let dataplane_client = dataplane_client - .context("dataplane client required for active state")?; + let clients = GroupReconcileClients { + dataplane: dataplane_client.context( + "dataplane client required for active state", + )?, + sled: sled_client + .context("sled client required for active state")?, + switch_zone: switch_zone_client.context( + "switch zone client required for active state", + )?, + }; processor - .process_active(self, opctx, group, dataplane_client) + .process_active( + self, + opctx, + group, + &clients, + mrib_route_index, + ) .await } MulticastGroupState::Deleted => { @@ -602,7 +705,7 @@ impl MulticastGroupReconciler { // `backplane_map` validation for rear ports). These uplink members use // `Direction::External` and follow a different lifecycle - added when // first instance joins, removed when last instance leaves. - // Should integrate with `switch_ports_with_uplinks()` or + // Should integrate with `switch_ports_with_uplinks` or // equivalent front port discovery mechanism, which would be // configurable, and later learned (i.e., via `mcastd`/IGMP). @@ -623,6 +726,8 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, + switch_zone_client: &MulticastSwitchZoneClient, ) -> Result { debug!( opctx.log, @@ -635,8 +740,53 @@ impl MulticastGroupReconciler { "dpd_cleanup_required" => true ); - self.process_deleting_group_inner(opctx, group, dataplane_client) + // Remove MRIB routes so `mg-lower` withdraws DDM advertisements + // before cleaning up DPD and DB state. Bail on failure so the + // next pass can retry. Proceeding would delete DB rows and + // leave stale DDM advertisements. + let group_ip = group.multicast_ip.ip(); + let group_id = MulticastGroupUuid::from_untyped_uuid(group.id()); + + // Remove (*,G) route. + switch_zone_client + .remove_route(group_ip, None) + .await + .context("failed to remove MRIB (*,G) route for deleting group")?; + + // Remove (S,G) routes for any sources. Bail on failure + // to preserve DB state for retry on the next pass. + let source_filter = self + .datastore + .multicast_groups_source_filter_state(opctx, &[group_id]) + .await + .context( + "failed to load source filter for MRIB cleanup; \ + bailing to preserve DB state for retry", + )?; + + if let Some(filter) = source_filter.get(&group.id()) { + // Per-source removals target distinct (S,G) keys. We fan out so + // a group with N sources doesn't pay N round-trips serially. 
+ try_join_all(filter.specific_sources.iter().map( + |source| async move { + switch_zone_client + .remove_route(group_ip, Some(*source)) + .await + .with_context(|| format!( + "failed to remove MRIB (S,G) route for source {source}" + )) + }), + ) .await?; + } + + self.process_deleting_group_inner( + opctx, + group, + dataplane_client, + sled_client, + ) + .await?; Ok(StateTransition::StateChanged) } @@ -649,6 +799,9 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, + switch_zone_client: &MulticastSwitchZoneClient, + mrib_route_index: Option<&MribRouteIndex>, ) -> Result { let underlay_group_id = group .underlay_group_id @@ -712,7 +865,7 @@ impl MulticastGroupReconciler { } }; - if needs_update { + let res = if needs_update { debug!( opctx.log, "updating active multicast group in DPD"; @@ -747,6 +900,22 @@ impl MulticastGroupReconciler { "group_id" => %group.id(), "multicast_ip" => %group.multicast_ip ); + + // Propagate M2P/forwarding to member sleds after DPD + // sync to ensure OPTE state is also consistent. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + drift correction (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::StateChanged) } Err(e) => { @@ -761,8 +930,37 @@ impl MulticastGroupReconciler { } } } else { + // Even when DPD is in sync, propagate M2P/forwarding to + // member sleds to correct any sled-level drift. + if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::NoChange) - } + }; + + // Reconcile MRIB routes based on whether the group has active + // ("Joined") members. If all members are "Left", withdraw the DDM + // advertisement so peer sleds stop sending traffic. + super::mrib::reconcile_group( + opctx, + &self.datastore, + switch_zone_client, + mrib_route_index, + group, + &source_filter, + underlay_group_id, + ) + .await; + + res } /// Process a single multicast group in "Creating" state. @@ -772,7 +970,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, ) -> Result { - debug!( + trace!( opctx.log, "processing creating multicast group"; "group" => ?group @@ -789,7 +987,7 @@ impl MulticastGroupReconciler { format!("failed to fetch linked underlay group {underlay_id}") })?; - debug!( + trace!( opctx.log, "found linked underlay group"; "group" => ?group, @@ -798,12 +996,12 @@ impl MulticastGroupReconciler { underlay } None => { - debug!( + trace!( opctx.log, "creating new underlay group"; "group" => ?group ); - match self.ensure_underlay_for_external(opctx, &group).await? { + match self.ensure_underlay_for_external(opctx, group).await? 
{
                     Some(underlay) => underlay,
                     None => return Ok(false), // Group deleted during processing
                 }
             }
         };
@@ -835,9 +1033,9 @@
         >(saga_params)
         .context("failed to create multicast group transaction saga")?;

-        let saga_id = self
+        let (saga_id, completion) = self
             .sagas
-            .saga_start(dag)
+            .saga_run(dag)
             .await
             .context("failed to start multicast group transaction saga")?;

@@ -851,6 +1049,11 @@
             "expected_outcome" => "Creating → Active"
         );

+        // Block this pass on saga completion so subsequent reconciler
+        // steps observe "Active" within the same pass. See module-level
+        // "RPW Saga Coordination" for rationale.
+        completion.await.context("multicast group transaction saga failed")?;
+
         Ok(true)
     }

@@ -860,6 +1063,7 @@
         opctx: &OpContext,
         group: &MulticastGroup,
         dataplane_client: &MulticastDataplaneClient,
+        sled_client: &MulticastSledClient,
     ) -> Result<(), anyhow::Error> {
         let tag = Self::get_multicast_tag(group)
             .context("multicast group missing tag")?;
@@ -875,6 +1079,15 @@
             "cleanup_includes" => "[external_group, underlay_group, forwarding_rules, member_ports]"
         );

+        // Clear M2P/forwarding from all sleds before DPD cleanup.
+        // This must succeed before deleting DB records, otherwise
+        // stale OPTE state would persist on sleds where the clear
+        // failed, with no DB record to drive a retry on a later pass.
+        sled_client
+            .clear_m2p_and_forwarding(opctx, group)
+            .await
+            .context("failed to clear M2P/forwarding from sleds")?;
+
         // Use dataplane client from reconciliation pass to cleanup switch(es)
         // state by tag
         dataplane_client
@@ -928,7 +1141,7 @@ mod tests {
     use omicron_common::api::external::IdentityMetadataCreateParams;

     fn create_dpd_group(
-        sources: Option<Vec<dpd_client::types::IpSrc>>,
+        sources: Option<Vec<IpSrc>>,
     ) -> dpd_client::types::MulticastGroupExternalResponse {
         dpd_client::types::MulticastGroupExternalResponse {
             group_ip: "232.1.1.1".parse().unwrap(),
@@ -981,15 +1194,15 @@
         // DPD has matching sources
         let dpd_group = create_dpd_group(Some(vec![
-            dpd_client::types::IpSrc::Exact("10.0.0.1".parse().unwrap()),
-            dpd_client::types::IpSrc::Exact("10.0.0.2".parse().unwrap()),
+            IpSrc::Exact("10.0.0.1".parse().unwrap()),
+            IpSrc::Exact("10.0.0.2".parse().unwrap()),
         ]));
         assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group));

         // DPD has sources in different order (should still match)
         let dpd_group = create_dpd_group(Some(vec![
-            dpd_client::types::IpSrc::Exact("10.0.0.2".parse().unwrap()),
-            dpd_client::types::IpSrc::Exact("10.0.0.1".parse().unwrap()),
+            IpSrc::Exact("10.0.0.2".parse().unwrap()),
+            IpSrc::Exact("10.0.0.1".parse().unwrap()),
         ]));
         assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group));

@@ -999,8 +1212,8 @@
         // DPD has wrong sources (mismatch)
         let dpd_group = create_dpd_group(Some(vec![
-            dpd_client::types::IpSrc::Exact("10.0.0.1".parse().unwrap()),
-            dpd_client::types::IpSrc::Exact("10.0.0.3".parse().unwrap()), // wrong
+            IpSrc::Exact("10.0.0.1".parse().unwrap()),
+            IpSrc::Exact("10.0.0.3".parse().unwrap()), // wrong
         ]));
         assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group));
     }
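All of these assertions reduce to one canonicalization rule: extract DPD's `IpSrc::Exact` entries, sort them, and compare against the expected source set. A condensed illustration of that comparison (the function name and shape are for exposition, not code from this change):

    use std::collections::BTreeSet;
    use std::net::IpAddr;

    // `dpd_exact` stands in for the DPD response already filtered down to
    // IpSrc::Exact values. A BTreeSet iterates in sorted order, so only
    // the DPD side needs an explicit sort before comparing.
    fn sources_in_sync(
        mut dpd_exact: Vec<IpAddr>,
        expected: &BTreeSet<IpAddr>,
    ) -> bool {
        dpd_exact.sort();
        dpd_exact.iter().eq(expected.iter())
    }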
IpSrc::Exact("10.0.0.2".parse().unwrap()), ])); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); @@ -1034,9 +1247,8 @@ mod tests { } #[test] - fn test_dpd_state_matches_sources_asm_address() { - // ASM address with all members specifying sources: expect those - // sources in DPD. + fn test_dpd_state_matches_sources_asm_with_specific_sources() { + // ASM address with specific sources only (no any-source members) let source_filter = SourceFilterState { specific_sources: BTreeSet::from(["10.0.0.1" .parse::() @@ -1044,23 +1256,27 @@ mod tests { has_any_source_member: false, }; - let group = create_group("224.1.1.1"); // ASM address (not 232.x.x.x) + let group = create_group("224.1.1.1"); // ASM address - // DPD has matching sources (correct) - let dpd_group = - create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( - "10.0.0.1".parse().unwrap(), - )])); + // DPD has matching specific sources + let dpd_group = create_dpd_group(Some(vec![IpSrc::Exact( + "10.0.0.1".parse().unwrap(), + )])); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has None (mismatch: ASM with all-specific should have sources) + // DPD has None (mismatch: should have specific sources) let dpd_group = create_dpd_group(None); assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has IpSrc::Any (mismatch: should have specific sources) + let dpd_group = create_dpd_group(Some(vec![IpSrc::Any])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); } #[test] fn test_dpd_state_matches_sources_asm_with_any_source_member() { - // ASM address with has_any_source_member=true - expects None from DPD + // ASM address with has_any_source_member=true: we send None to DPD, + // and DPD canonicalizes any-source representations to None. 
let source_filter = SourceFilterState { specific_sources: BTreeSet::new(), has_any_source_member: true, @@ -1068,15 +1284,35 @@ mod tests { let group = create_group("224.1.1.1"); // ASM address - // DPD has None (correct for ASM with any-source members) + // DPD has None (correct: any-source canonicalizes to None) + let dpd_group = create_dpd_group(None); + assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has specific sources (mismatch) + let dpd_group = create_dpd_group(Some(vec![IpSrc::Exact( + "10.0.0.1".parse().unwrap(), + )])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + } + + #[test] + fn test_dpd_state_matches_sources_asm_no_sources() { + // ASM with no source filters at all expects None + let source_filter = SourceFilterState { + specific_sources: BTreeSet::new(), + has_any_source_member: false, + }; + + let group = create_group("224.1.1.1"); // ASM address + + // DPD has None (correct: no sources configured) let dpd_group = create_dpd_group(None); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has sources (mismatch: should be none) - let dpd_group = - create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( - "10.0.0.1".parse().unwrap(), - )])); + // DPD has sources (mismatch) + let dpd_group = create_dpd_group(Some(vec![IpSrc::Exact( + "10.0.0.1".parse().unwrap(), + )])); assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); } } diff --git a/nexus/src/app/background/tasks/multicast/members.rs b/nexus/src/app/background/tasks/multicast/members.rs index 1b7f81c6ab3..0ec903caea5 100644 --- a/nexus/src/app/background/tasks/multicast/members.rs +++ b/nexus/src/app/background/tasks/multicast/members.rs @@ -42,6 +42,12 @@ //! - **State transitions**: "Joining" → "Joined" → "Left" with reactivation //! - **Dataplane updates**: Applying and removing configuration via DPD //! client(s) on switches +//! - **M2P/forwarding propagation**: After join, leave, or migration, M2P +//! mappings and forwarding entries are propagated to all sleds via +//! sled-agent inline (not deferred to the next reconciliation pass) +//! - **OPTE subscriptions**: Per-instance multicast group filters managed +//! via sled-agent on the hosting sled (keyed by the active VMM's +//! propolis ID) //! - **Sled migration**: Detecting moves and updating dataplane configuration //! (no transition to "Left") //! - **Cleanup**: Removing orphaned switch state for deleted members @@ -97,7 +103,8 @@ //! | 3 | None | Valid | "Creating" | Wait for activation | "Left" | //! | 4 | None | Valid | "Active" | Reactivate member | "Joining" | -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::net::Ipv6Addr; use std::sync::Arc; use std::time::Instant; @@ -106,6 +113,7 @@ use futures::stream::{self, StreamExt}; use slog::{debug, info, trace, warn}; use uuid::Uuid; +use dpd_client::types::{BackplaneLink, Direction, LinkId, PortId, Rear}; use nexus_db_model::{ DbTypedUuid, MulticastGroup, MulticastGroupMember, MulticastGroupMemberState, MulticastGroupState, Sled, @@ -124,15 +132,70 @@ use omicron_uuid_kinds::{ use super::{MulticastGroupReconciler, StateTransition, SwitchBackplanePort}; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; +use crate::app::multicast::switch_zone::MulticastSwitchZoneClient; + +/// Pre-fetched instance state for multicast reconciliation. 
+#[derive(Clone, Copy, Debug, Default)]
+struct InstanceMulticastState {
+    /// Whether the instance is in a state that can receive multicast traffic.
+    valid: bool,
+    /// Current sled hosting the VMM, if any.
+    sled_id: Option<SledUuid>,
+}
+
+/// Context shared across member reconciliation operations.
+struct MemberReconcileCtx<'a> {
+    opctx: &'a OpContext,
+    group: &'a MulticastGroup,
+    member: &'a MulticastGroupMember,
+    instance_states: &'a InstanceStateMap,
+    dataplane_client: &'a MulticastDataplaneClient,
+    sled_client: &'a MulticastSledClient,
+    /// Sled-to-port mapping built once per reconciliation pass and shared
+    /// across all members in that pass (sled lookups in this map are O(1)
+    /// and never trigger I/O).
+    sled_to_ports: &'a HashMap<SledUuid, Vec<SwitchBackplanePort>>,
+}
+
+/// Maps instance_id to pre-fetched multicast-relevant state.
+type InstanceStateMap = HashMap<Uuid, InstanceMulticastState>;
+type MemberPortKey = (PortId, LinkId);
+
+/// Sled-to-port mapping for a single reconciliation pass.
+///
+/// `sled_to_ports` is the functional data we need. `ddm_inventory_drift` counts
+/// sleds whose DDM port mapping diverged from inventory during a pass and is
+/// reported for observability and possible future signaling.
+///
+/// TODO: A future change could use sustained drift to signal an inventory
+/// refresh.
+struct SledPortMap {
+    sled_to_ports: HashMap<SledUuid, Vec<SwitchBackplanePort>>,
+    ddm_inventory_drift: usize,
+}
 
-/// Pre-fetched instance state data for batch processing.
-/// Maps instance_id -> (is_valid_for_multicast, current_sled_id).
-type InstanceStateMap = HashMap<Uuid, (bool, Option<SledUuid>)>;
+impl SledPortMap {
+    fn empty() -> Self {
+        Self { sled_to_ports: HashMap::new(), ddm_inventory_drift: 0 }
+    }
+}
+
+/// Outcome of a single [`MulticastGroupReconciler::reconcile_member_states`]
+/// pass.
+#[derive(Clone, Copy, Debug, Default)]
+pub(super) struct MemberReconcileCounts {
+    /// Members whose state advanced this pass (e.g., "Joining" → "Joined",
+    /// "Joining" → "Left").
+    pub(super) processed: usize,
+    /// Number of sleds whose DDM port mapping diverged from inventory.
+    /// DDM wins (live state); a non-zero count surfaces inventory lag.
+    pub(super) ddm_inventory_drift: usize,
+}
 
 /// Backplane port mapping from DPD-client.
 /// Maps switch port ID to backplane link configuration.
-type BackplaneMap =
-    BTreeMap<dpd_client::types::PortId, dpd_client::types::BackplaneLink>;
+type BackplaneMap = BTreeMap<PortId, BackplaneLink>;
 
 /// Result of computing the union of member ports across a group.
 ///
@@ -141,18 +204,18 @@ type BackplaneMap =
 /// the union is `Complete` to avoid disrupting members that failed resolution.
 enum MemberPortUnion {
     /// Union is complete: all "Joined" members were successfully resolved.
-    Complete(BTreeSet<dpd_client::types::PortId>),
+    Complete(HashSet<MemberPortKey>),
     /// Union is partial: some "Joined" members failed to resolve.
     /// The port set may be incomplete.
-    Partial(BTreeSet<dpd_client::types::PortId>),
+    Partial(HashSet<MemberPortKey>),
 }
 
 /// Check if a DPD member is a rear/underlay port (instance member).
 fn is_rear_underlay_member(
     member: &dpd_client::types::MulticastGroupMember,
 ) -> bool {
-    matches!(member.port_id, dpd_client::types::PortId::Rear(_))
-        && member.direction == dpd_client::types::Direction::Underlay
+    matches!(member.port_id, PortId::Rear(_))
+        && member.direction == Direction::Underlay
 }
 
 /// Represents a sled_id update for a multicast group member.
@@ -168,33 +231,21 @@ trait MemberStateProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Joined" state. async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Left" state. async fn process_left( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; } @@ -205,61 +256,25 @@ impl MemberStateProcessor for InstanceMemberProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joining( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joining(ctx).await } async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joined( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joined(ctx).await } async fn process_left( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_left( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_left(ctx).await } } @@ -272,11 +287,13 @@ impl MulticastGroupReconciler { ]; /// Process member state changes ("Joining"→"Joined"→"Left"). - pub async fn reconcile_member_states( + pub(super) async fn reconcile_member_states( &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, - ) -> Result { + sled_client: &MulticastSledClient, + switch_zone_client: Option<&MulticastSwitchZoneClient>, + ) -> Result { trace!(opctx.log, "reconciling member state changes"); let mut processed = 0; @@ -284,9 +301,46 @@ impl MulticastGroupReconciler { // Get all groups that need member state processing ("Creating" and "Active") let groups = self.get_reconcilable_groups(opctx).await?; + // Build the reconciliation pass sled-to-port mapping once and share + // it across all members in this pass. Avoids per-member DDM RPCs + // and per-member inventory queries. + // + // A build failure (no DDM peers and no inventory yet) downgrades + // to an empty map: "Joining" → "Left" for stopped instances is a + // DB-only CAS that doesn't need a port lookup, so it still + // converges. Members that do need a port lookup (e.g. 
"Joining" + // → "Joined") fail their own processing this pass and retry on + // the next. + let SledPortMap { sled_to_ports, ddm_inventory_drift: drift_count } = + match self + .build_sled_port_map( + opctx, + dataplane_client, + switch_zone_client, + ) + .await + { + Ok(map) => map, + Err(e) => { + warn!( + opctx.log, + "failed to build reconciliation pass sled-to-port \ + mapping, continuing with empty map"; + "error" => %e, + ); + SledPortMap::empty() + } + }; + for group in groups { match self - .process_group_member_states(opctx, &group, dataplane_client) + .process_group_member_states( + opctx, + &group, + dataplane_client, + sled_client, + &sled_to_ports, + ) .await { Ok(count) => { @@ -314,10 +368,14 @@ impl MulticastGroupReconciler { debug!( opctx.log, "member state reconciliation completed"; - "members_processed" => processed + "members_processed" => processed, + "ddm_inventory_drift" => drift_count, ); - Ok(processed) + Ok(MemberReconcileCounts { + processed, + ddm_inventory_drift: drift_count, + }) } /// Process member state changes for a single group. @@ -326,6 +384,8 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, + sled_to_ports: &HashMap>, ) -> Result { let mut processed = 0; @@ -337,19 +397,21 @@ impl MulticastGroupReconciler { Arc::new(self.batch_fetch_instance_states(opctx, &members).await?); // Process members concurrently with configurable parallelism - let results = stream::iter(members) + let member_outcomes = stream::iter(members) .map(|member| { let instance_states = Arc::clone(&instance_states); async move { - let res = self - .process_member_state( - opctx, - group, - &member, - &instance_states, - dataplane_client, - ) - .await; + let ctx = MemberReconcileCtx { + opctx, + group, + member: &member, + instance_states: &instance_states, + dataplane_client, + sled_client, + sled_to_ports, + }; + + let res = self.process_member_state(&ctx).await; (member, res) } }) @@ -358,13 +420,13 @@ impl MulticastGroupReconciler { .await; // Process results and update counters - for (member, result) in results { + for (member, result) in member_outcomes { match result { Ok(transition) => match transition { StateTransition::StateChanged | StateTransition::NoChange => { processed += 1; - debug!( + trace!( opctx.log, "processed member state change"; "member" => ?member, @@ -374,7 +436,7 @@ impl MulticastGroupReconciler { } StateTransition::NeedsCleanup => { processed += 1; - debug!( + trace!( opctx.log, "member marked for cleanup"; "member" => ?member, @@ -382,7 +444,7 @@ impl MulticastGroupReconciler { ); } StateTransition::EntityGone => { - debug!( + trace!( opctx.log, "member deleted during processing"; "member" => ?member, @@ -407,15 +469,13 @@ impl MulticastGroupReconciler { /// Main dispatch function for processing member state changes. /// - /// Routes to appropriate node based on member type. + /// Routes to the appropriate handler based on member state. async fn process_member_state( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { + let MemberReconcileCtx { opctx, group, member, .. } = *ctx; + // Check if the parent group has been deleted or is being deleted. // If so, delete the member so cleanup can proceed. 
// @@ -447,40 +507,13 @@ impl MulticastGroupReconciler { match member.state { MulticastGroupMemberState::Joining => { - processor - .process_joining( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joining(self, ctx).await } MulticastGroupMemberState::Joined => { - processor - .process_joined( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joined(self, ctx).await } MulticastGroupMemberState::Left => { - processor - .process_left( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_left(self, ctx).await } } } @@ -495,7 +528,7 @@ impl MulticastGroupReconciler { ) -> Result { // Skip if member is already deleted if member.time_deleted.is_some() { - debug!( + trace!( opctx.log, "member already deleted, no action needed"; "member_id" => %member.id, @@ -532,35 +565,23 @@ impl MulticastGroupReconciler { /// when ready. Uses CAS operations for concurrent-safe state updates. async fn handle_instance_joining( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Extract pre-fetched instance state - let (instance_valid, current_sled_id) = - self.get_instance_state_from_cache(instance_states, member); + let instance_state = + self.get_instance_state_from_cache(ctx.instance_states, ctx.member); - // Execute reconciliation CAS operation let reconcile_res = self .execute_joining_reconciliation( - opctx, - group, - member, - instance_valid, - current_sled_id, + ctx, + instance_state.valid, + instance_state.sled_id, ) .await?; - // Process reconciliation result self.process_joining_reconcile_result( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, reconcile_res, - dataplane_client, ) .await } @@ -570,16 +591,14 @@ impl MulticastGroupReconciler { &self, instance_states: &InstanceStateMap, member: &MulticastGroupMember, - ) -> (bool, Option) { - instance_states.get(&member.parent_id).copied().unwrap_or((false, None)) + ) -> InstanceMulticastState { + instance_states.get(&member.parent_id).copied().unwrap_or_default() } /// Execute the reconciliation CAS operation for a member in "Joining" state. async fn execute_joining_reconciliation( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, instance_valid: bool, current_sled_id: Option, ) -> Result { @@ -587,9 +606,9 @@ impl MulticastGroupReconciler { self.datastore .multicast_group_member_reconcile_joining( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), instance_valid, current_sled_id_db, ) @@ -600,39 +619,26 @@ impl MulticastGroupReconciler { /// Process the result of a "Joining" state reconciliation operation. 
async fn process_joining_reconcile_result( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, reconcile_result: ReconcileJoiningResult, - dataplane_client: &MulticastDataplaneClient, ) -> Result { match reconcile_result.action { ReconcileAction::TransitionedToLeft => { - self.handle_transitioned_to_left(opctx, group, member).await + self.handle_transitioned_to_left(ctx).await } ReconcileAction::UpdatedSledId { old, new } => { self.handle_sled_id_updated( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, SledIdUpdate { old, new }, - dataplane_client, ) .await } ReconcileAction::NotFound | ReconcileAction::NoChange => { - self.handle_no_change_or_not_found( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.handle_no_change_or_not_found(ctx, instance_state).await } } } @@ -640,18 +646,16 @@ impl MulticastGroupReconciler { /// Handle the case where a member was transitioned to "Left" state. async fn handle_transitioned_to_left( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, ) -> Result { info!( - opctx.log, + ctx.opctx.log, "multicast member lifecycle transition: 'Joining' → 'Left'"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, "reason" => "instance_not_valid_for_multicast_traffic" ); Ok(StateTransition::StateChanged) @@ -660,63 +664,43 @@ impl MulticastGroupReconciler { /// Handle the case where a member's sled_id was updated. async fn handle_sled_id_updated( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, sled_id_update: SledIdUpdate, - dataplane_client: &MulticastDataplaneClient, ) -> Result { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "updated member sled_id, checking if ready to join"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "old_sled_id" => ?sled_id_update.old, "new_sled_id" => ?sled_id_update.new, - "group_state" => ?group.state, - "instance_valid" => instance_valid + "group_state" => ?ctx.group.state, + "instance_valid" => instance_state.valid ); - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } /// Handle the case where no changes were made or member was not found. 
async fn handle_no_change_or_not_found( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { // Check if member is already in Joined state - if member.state == MulticastGroupMemberState::Joined { - debug!( - opctx.log, + if ctx.member.state == MulticastGroupMemberState::Joined { + trace!( + ctx.opctx.log, "member already in 'Joined' state, no action needed"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str() ); return Ok(StateTransition::NoChange); } // Try to complete the join if conditions are met - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } fn is_ready_to_join( @@ -729,30 +713,25 @@ impl MulticastGroupReconciler { async fn try_complete_join_if_ready( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { - if self.is_ready_to_join(group, instance_valid) { - self.complete_instance_member_join( - opctx, - group, - member, - dataplane_client, - ) - .await?; - Ok(StateTransition::StateChanged) + if self.is_ready_to_join(ctx.group, instance_state.valid) { + let joined = self.complete_instance_member_join(ctx, None).await?; + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } else { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "member not ready to join: waiting for next run"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "instance_valid" => instance_valid, - "group_state" => ?group.state + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "instance_valid" => instance_state.valid, + "group_state" => ?ctx.group.state ); Ok(StateTransition::NoChange) } @@ -761,82 +740,47 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Joined" state. 
async fn handle_instance_joined( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let instance_state = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); + .unwrap_or_default(); - match (instance_valid, current_sled_id) { - // Invalid instance -> remove from dataplane and transition to "Left" - (false, _) => { - self.handle_invalid_instance( - opctx, - group, - member, - dataplane_client, - ) - .await - } + match (instance_state.valid, instance_state.sled_id) { + (false, _) => self.handle_invalid_instance(ctx).await, - // Valid instance with sled, but sled changed (migration) - (true, Some(sled_id)) if member.sled_id != Some(sled_id.into()) => { - self.handle_sled_migration( - opctx, - group, - member, - sled_id, - dataplane_client, - ) - .await + (true, Some(sled_id)) + if ctx.member.sled_id != Some(sled_id.into()) => + { + self.handle_sled_migration(ctx, sled_id).await } - // Valid instance with sled, sled unchanged -> verify configuration (true, Some(_)) => { - self.verify_members(opctx, group, member, dataplane_client) - .await?; + self.verify_members(ctx).await?; trace!( - opctx.log, + ctx.opctx.log, "member configuration verified, no changes needed"; - "member_id" => %member.id, - "group_id" => %group.id() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id() ); Ok(StateTransition::NoChange) } - // Valid instance but no sled_id (shouldn't typically happen in "Joined" state) - (true, None) => { - self.handle_joined_without_sled( - opctx, - group, - member, - dataplane_client, - ) - .await - } + (true, None) => self.handle_joined_without_sled(ctx).await, } } /// Handle a joined member whose instance became invalid. async fn handle_invalid_instance( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { + let MemberReconcileCtx { opctx, group, member, sled_client, .. } = ctx; // Remove from dataplane first - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { - debug!( + if let Err(e) = self.remove_member_from_dataplane(ctx).await { + warn!( opctx.log, "failed to remove member from dataplane, will retry"; "member_id" => %member.id, @@ -845,6 +789,24 @@ impl MulticastGroupReconciler { return Err(e); } + // Unsubscribe the instance from the multicast group before the CAS + // clears the sled ID. Best-effort since the VMM may already be torn + // down. + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_instance(opctx, group, member, sled_id.into()) + .await + { + warn!( + opctx.log, + "failed to unsubscribe instance during instance invalidation"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Update database state (atomically set "Left" and clear `sled_id`) let updated = self .datastore @@ -870,6 +832,21 @@ impl MulticastGroupReconciler { return Ok(StateTransition::NoChange); } + // Propagate updated M2P/forwarding to all sleds so the + // dataplane reflects the member's departure. Best-effort since + // group reconciliation will converge if this fails. 
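+        //
+        // Illustrative sketch (not part of this change): the
+        // warn-and-continue shape used for every best-effort propagation
+        // call in this file; `propagate_best_effort` is a hypothetical
+        // helper name.
+        //
+        //     async fn propagate_best_effort(
+        //         sled_client: &MulticastSledClient,
+        //         opctx: &OpContext,
+        //         group: &MulticastGroup,
+        //     ) {
+        //         if let Err(e) = sled_client
+        //             .propagate_m2p_and_forwarding(opctx, group)
+        //             .await
+        //         {
+        //             // Next reconciler pass re-propagates, so log and
+        //             // continue instead of failing the current pass.
+        //             warn!(
+        //                 opctx.log,
+        //                 "failed to propagate M2P/forwarding (will retry)";
+        //                 "group_id" => %group.id(),
+        //                 "error" => %e
+        //             );
+        //         }
+        //     }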
+ if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after member leave"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } + info!( opctx.log, "multicast member lifecycle transition: 'Joined' → 'Left' (instance invalid)"; @@ -877,7 +854,6 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "group_id" => %group.id(), "group_multicast_ip" => %group.multicast_ip, - "dpd_operation" => "remove_member_from_underlay_group", "reason" => "instance_no_longer_valid_for_multicast_traffic" ); Ok(StateTransition::StateChanged) @@ -886,46 +862,50 @@ impl MulticastGroupReconciler { /// Handle sled migration for a "Joined" member. async fn handle_sled_migration( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, new_sled_id: SledUuid, - dataplane_client: &MulticastDataplaneClient, ) -> Result { info!( - opctx.log, + ctx.opctx.log, "detected sled migration for 'Joined' member: re-applying configuration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); // Remove from old sled's dataplane first - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { - debug!( - opctx.log, + if let Err(e) = self.remove_member_from_dataplane(ctx).await { + warn!( + ctx.opctx.log, "failed to remove member from old sled, will retry"; - "member_id" => %member.id, - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "old_sled_id" => ?ctx.member.sled_id, "error" => ?e ); return Err(e); } - // Update sled_id in database using CAS + // Source-sled OPTE cleanup (M2P, forwarding, port subscription) + // is handled by VMM teardown: remove_propolis_zone -> + // release_opte_ports -> PortTicket::release_inner, which + // clears multicast subscriptions along with V2P and firewall + // rules. + // + // This is consistent with all other OPTE state. Nexus + // never explicitly calls sled-agent for source-sled cleanup + // after migration. 
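+        //
+        // Illustrative sketch (not part of this change): the guard the
+        // CAS below relies on, shown as Diesel-style pseudocode. The real
+        // query lives in the datastore layer; column and dsl names here
+        // are assumptions.
+        //
+        //     diesel::update(
+        //         member_dsl::multicast_group_member
+        //             .filter(member_dsl::instance_id.eq(instance_id))
+        //             // CAS guard: only update if sled_id is unchanged
+        //             .filter(member_dsl::sled_id.eq(observed_sled_id))
+        //             .filter(member_dsl::time_deleted.is_null()),
+        //     )
+        //     .set(member_dsl::sled_id.eq(new_sled_id))
+        //     // `updated == true` iff exactly one row matched the guard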
+ + // Update `sled_id` in database using CAS let updated = self .datastore .multicast_group_member_update_sled_id_if_current( - opctx, - InstanceUuid::from_untyped_uuid(member.parent_id), - member.sled_id, + ctx.opctx, + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), + ctx.member.sled_id, Some(new_sled_id.into()), ) .await @@ -935,49 +915,46 @@ impl MulticastGroupReconciler { if !updated { debug!( - opctx.log, + ctx.opctx.log, "skipping sled_id update after migration due to concurrent change"; - "member_id" => %member.id, - "group_id" => %group.id(), - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); return Ok(StateTransition::NoChange); } - // Re-apply configuration on new sled - // If this fails (e.g., sled not yet in inventory), transition to "Joining" for retry - match self - .complete_instance_member_join( - opctx, - group, - member, - dataplane_client, - ) - .await - { - Ok(()) => { + // Re-apply configuration on new sled. Pass `new_sled_id` explicitly + // because the in-memory member struct still has the old sled_id. + match self.complete_instance_member_join(ctx, Some(new_sled_id)).await { + Ok(joined) => { info!( - opctx.log, + ctx.opctx.log, "member configuration re-applied after sled migration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, "new_sled_id" => %new_sled_id, - "dpd_operation" => "re_add_member_to_underlay_multicast_group" + "action" => "re_add_member_to_underlay_multicast_group", + "joined" => joined ); - Ok(StateTransition::StateChanged) + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } Err(e) => { // Failed to join on new sled. We transition to "Joining" and // retry next cycle/run. warn!( - opctx.log, + ctx.opctx.log, "failed to complete join on new sled after migration: transitioning to 'Joining' for retry"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id, "error" => %e ); @@ -1005,9 +982,9 @@ impl MulticastGroupReconciler { let updated = self .datastore .multicast_group_member_set_state_if_current( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), MulticastGroupMemberState::Joined, MulticastGroupMemberState::Joining, ) @@ -1018,10 +995,10 @@ impl MulticastGroupReconciler { if updated { info!( - opctx.log, + ctx.opctx.log, "member transitioned to 'Joining': will retry on next reconciliation run"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id ); Ok(StateTransition::StateChanged) @@ -1036,11 +1013,9 @@ impl MulticastGroupReconciler { /// Handle edge case where a "Joined" member has no sled_id. 
async fn handle_joined_without_sled( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { + let MemberReconcileCtx { opctx, group, member, .. } = ctx; warn!( opctx.log, "'Joined' member has no sled_id: transitioning to 'Left'"; @@ -1049,10 +1024,7 @@ impl MulticastGroupReconciler { ); // Remove from dataplane and transition to "Left" - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { + if let Err(e) = self.remove_member_from_dataplane(ctx).await { warn!( opctx.log, "failed to remove member with no sled_id from dataplane"; @@ -1094,7 +1066,7 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "group_id" => %group.id(), "group_multicast_ip" => %group.multicast_ip, - "dpd_operation" => "remove_member_from_underlay_group", + "action" => "transition_to_left", "reason" => "inconsistent_state_sled_id_missing_in_joined_state" ); Ok(StateTransition::StateChanged) @@ -1103,22 +1075,20 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Left" state. async fn handle_instance_left( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let InstanceMulticastState { + valid: instance_valid, + sled_id: current_sled_id, + .. + } = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); + .unwrap_or_default(); - // Handle permanent deletion first - if member.time_deleted.is_some() { - self.cleanup_deleted_member(opctx, group, member, dataplane_client) - .await?; + if ctx.member.time_deleted.is_some() { + self.cleanup_deleted_member(ctx).await?; return Ok(StateTransition::NeedsCleanup); } @@ -1128,28 +1098,43 @@ impl MulticastGroupReconciler { // The cleanup is idempotent and handles cases where: // - sled_id is None (uses fallback path) // - member was already removed from DPD - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { - debug!( - opctx.log, + if let Err(e) = self.remove_member_from_dataplane(ctx).await { + warn!( + ctx.opctx.log, "failed to clean up DPD state for 'Left' member (will retry)"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "error" => ?e ); - // Continue to reactivation even on cleanup failure because - // the add operation may succeed if the port was already removed } - // Handle reactivation: instance valid and group active -> transition to "Joining" - if instance_valid && group.state == MulticastGroupState::Active { - return self - .reactivate_left_member(opctx, group, member, current_sled_id) - .await; + // Unsubscribe the instance's active VMM OPTE port from this multicast + // group. Best-effort since if the VMM is already gone, there's + // nothing to unsubscribe (the OPTE port was destroyed with the VMM). 
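+        //
+        // Illustrative summary (not part of this change) of the "Left"
+        // handler's decision order, condensed from the surrounding code:
+        //
+        //     if ctx.member.time_deleted.is_some() {
+        //         self.cleanup_deleted_member(ctx).await?; // permanent
+        //         return Ok(StateTransition::NeedsCleanup);
+        //     }
+        //     // best-effort DPD teardown + OPTE unsubscribe, then:
+        //     if instance_valid
+        //         && ctx.group.state == MulticastGroupState::Active
+        //     {
+        //         return self
+        //             .reactivate_left_member(ctx, current_sled_id)
+        //             .await;
+        //     }
+        //     Ok(StateTransition::NoChange) // stay "Left"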
+ if let Some(sled_id) = ctx.member.sled_id { + if let Err(e) = ctx + .sled_client + .unsubscribe_instance( + ctx.opctx, + ctx.group, + ctx.member, + sled_id.into(), + ) + .await + { + warn!( + ctx.opctx.log, + "failed to unsubscribe instance from multicast group"; + "member_id" => %ctx.member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + + if instance_valid && ctx.group.state == MulticastGroupState::Active { + return self.reactivate_left_member(ctx, current_sled_id).await; } - // Stay in "Left" state Ok(StateTransition::NoChange) } @@ -1157,11 +1142,10 @@ impl MulticastGroupReconciler { /// Transitions the member back to "Joining" state so it can rejoin the group. async fn reactivate_left_member( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, current_sled_id: Option, ) -> Result { + let MemberReconcileCtx { opctx, group, member, .. } = ctx; debug!( opctx.log, "transitioning member from 'Left' to 'Joining': instance became valid and group active"; @@ -1250,10 +1234,10 @@ impl MulticastGroupReconciler { // Build the state map from the fetched data state_map.extend(members.iter().map(|member| { - let (is_valid, sled_id) = if let Some((instance, vmm_opt)) = + let state = if let Some((instance, vmm_opt)) = instance_vmm_data.get(&member.parent_id) { - let is_valid = matches!( + let valid = matches!( instance.nexus_state.state(), InstanceState::Creating | InstanceState::Starting @@ -1267,13 +1251,12 @@ impl MulticastGroupReconciler { SledUuid::from_untyped_uuid(vmm.sled_id.into_untyped_uuid()) }); - (is_valid, sled_id) + InstanceMulticastState { valid, sled_id } } else { - // Instance not found (mark as invalid) - (false, None) + InstanceMulticastState::default() }; - (member.parent_id, (is_valid, sled_id)) + (member.parent_id, state) })); debug!( @@ -1292,9 +1275,9 @@ impl MulticastGroupReconciler { /// Returns `None` if the instance has no sled assignment or cannot be found. async fn lookup_and_update_member_sled_id( &self, - opctx: &OpContext, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, ) -> Result>, anyhow::Error> { + let MemberReconcileCtx { opctx, member, .. } = ctx; debug!( opctx.log, "member has no sled_id, attempting to look up instance sled"; @@ -1319,13 +1302,13 @@ impl MulticastGroupReconciler { return Ok(None); } Err(e) => { - debug!( + warn!( opctx.log, "failed to look up instance state"; "member" => ?member, "error" => ?e ); - return Ok(None); + return Err(e.into()); } }; @@ -1381,87 +1364,147 @@ impl MulticastGroupReconciler { } } - /// Complete a member join operation ("Joining" -> "Joined") for an instance. + /// Complete a member join by configuring the dataplane and subscribing + /// the VMM. + /// + /// When `sled_id_override` is provided (e.g., during migration), it + /// is used instead of the potentially stale `member.sled_id`. + /// + /// # Returns + /// + /// `Ok(true)` when the join completed successfully. `Ok(false)` when no + /// sled was available and the operation was a noop. 
     async fn complete_instance_member_join(
         &self,
-        opctx: &OpContext,
-        group: &MulticastGroup,
-        member: &MulticastGroupMember,
-        dataplane_client: &MulticastDataplaneClient,
-    ) -> Result<(), anyhow::Error> {
+        ctx: &MemberReconcileCtx<'_>,
+        sled_id_override: Option<SledUuid>,
+    ) -> Result<bool, anyhow::Error> {
         debug!(
-            opctx.log,
+            ctx.opctx.log,
             "completing member join";
-            "member" => ?member,
-            "group" => ?group
+            "member" => ?ctx.member,
+            "group" => ?ctx.group
         );
 
-        // Get sled_id from member record, or look it up and update if missing
-        let sled_id = match member.sled_id {
-            Some(id) => id,
-            None => {
-                match self
-                    .lookup_and_update_member_sled_id(opctx, member)
-                    .await?
-                {
-                    Some(id) => id,
-                    None => return Ok(()), // No sled available, cannot join
-                }
-            }
+        // Use the override if provided, then the member's cached sled_id,
+        // then look it up from the instance as a last resort.
+        let sled_id: SledUuid = if let Some(id) =
+            sled_id_override.or(ctx.member.sled_id.map(Into::into))
+        {
+            id
+        } else if let Some(id) =
+            self.lookup_and_update_member_sled_id(ctx).await?
+        {
+            id.into()
+        } else {
+            return Ok(false);
         };
 
-        self.add_member_to_dataplane(
-            opctx,
-            group,
-            member,
-            sled_id.into(),
-            dataplane_client,
-        )
-        .await?;
+        self.add_member_to_dataplane(ctx, sled_id).await?;
 
-        // Transition to "Joined" state (only if still in "Joining")
-        let updated = self
-            .datastore
-            .multicast_group_member_set_state_if_current(
-                opctx,
-                MulticastGroupUuid::from_untyped_uuid(group.id()),
-                InstanceUuid::from_untyped_uuid(member.parent_id),
-                MulticastGroupMemberState::Joining,
-                MulticastGroupMemberState::Joined,
-            )
+        // If the member is already in a "Joined" state (migration path), skip
+        // the state transition but still propagate and subscribe. During
+        // migration the caller updates the sled ID without changing state,
+        // so we must not gate propagation on this CAS.
+        if ctx.member.state != MulticastGroupMemberState::Joined {
+            let updated = self
+                .datastore
+                .multicast_group_member_set_state_if_current(
+                    ctx.opctx,
+                    MulticastGroupUuid::from_untyped_uuid(ctx.group.id()),
+                    InstanceUuid::from_untyped_uuid(ctx.member.parent_id),
+                    MulticastGroupMemberState::Joining,
+                    MulticastGroupMemberState::Joined,
+                )
+                .await
+                .context(
+                    "failed to conditionally transition member to 'Joined' state",
+                )?;
+
+            if !updated {
+                debug!(
+                    ctx.opctx.log,
+                    "skipping Joining→Joined transition due to concurrent update";
+                    "member_id" => %ctx.member.id,
+                    "group_id" => %ctx.group.id()
+                );
+                // Concurrent update moved the member away from the "Joining"
+                // state, so skip propagation and subscription.
+                return Ok(false);
+            }
+        }
+
+        // Propagate M2P mappings and forwarding entries to all sleds.
+        //
+        // At this point, the member is "Joined" in the database, so
+        // propagation includes this sled in forwarding next-hops. If
+        // propagation or subscription fails below, the member remains
+        // "Joined" with incomplete sled state. The reconciler's next pass
+        // converges via `handle_instance_joined` -> `verify_members`.
+        //
+        // Propagation failures are best-effort since the reconciler will
+        // re-converge all sleds on the next cycle. Subscribe failures
+        // below are treated as hard errors because the VMM cannot
+        // receive traffic without an OPTE port subscription.
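+        //
+        // Illustrative sketch (not part of this change): how callers
+        // consume the bool contract of this function (this mirrors
+        // `try_complete_join_if_ready` earlier in the file):
+        //
+        //     let joined =
+        //         self.complete_instance_member_join(ctx, None).await?;
+        //     Ok(if joined {
+        //         StateTransition::StateChanged
+        //     } else {
+        //         StateTransition::NoChange
+        //     })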
+ if let Err(e) = ctx + .sled_client + .propagate_m2p_and_forwarding(ctx.opctx, ctx.group) .await - .context( - "failed to conditionally transition member to 'Joined' state", - )?; - if !updated { - debug!( - opctx.log, - "skipping Joining→Joined transition due to concurrent update"; - "member_id" => %member.id, - "group_id" => %group.id() + { + warn!( + ctx.opctx.log, + "failed to propagate M2P/forwarding after member join"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "error" => %e ); } + // Subscribe the instance's active VMM OPTE port last. Propagation + // above is best-effort, and any sleds that failed will be converged + // by the reconciler on the next cycle. + if let Err(e) = ctx + .sled_client + .subscribe_instance(ctx.opctx, ctx.group, ctx.member, sled_id) + .await + { + warn!( + ctx.opctx.log, + "failed to subscribe instance to multicast group via sled-agent \ + (will retry next cycle)"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "sled_id" => %sled_id, + "error" => %e + ); + return Err(e); + } + info!( - opctx.log, + ctx.opctx.log, "member join completed"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "sled_id" => %sled_id ); - Ok(()) + Ok(true) } /// Apply member dataplane configuration (via DPD-client). async fn add_member_to_dataplane( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, sled_id: SledUuid, - dataplane_client: &MulticastDataplaneClient, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { + opctx, + group, + member, + dataplane_client, + sled_to_ports, + .. + } = ctx; let underlay_group_id = group.underlay_group_id.with_context(|| { format!("no underlay group for external group {}", group.id()) })?; @@ -1475,10 +1518,9 @@ impl MulticastGroupReconciler { )?; // Resolve sled to switch port configurations - let port_configs = self - .resolve_sled_to_switch_ports(opctx, sled_id, dataplane_client) - .await - .context("failed to resolve sled to switch ports")?; + let port_configs = + Self::resolve_sled_to_switch_ports(sled_to_ports, sled_id) + .context("failed to resolve sled to switch ports")?; for port_config in &port_configs { let dataplane_member = dpd_client::types::MulticastGroupMember { @@ -1528,18 +1570,84 @@ impl MulticastGroupReconciler { Ok(()) } - /// Remove member from known port configurations. - async fn remove_from_known_ports( - &self, - opctx: &OpContext, - member: &MulticastGroupMember, - sled_id: DbTypedUuid, - port_configs: &[SwitchBackplanePort], - underlay_group: &nexus_db_model::UnderlayMulticastGroup, - dataplane_client: &MulticastDataplaneClient, - ) -> Result<(), anyhow::Error> { - // Remove member from DPD for each port on the sled - for port_config in port_configs { + /// Remove member from known port configurations. + /// + /// Multicast underlay membership is keyed by (port, link), not by + /// member: the DPD member table tracks one entry per + /// (group, port_id, link_id), so multiple members sharing a rear + /// port collapse to one entry per group. + /// + /// Compute the union of active rear ports across other "Joined" members + /// in the group and skip any port still in use, so that removing one + /// member does not tear down forwarding for siblings on the same sled. 
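+    ///
+    /// Illustrative sketch (not part of this change): the retain/remove
+    /// predicate applied per port below, keyed by
+    /// `MemberPortKey = (PortId, LinkId)`:
+    ///
+    /// ```text
+    /// let in_use = |pc: &SwitchBackplanePort| {
+    ///     // Siblings on one sled share (PortId, LinkId), so the DPD
+    ///     // entry must survive while any sibling is still "Joined".
+    ///     active_member_ports.contains(&(pc.port_id.clone(), pc.link_id))
+    /// };
+    /// ```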
+ async fn remove_from_known_ports( + &self, + ctx: &MemberReconcileCtx<'_>, + sled_id: DbTypedUuid, + port_configs: &[SwitchBackplanePort], + underlay_group: &nexus_db_model::UnderlayMulticastGroup, + ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { + opctx, + member, + dataplane_client, + sled_to_ports, + .. + } = *ctx; + + let active_member_ports = match self + .compute_active_member_ports( + opctx, + member.external_group_id, + sled_to_ports, + Some(member.id.into_untyped_uuid()), + ) + .await + { + Ok(MemberPortUnion::Complete(ports)) => Some(ports), + Ok(MemberPortUnion::Partial(_)) => { + // Some other "Joined" members failed to resolve. Skip + // pruning to avoid withdrawing ports that may still be in + // use (reconciliation will retry). + info!( + opctx.log, + "union incomplete: skipping known-port removal to avoid disrupting unresolved members"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "reason" => "some_joined_members_failed_port_resolution" + ); + return Ok(()); + } + Err(e) => { + info!( + opctx.log, + "failed to compute active member ports: skipping known-port removal"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + return Ok(()); + } + }; + + let (to_retain, to_remove): (Vec<_>, Vec<_>) = + port_configs.iter().partition(|pc| { + active_member_ports.as_ref().is_some_and(|active| { + active.contains(&(pc.port_id.clone(), pc.link_id)) + }) + }); + + for port_config in &to_retain { + debug!( + opctx.log, + "retaining shared rear port still in use by other group members"; + "member_id" => %member.id, + "port_id" => %port_config.port_id, + "sled_id" => %sled_id, + ); + } + + for port_config in &to_remove { let dataplane_member = dpd_client::types::MulticastGroupMember { port_id: port_config.port_id.clone(), link_id: port_config.link_id, @@ -1555,10 +1663,13 @@ impl MulticastGroupReconciler { opctx.log, "member removed from DPD"; "port_id" => %port_config.port_id, - "sled_id" => %sled_id + "sled_id" => %sled_id, ); } + let removed = to_remove.len(); + let retained = to_retain.len(); + info!( opctx.log, "multicast member configuration removed from switch forwarding tables"; @@ -1566,6 +1677,8 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "sled_id" => %sled_id, "port_count" => port_configs.len(), + "ports_removed" => removed, + "ports_retained_shared" => retained, "dpd_operation" => "remove_member_from_underlay_multicast_group", "reason" => "instance_state_change_or_migration" ); @@ -1583,7 +1696,7 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, group_id: Uuid, - dataplane_client: &MulticastDataplaneClient, + sled_to_ports: &HashMap>, exclude_member_id: Option, ) -> Result { let group_members = self @@ -1616,14 +1729,10 @@ impl MulticastGroupReconciler { }; // Attempt to resolve sled to switch ports - match self - .resolve_sled_to_switch_ports( - opctx, - mem_sled_id.into(), - dataplane_client, - ) - .await - { + match Self::resolve_sled_to_switch_ports( + sled_to_ports, + mem_sled_id.into(), + ) { Ok(ports) => Some((mem, ports)), Err(e) => { warn!( @@ -1656,9 +1765,10 @@ impl MulticastGroupReconciler { link_id: cfg.link_id, direction: cfg.direction, }; - is_rear_underlay_member(&member).then(|| cfg.port_id) + is_rear_underlay_member(&member) + .then(|| (cfg.port_id, cfg.link_id)) }) - .collect::>(); + .collect::>(); // Return `Complete` or `Partial` based on whether all members resolved if failure_cnt == 0 { @@ -1676,6 +1786,7 @@ impl MulticastGroupReconciler { member: 
&MulticastGroupMember, underlay_group: &nexus_db_model::UnderlayMulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_to_ports: &HashMap>, ) -> Result<(), anyhow::Error> { // Sled resolution failed or no sled_id available (e.g., removed // from inventory, or member.sled_id=NULL). @@ -1704,7 +1815,7 @@ impl MulticastGroupReconciler { .compute_active_member_ports( opctx, member.external_group_id, - dataplane_client, + sled_to_ports, Some(member.id.into_untyped_uuid()), ) .await @@ -1741,7 +1852,9 @@ impl MulticastGroupReconciler { } // Remove only if not in union of active member ports - if !active_member_ports.contains(¤t_member.port_id) { + let member_key: MemberPortKey = + (current_member.port_id.clone(), current_member.link_id); + if !active_member_ports.contains(&member_key) { dataplane_client .remove_member(underlay_group, current_member.clone()) .await @@ -1764,18 +1877,16 @@ impl MulticastGroupReconciler { /// Remove member dataplane configuration (via DPD-client). async fn remove_member_from_dataplane( &self, - opctx: &OpContext, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { - let group = self - .datastore - .multicast_group_fetch( - opctx, - MulticastGroupUuid::from_untyped_uuid(member.external_group_id), - ) - .await - .context("failed to fetch group for member removal")?; + let MemberReconcileCtx { + opctx, + group, + member, + dataplane_client, + sled_to_ports, + .. + } = ctx; let underlay_group_id = group.underlay_group_id.with_context(|| { format!( @@ -1792,21 +1903,15 @@ impl MulticastGroupReconciler { // Try to remove via known ports if we have a `sled_id` and can resolve it if let Some(sled_id) = member.sled_id { - if let Ok(port_configs) = self - .resolve_sled_to_switch_ports( - opctx, - sled_id.into(), - dataplane_client, - ) - .await - { + if let Ok(port_configs) = Self::resolve_sled_to_switch_ports( + sled_to_ports, + sled_id.into(), + ) { self.remove_from_known_ports( - opctx, - member, + ctx, sled_id, &port_configs, &underlay_group, - dataplane_client, ) .await?; return Ok(()); @@ -1820,6 +1925,7 @@ impl MulticastGroupReconciler { member, &underlay_group, dataplane_client, + sled_to_ports, ) .await?; @@ -1830,11 +1936,9 @@ impl MulticastGroupReconciler { /// Ensures dataplane consistency by failing if removal operations fail. async fn cleanup_member_from_dataplane( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { opctx, group, member, .. } = ctx; debug!( opctx.log, "cleaning up member from dataplane"; @@ -1846,11 +1950,9 @@ impl MulticastGroupReconciler { ); // Strict removal from dataplane (fail on errors) - self.remove_member_from_dataplane(opctx, member, dataplane_client) - .await - .context( - "failed to remove member configuration via DPD during cleanup", - )?; + self.remove_member_from_dataplane(ctx).await.context( + "failed to remove member configuration via DPD during cleanup", + )?; info!( opctx.log, @@ -1870,15 +1972,25 @@ impl MulticastGroupReconciler { /// - Removing the member from any unexpected/stale rear ports /// - Adding the member to expected ports /// + /// If the sled cannot be resolved (e.g., decommissioned), the member + /// is transitioned to "Left" and M2P/forwarding is propagated inline + /// to remove stale entries. 
+ /// /// This handles cases like `sp_slot` changes where the sled's physical /// location changed but the `sled_id` stayed the same. async fn verify_members( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { + opctx, + group, + member, + dataplane_client, + sled_client, + sled_to_ports, + .. + } = ctx; debug!( opctx.log, "verifying joined member consistency"; @@ -1910,15 +2022,12 @@ impl MulticastGroupReconciler { .await .context("failed to fetch underlay group")?; - // Resolve expected member configurations (may refresh cache if TTL expired) - let expected_port_configs = match self - .resolve_sled_to_switch_ports( - opctx, - sled_id.into(), - dataplane_client, - ) - .await - { + // Resolve expected member configurations from the reconciliation + // pass map. + let expected_port_configs = match Self::resolve_sled_to_switch_ports( + sled_to_ports, + sled_id.into(), + ) { Ok(configs) => configs, Err(e) => { // If we can't resolve the sled anymore (e.g., removed from inventory), @@ -1932,13 +2041,24 @@ impl MulticastGroupReconciler { ); // Best effort removal on verification - let _ = self - .remove_member_from_dataplane( - opctx, - member, - dataplane_client, - ) - .await; + let _ = self.remove_member_from_dataplane(ctx).await; + + // Unsubscribe the instance before the CAS clears sled_id; + // otherwise, the OPTE subscription is stranded with no + // way to identify the sled on later passes. Best-effort + // since the VMM may already be torn down. + if let Err(e) = sled_client + .unsubscribe_instance(opctx, group, member, sled_id.into()) + .await + { + warn!( + opctx.log, + "failed to unsubscribe instance during port resolution failure"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } let updated = self .datastore @@ -1952,6 +2072,21 @@ impl MulticastGroupReconciler { .context("failed to transition member to 'Left' after port resolution failure")?; if updated { + // Propagate updated M2P/forwarding to remove + // stale entries for this now-Left member. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + member left due to unresolvable sled"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } info!( opctx.log, "member transitioned to 'Left': sled no longer resolvable"; @@ -1979,7 +2114,7 @@ impl MulticastGroupReconciler { .compute_active_member_ports( opctx, group.id(), - dataplane_client, + sled_to_ports, None, // Don't exclude any member ) .await @@ -2023,7 +2158,11 @@ impl MulticastGroupReconciler { } // If this port is not in our active member set, it's stale - if !active_ports.contains(¤t_member.port_id) { + let member_key: MemberPortKey = ( + current_member.port_id.clone(), + current_member.link_id, + ); + if !active_ports.contains(&member_key) { stale_ports.push(current_member.clone()); } } @@ -2105,6 +2244,24 @@ impl MulticastGroupReconciler { } } + // Ensure the instance subscription is in place. Sled-agent resolves + // the active VMM under its per-instance state lock, which keeps this + // call correct across live-migration propolis_id changes when the + // sled_id stays the same. The call is idempotent. 
+ if let Err(e) = sled_client + .subscribe_instance(opctx, group, member, sled_id.into()) + .await + { + warn!( + opctx.log, + "failed to verify instance subscription during member verification"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + return Err(e); + } + info!( opctx.log, "member verification completed"; @@ -2228,52 +2385,6 @@ impl MulticastGroupReconciler { .context("failed to list group members") } - /// Check cache for a sled mapping. - async fn check_sled_cache( - &self, - cache_key: SledUuid, - ) -> Option> { - let cache = self.sled_mapping_cache.read().await; - let (cached_at, mappings) = &*cache; - let elapsed = cached_at.elapsed(); - - if elapsed < self.sled_cache_ttl { - mappings.get(&cache_key).cloned() - } else { - None - } - } - - /// Detect backplane topology change and invalidate sled cache if needed. - /// - /// Compares the full (PortId, BackplaneLink) pairs to detect changes in: - /// - Port count (sleds added/removed) - /// - Port IDs (different physical slots) - /// - Link attributes (speed, lanes, connector type changes) - async fn handle_backplane_topology_change( - &self, - opctx: &OpContext, - previous_map: &Option, - new_map: &BackplaneMap, - ) { - if let Some(prev_map) = previous_map { - // Compare full maps (keys + values) to detect any topology changes - if prev_map != new_map { - info!( - opctx.log, - "backplane map topology change detected"; - "previous_port_count" => prev_map.len(), - "new_port_count" => new_map.len() - ); - info!( - opctx.log, - "invalidating sled mapping cache due to backplane topology change" - ); - self.invalidate_sled_mapping_cache().await; - } - } - } - /// Fetch the backplane map from DPD-client with caching. /// /// The client responds with the entire mapping of all cubbies in a rack. @@ -2285,13 +2396,10 @@ impl MulticastGroupReconciler { opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, ) -> Result { - // Check cache first - let previous_map = { + { let cache = self.backplane_map_cache.read().await; if let Some((cached_at, ref map)) = *cache { - let elapsed = cached_at.elapsed(); - - if elapsed < self.backplane_cache_ttl { + if cached_at.elapsed() < self.backplane_cache_ttl { trace!( opctx.log, "backplane map cache hit"; @@ -2299,14 +2407,9 @@ impl MulticastGroupReconciler { ); return Ok(map.clone()); } - // Cache expired but keep reference to previous map for comparison - Some(map.clone()) - } else { - None } - }; + } - // Fetch from DPD via dataplane client on cache miss debug!( opctx.log, "fetching backplane map from DPD (cache miss or stale)" @@ -2317,69 +2420,161 @@ impl MulticastGroupReconciler { "failed to query backplane_map from DPD via dataplane client", )?; - // Detect topology change and invalidate sled cache if needed - self.handle_backplane_topology_change( - opctx, - &previous_map, - &backplane_map, - ) - .await; - info!( opctx.log, "fetched backplane map from DPD"; "port_count" => backplane_map.len() ); - // Update cache let mut cache = self.backplane_map_cache.write().await; *cache = Some((Instant::now(), backplane_map.clone())); Ok(backplane_map) } - /// Resolve a sled ID to switch ports for multicast traffic. - pub async fn resolve_sled_to_switch_ports( + /// Build the reconciliation pass sled-to-port mapping. + /// + /// Tries DDM peer topology first (live, authoritative for reachable + /// sleds) when a switch-zone client is available. 
Falls back to + /// inventory + DPD backplane validation when DDM is unavailable, + /// returns an empty result, or no switch-zone client could be built + /// this pass. The returned map is consumed by a single reconciler + /// pass and dropped afterward, so peer-state churn between passes + /// resolves on the next tick. + async fn build_sled_port_map( &self, opctx: &OpContext, - sled_id: SledUuid, dataplane_client: &MulticastDataplaneClient, - ) -> Result, anyhow::Error> { - // Check cache first - if let Some(port_configs) = self.check_sled_cache(sled_id).await { - return Ok(port_configs); - } + switch_zone_client: Option<&MulticastSwitchZoneClient>, + ) -> Result { + // Fetch DPD's backplane map once per reconciliation pass. It accounts + // for the enumeration of valid PortId values (regardless of how + // a peer's `if_name` ~ interface name ~ is shaped), so we use it to + // cross-validate parsed DDM peers and to ground the inventory + // fallback's slot lookups. + let backplane_map = + self.fetch_backplane_map(opctx, dataplane_client).await?; - // Refresh cache if stale or missing entry - if let Err(e) = - self.refresh_sled_mapping_cache(opctx, dataplane_client).await - { - warn!( + // List in-service sleds once per reconciliation pass and share with + // both resolution paths, avoiding duplicate DB queries. + let sleds = self + .datastore + .sled_list_all_batched(opctx, SledFilter::InService) + .await + .context("failed to list in-service sleds")?; + + // Prefer DDM: it reflects live peer status (link state, cable + // up/down). Inventory is a periodic collection snapshot and can + // lag actual topology. DDM may also be partial (a flapping link + // can drop a sled out of peers temporarily, or test/sim + // populates DDM from an earlier inventory snapshot); when it + // is, fill gaps from inventory rather than treat the partial + // result as authoritative. + let mut mappings = match switch_zone_client { + Some(switch_zone_client) => self + .fetch_sled_mapping_from_ddm( + opctx, + switch_zone_client, + &backplane_map, + &sleds, + ) + .await + .unwrap_or_else(|e| { + debug!( + opctx.log, + "DDM peer resolution unavailable, relying on inventory"; + "error" => %e, + ); + HashMap::new() + }), + None => HashMap::new(), + }; + let mut drift_count = 0usize; + + if mappings.len() < sleds.len() { + debug!( opctx.log, - "failed to refresh sled mapping cache, using stale data"; - "sled_id" => %sled_id, - "error" => %e + "supplementing DDM-derived mapping with inventory fallback"; + "in_service_sleds" => sleds.len(), + "ddm_mapped_sleds" => mappings.len(), ); - // Try cache again even with stale data - if let Some(port_configs) = self.check_sled_cache(sled_id).await { - return Ok(port_configs); + // If inventory itself fails, keep whatever DDM gave us. + // Discarding the partial DDM map on inventory failure would + // strand all members for this pass when DDM had useful data + // we could have used. Next pass retries. + match self + .fetch_sled_mapping_from_inventory( + opctx, + dataplane_client, + backplane_map, + &sleds, + ) + .await + { + Ok(inventory_map) => { + // Surface inventory-vs-DDM drift signals before + // merging. (a) DDM-only: DDM lists a sled missing + // from the latest inventory collection, typical + // when inventory hasn't caught up to a + // freshly-attached sled. (b) Disagreement: both + // have the sled but with different port info; DDM + // wins (live state), but the inventory lag is + // worth flagging. 
+ // + // TODO: surface this drift as an observability + // signal rather than reconciliation pass logs. + for (sled_id, ddm_ports) in &mappings { + match inventory_map.get(sled_id) { + None => info!( + opctx.log, + "DDM is ahead of inventory, as sled in DDM peers but not in latest inventory"; + "sled_id" => %sled_id, + ), + Some(inv_ports) if inv_ports != ddm_ports => { + warn!( + opctx.log, + "DDM and inventory disagree on sled port mapping, preferring DDM"; + "sled_id" => %sled_id, + ); + drift_count += 1; + } + Some(_) => {} + } + } + + for (sled_id, ports) in inventory_map { + mappings.entry(sled_id).or_insert(ports); + } + } + Err(e) => { + warn!( + opctx.log, + "inventory fallback failed, proceeding with partial DDM map"; + "ddm_mapped_sleds" => mappings.len(), + "in_service_sleds" => sleds.len(), + "error" => %e, + ); + } } - // If cache refresh failed and no stale data, propagate error - return Err(e.context("failed to refresh sled mapping cache and no cached data available")); } - // Try cache again after successful refresh - if let Some(port_configs) = self.check_sled_cache(sled_id).await { - return Ok(port_configs); - } + Ok(SledPortMap { + sled_to_ports: mappings, + ddm_inventory_drift: drift_count, + }) + } - // Sled not found after successful cache refresh. We treat this as an error - // so callers can surface this condition rather than silently applying - // no changes. - Err(anyhow::Error::msg(format!( - "failed to resolve sled to switch ports: \ - sled {sled_id} not found in mapping cache (not a scrimlet or removed)" - ))) + /// Look up switch ports for a sled in the reconciliation pass mapping. + fn resolve_sled_to_switch_ports( + sled_to_ports: &HashMap>, + sled_id: SledUuid, + ) -> Result, anyhow::Error> { + sled_to_ports.get(&sled_id).cloned().ok_or_else(|| { + anyhow::Error::msg(format!( + "sled {sled_id} not found in reconciliation pass sled \ + mapping (not in DDM peers or inventory)" + )) + }) } /// Find SP in inventory for a given sled's baseboard. @@ -2414,8 +2609,8 @@ impl MulticastGroupReconciler { sp_slot: u32, backplane_map: &BackplaneMap, ) -> Result>, anyhow::Error> { - let port_id = dpd_client::types::PortId::Rear( - dpd_client::types::Rear::try_from(format!("rear{sp_slot}")) + let port_id = PortId::Rear( + Rear::try_from(format!("rear{sp_slot}")) .context("invalid rear port number")?, ); @@ -2443,8 +2638,8 @@ impl MulticastGroupReconciler { Ok(Some(vec![SwitchBackplanePort { port_id, - link_id: dpd_client::types::LinkId(0), - direction: dpd_client::types::Direction::Underlay, + link_id: LinkId(0), + direction: Direction::Underlay, }])) } @@ -2514,12 +2709,95 @@ impl MulticastGroupReconciler { /// /// Where `entry.cubby` is the physical cubby/slot number (same as our `sp_slot`), /// and this maps it to a `PortId::Rear` that DPD can program on the Tofino ASIC. - async fn refresh_sled_mapping_cache( + /// Fetch the sled-to-port mapping from DDM peer topology. + /// + /// DDM peers provide live sled-to-port mapping via the `if_name` + /// field (e.g., `"tfportrear0_0"`, `"tfportqsfp0_0"`). More current + /// than inventory. + /// + /// Joins active DDM peers (by IPv6 address) against the in-service + /// sled list and parses each peer's `tfport_` + /// interface name into a [`SwitchBackplanePort`]. Any DPD port + /// variant (rear, qsfp, ...) is supported; direction is derived + /// from the port kind. 
Parsed `PortId`s are cross-validated against
+    /// the DPD backplane map: peers whose port is unknown to DPD are
+    /// dropped, so the prefix shape (`tfport`) is just a tokenizer and
+    /// correctness rides on DPD's authoritative port enumeration.
+    async fn fetch_sled_mapping_from_ddm(
+        &self,
+        opctx: &OpContext,
+        switch_zone_client: &MulticastSwitchZoneClient,
+        backplane_map: &BackplaneMap,
+        sleds: &[Sled],
+    ) -> Result<HashMap<SledUuid, Vec<SwitchBackplanePort>>, anyhow::Error>
+    {
+        let peers = switch_zone_client
+            .get_ddm_peers()
+            .await
+            .context("failed to get DDM peers")?;
+
+        let addr_to_sled: HashMap<Ipv6Addr, SledUuid> = sleds
+            .iter()
+            .map(|sled| (sled.ip(), SledUuid::from(sled.id())))
+            .collect();
+
+        let mappings: HashMap<SledUuid, Vec<SwitchBackplanePort>> = peers
+            .iter()
+            .filter(|p| {
+                matches!(
+                    p.status,
+                    omicron_ddm_admin_client::types::PeerStatus::Active
+                )
+            })
+            .filter_map(|p| {
+                let if_name = p.if_name.as_ref()?;
+                let sled_id = *addr_to_sled.get(&p.addr)?;
+                let port = parse_ddm_if_name_to_port(if_name)?;
+                if !backplane_map.contains_key(&port.port_id) {
+                    debug!(
+                        opctx.log,
+                        "dropping DDM peer: port_id not in DPD backplane map";
+                        "if_name" => %if_name,
+                        "port_id" => %port.port_id,
+                    );
+                    return None;
+                }
+                Some((sled_id, port))
+            })
+            .fold(HashMap::new(), |mut acc, (sled_id, port)| {
+                acc.entry(sled_id).or_default().push(port);
+                acc
+            });
+
+        if mappings.is_empty() {
+            return Err(anyhow::Error::msg(
+                "no sled-to-port mappings resolved from DDM peers",
+            ));
+        }
+
+        debug!(
+            opctx.log,
+            "fetched sled mapping from DDM peers";
+            "mapped_sleds" => mappings.len(),
+        );
+
+        Ok(mappings)
+    }
+
+    /// Fetch the sled-to-port mapping from inventory (fallback).
+    ///
+    /// Used when DDM peer topology is unavailable. Joins the latest
+    /// inventory collection's SP records against the in-service sled
+    /// list, validating each `sp_slot` against the DPD backplane map
+    /// passed in by [`Self::build_sled_port_map`].
+    async fn fetch_sled_mapping_from_inventory(
         &self,
         opctx: &OpContext,
         dataplane_client: &MulticastDataplaneClient,
-    ) -> Result<(), anyhow::Error> {
-        // Fetch required data
+        mut backplane_map: BackplaneMap,
+        sleds: &[Sled],
+    ) -> Result<HashMap<SledUuid, Vec<SwitchBackplanePort>>, anyhow::Error>
+    {
         let inventory = self
             .datastore
             .inventory_get_latest_collection(opctx)
@@ -2529,21 +2807,11 @@ impl MulticastGroupReconciler {
                 anyhow::Error::msg("no inventory collection available")
             })?;
 
-        // First attempt with current backplane map
-        let mut backplane_map =
-            self.fetch_backplane_map(opctx, dataplane_client).await?;
-
-        let sleds = self
-            .datastore
-            .sled_list_all_batched(opctx, SledFilter::InService)
-            .await
-            .context("failed to list in-service sleds for inventory mapping")?;
-
-        // Build sled → port mappings
-        let (mut mappings, mut validation_failures) = self
-            .build_sled_mappings(opctx, &sleds, &inventory, &backplane_map)?;
+        let (mut mappings, mut validation_failures) =
+            self.build_sled_mappings(opctx, sleds, &inventory, &backplane_map)?;
 
-        // If we had validation failures, invalidate backplane cache and retry once
+        // Validation failures may indicate stale backplane data, so we refresh
+        // and retry once before reporting.
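The fallback merge described above uses `entry().or_insert`, so a DDM-resolved sled is never overwritten by inventory; inventory only fills gaps. A small self-contained illustration of that precedence with plain string keys standing in for the real sled/port types:

```rust
use std::collections::HashMap;

fn main() {
    // DDM-derived mapping (live view): sled A on rear0.
    let mut merged: HashMap<&str, Vec<&str>> =
        HashMap::from([("sled-a", vec!["rear0"])]);
    // Inventory mapping (may lag): sled A on rear1, plus sled B.
    let inventory: HashMap<&str, Vec<&str>> =
        HashMap::from([("sled-a", vec!["rear1"]), ("sled-b", vec!["rear2"])]);

    // or_insert only fills sleds DDM did not resolve, so on
    // disagreement the live DDM view wins.
    for (sled, ports) in inventory {
        merged.entry(sled).or_insert(ports);
    }

    assert_eq!(merged["sled-a"], vec!["rear0"]); // DDM wins
    assert_eq!(merged["sled-b"], vec!["rear2"]); // gap filled
}
```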
if validation_failures > 0 { info!( opctx.log, @@ -2551,10 +2819,8 @@ impl MulticastGroupReconciler { "validation_failures" => validation_failures ); - // Invalidate the backplane cache self.invalidate_backplane_cache().await; - // Fetch fresh backplane map backplane_map = self .fetch_backplane_map(opctx, dataplane_client) .await @@ -2562,7 +2828,6 @@ impl MulticastGroupReconciler { "failed to fetch fresh backplane map after invalidation", )?; - // Retry mapping with fresh backplane data (mappings, validation_failures) = self.build_sled_mappings( opctx, &sleds, @@ -2570,7 +2835,6 @@ impl MulticastGroupReconciler { &backplane_map, )?; - // Log sleds that still fail with fresh backplane data if validation_failures > 0 { warn!( opctx.log, @@ -2580,16 +2844,11 @@ impl MulticastGroupReconciler { } } - // Update cache let sled_count = mappings.len(); - let mut cache = self.sled_mapping_cache.write().await; - *cache = (Instant::now(), mappings); - - // Log results if validation_failures > 0 { warn!( opctx.log, - "sled mapping cache refreshed with validation failures"; + "fetched sled mapping from inventory with validation failures"; "total_sleds" => sleds.len(), "mapped_sleds" => sled_count, "validation_failures" => validation_failures @@ -2597,31 +2856,42 @@ impl MulticastGroupReconciler { } else { info!( opctx.log, - "sled mapping cache refreshed successfully"; + "fetched sled mapping from inventory"; "total_sleds" => sleds.len(), "mapped_sleds" => sled_count ); } - Ok(()) + Ok(mappings) } /// Cleanup a member that is marked for deletion (time_deleted set). + /// + /// This includes unsubscribing a member from its VMM, removing + /// it from the dataplane, and hard-deleting the DB row. async fn cleanup_deleted_member( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { opctx, group, member, sled_client, .. } = ctx; + // Unsubscribe from sled-agent (best-effort, VMM may be gone). + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_instance(opctx, group, member, sled_id.into()) + .await + { + debug!( + opctx.log, + "failed to unsubscribe instance during member cleanup"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Use the consolidated cleanup helper with strict error handling - self.cleanup_member_from_dataplane( - opctx, - group, - member, - dataplane_client, - ) - .await + self.cleanup_member_from_dataplane(ctx).await } /// Get all multicast groups that need member reconciliation. @@ -2642,3 +2912,88 @@ impl MulticastGroupReconciler { ) } } + +/// Parse a DDM peer interface name (e.g., `"tfportrear0_0"`) into a +/// `SwitchBackplanePort` for sled-bound multicast member programming. +/// +/// The DDM peer `if_name` follows `tfport_`, where +/// `` is a DPD-recognized port name. This parser deliberately +/// rejects any non-rear `PortId`. In production, a sled's only +/// physical path to a switch is the rack backplane. +/// +/// TODO: Egress (uplink) members are not yet implemented. When they +/// land, they will come from group-level configuration applied +/// directly via DPD rather than from DDM peer discovery. See the +/// `TODO` in [`MulticastGroupReconciler::add_member_to_dataplane`]. 
+fn parse_ddm_if_name_to_port(if_name: &str) -> Option<SwitchBackplanePort> {
+    use std::str::FromStr;
+
+    let stripped = if_name.strip_prefix("tfport")?;
+    let (port_str, link_str) = stripped.rsplit_once('_')?;
+
+    let port_id = PortId::from_str(port_str).ok()?;
+    let PortId::Rear(_) = port_id else {
+        return None;
+    };
+    let link_id = LinkId(link_str.parse::<u8>().ok()?);
+
+    Some(SwitchBackplanePort {
+        port_id,
+        link_id,
+        direction: Direction::Underlay,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_valid_rear_port() {
+        let port = parse_ddm_if_name_to_port("tfportrear0_0").unwrap();
+        assert_eq!(
+            port.port_id,
+            PortId::Rear(Rear::try_from("rear0".to_string()).unwrap())
+        );
+        assert_eq!(port.link_id, LinkId(0));
+        assert_eq!(port.direction, Direction::Underlay);
+    }
+
+    #[test]
+    fn parse_higher_port_number() {
+        let port = parse_ddm_if_name_to_port("tfportrear31_0").unwrap();
+        assert_eq!(
+            port.port_id,
+            PortId::Rear(Rear::try_from("rear31".to_string()).unwrap())
+        );
+    }
+
+    #[test]
+    fn parse_nonzero_link() {
+        let port = parse_ddm_if_name_to_port("tfportrear5_2").unwrap();
+        assert_eq!(port.link_id, LinkId(2));
+    }
+
+    #[test]
+    fn parse_non_rear_port_returns_none() {
+        // Sleds only attach via rear ports; reject other variants.
+        assert!(parse_ddm_if_name_to_port("tfportqsfp0_0").is_none());
+    }
+
+    #[test]
+    fn parse_invalid_prefix_returns_none() {
+        assert!(parse_ddm_if_name_to_port("eth0").is_none());
+        assert!(parse_ddm_if_name_to_port("").is_none());
+    }
+
+    #[test]
+    fn parse_missing_underscore_returns_none() {
+        assert!(parse_ddm_if_name_to_port("tfportrear0").is_none());
+    }
+
+    #[test]
+    fn parse_non_numeric_returns_none() {
+        assert!(parse_ddm_if_name_to_port("tfportrearX_0").is_none());
+        assert!(parse_ddm_if_name_to_port("tfportrear0_Y").is_none());
+    }
+}
diff --git a/nexus/src/app/background/tasks/multicast/mod.rs b/nexus/src/app/background/tasks/multicast/mod.rs
index 8f592a41087..2f6ff6c0a0d 100644
--- a/nexus/src/app/background/tasks/multicast/mod.rs
+++ b/nexus/src/app/background/tasks/multicast/mod.rs
@@ -32,6 +32,8 @@
 //! - Dataplane state convergence
 //! - Group and Member state checks and transitions ("Joining" → "Joined" → "Left")
 //! - Drift detection and correction
+//! - Switch zone coordination: MRIB route programming through MGD,
+//!   peer topology lookups from DDM
 //! - Cleanup of orphaned resources
 //!
 //! ## Multicast Group Architecture
@@ -84,7 +86,7 @@
 //! - Unlike linear probing (`h + i`), scattered outputs avoid clustering
 //! - **8-bit salt**: 256 unique underlay addresses per external IP
 //! - **Resolution**: Exhaustion requires 256 other groups to occupy exactly
-//!   those 256 scattered addresses—effectively impossible in 2^64 space
+//!   those 256 scattered addresses, effectively impossible in 2^64 space
 //!
 //! ### Forwarding Architecture (Incoming multicast traffic to guests)
 //!
@@ -105,7 +107,33 @@
 //! - **Group lifecycle**: "Creating" → "Active" → "Deleting" → hard-deleted
 //! - **Member lifecycle**: "Joining" → "Joined" → "Left" → soft-deleted → hard-deleted
 //! - **Dataplane updates**: DPD API calls for P4 table updates
-//! - **Topology mapping**: Sled-to-switch-port resolution (with caching)
+//! - **MRIB programming**: multicast routing entries written through
+//!   MGD, diffed against a per-pass snapshot and withdrawn when no
+//!   "Joined" members remain so DDM peers stop sending traffic
+//! - **Sled propagation**: M2P mappings and forwarding entries pushed to sled-agents
+//! 
- **OPTE subscriptions**: Per-instance multicast group subscriptions +//! on target sleds (keyed at the sled by the active VMM's propolis-id) +//! - **Topology mapping**: Per-pass sled-to-switch-port resolution from +//! DDM peers (primary) or inventory + DPD backplane (fallback) +//! +//! ## RPW Saga Coordination +//! +//! The reconciler launches sagas for transactional operations +//! (e.g. external+underlay group ensure). By default sagas retry +//! independently and the next reconciler tick observes the resulting +//! state. +//! +//! For group creation, the reconciler instead drains saga completion +//! within the same pass so [`reconcile_member_states`] and +//! [`reconcile_active_groups`] can converge in one tick. The motivation +//! is operator-visible latency: members see multicast settle within a +//! single reconciler interval of joining, rather than waiting an +//! additional tick for the saga's effects to be observed. The drain is +//! bounded by the enclosing `buffer_unordered` concurrency, so multiple +//! groups still progress in parallel. +//! +//! This mirrors the `saga_run` + drain pattern used by +//! [`instance_reincarnation`] and [`instance_updater`]. //! //! ## Deletion Semantics: Groups vs Members //! @@ -126,9 +154,13 @@ //! - Cleanup task eventually hard-deletes the row //! //! [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346 +//! [`UNDERLAY_MULTICAST_SUBNET`]: omicron_common::address::UNDERLAY_MULTICAST_SUBNET +//! [`reconcile_member_states`]: MulticastGroupReconciler::reconcile_member_states +//! [`reconcile_active_groups`]: MulticastGroupReconciler::reconcile_active_groups +//! [`instance_reincarnation`]: crate::app::background::tasks::instance_reincarnation +//! [`instance_updater`]: crate::app::background::tasks::instance_updater -use std::collections::{BTreeMap, HashMap}; -use std::net::{IpAddr, Ipv6Addr}; +use std::collections::BTreeMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -138,27 +170,21 @@ use internal_dns_resolver::Resolver; use serde_json::json; use slog::{error, info}; use tokio::sync::RwLock; -use tokio::sync::watch::Receiver; use nexus_db_model::MulticastGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::MulticastGroupReconcilerStatus; -use nexus_types::inventory::{Collection, SpType}; -use omicron_common::address::UNDERLAY_MULTICAST_SUBNET; -use omicron_uuid_kinds::SledUuid; -use sled_hardware_types::BaseboardId; use crate::app::background::BackgroundTask; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; +use crate::app::multicast::switch_zone::MulticastSwitchZoneClient; use crate::app::saga::StartSaga; pub(crate) mod groups; pub(crate) mod members; - -/// Type alias for the sled mapping cache. -type SledMappingCache = - Arc>)>>; +mod mrib; /// Type alias for the backplane map cache. type BackplaneMapCache = Arc< @@ -187,7 +213,7 @@ pub(crate) enum StateTransition { } /// Switch port configuration for multicast group members. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct SwitchBackplanePort { /// Switch port ID pub port_id: dpd_client::types::PortId, @@ -203,12 +229,6 @@ pub(crate) struct MulticastGroupReconciler { datastore: Arc, resolver: Resolver, sagas: Arc, - /// Receiver for inventory updates from the inventory loader background task. - rx_inventory: Receiver>>, - /// Cache for sled-to-backplane-port mappings. 
- /// Maps sled_id → rear backplane ports for multicast traffic routing. - sled_mapping_cache: SledMappingCache, - sled_cache_ttl: Duration, /// Cache for backplane hardware topology from DPD. /// Maps PortId → BackplaneLink for platform-specific port validation. backplane_map_cache: BackplaneMapCache, @@ -219,12 +239,6 @@ pub(crate) struct MulticastGroupReconciler { group_concurrency_limit: usize, /// Whether multicast functionality is enabled (or not). enabled: bool, - /// Last seen sled baseboard→sp_slot mappings for cache invalidation. - /// - /// We track sled locations (keyed by baseboard identity), as sled - /// physical locations rarely change. Caches are only invalidated - /// when `sp_slot` values differ. - last_seen_sled_slots: HashMap, u16>, } impl MulticastGroupReconciler { @@ -232,27 +246,18 @@ impl MulticastGroupReconciler { datastore: Arc, resolver: Resolver, sagas: Arc, - rx_inventory: Receiver>>, enabled: bool, - sled_cache_ttl: Duration, backplane_cache_ttl: Duration, ) -> Self { Self { datastore, resolver, sagas, - rx_inventory, - sled_mapping_cache: Arc::new(RwLock::new(( - Instant::now(), - HashMap::new(), - ))), - sled_cache_ttl, backplane_map_cache: Arc::new(RwLock::new(None)), backplane_cache_ttl, member_concurrency_limit: 100, group_concurrency_limit: 100, enabled, - last_seen_sled_slots: HashMap::new(), } } @@ -273,174 +278,6 @@ impl MulticastGroupReconciler { let mut cache = self.backplane_map_cache.write().await; *cache = None; // Clear the cache entirely } - - /// Invalidate the sled mapping cache, forcing refresh on next access. - /// - /// Called when: - /// - Backplane topology changes detected (different port count/layout) - /// - Need to re-validate sled mappings against new topology - pub(crate) async fn invalidate_sled_mapping_cache(&self) { - let mut cache = self.sled_mapping_cache.write().await; - // Set timestamp to past to force refresh on next check - *cache = (Instant::now() - self.sled_cache_ttl, cache.1.clone()); - } - - /// Check if sled locations changed and invalidate caches if so. - /// - /// Compares actual serial→sp_slot mappings since sled locations rarely - /// change. Uses the inventory watch channel for cheap access to latest - /// inventory. - async fn check_sled_locations_for_cache_invalidation( - &mut self, - opctx: &OpContext, - ) { - // Get inventory from watch channel (cheap Arc::clone, no DB query) - let Some(inventory) = - self.rx_inventory.borrow_and_update().as_ref().map(Arc::clone) - else { - debug!( - opctx.log, - "skipping cache invalidation check: no inventory available" - ); - return; - }; - - // Build current baseboard→sp_slot mapping for sleds only - let current_sled_slots: HashMap, u16> = inventory - .sps - .iter() - .filter(|(_, sp)| sp.sp_type == SpType::Sled) - .map(|(baseboard, sp)| (Arc::clone(baseboard), sp.sp_slot)) - .collect(); - - if current_sled_slots != self.last_seen_sled_slots { - // Skip invalidation on first run (just initializing) - if !self.last_seen_sled_slots.is_empty() { - info!( - opctx.log, - "invalidating multicast caches due to sled location change"; - "previous_sled_count" => self.last_seen_sled_slots.len(), - "current_sled_count" => current_sled_slots.len() - ); - self.invalidate_backplane_cache().await; - self.invalidate_sled_mapping_cache().await; - } - self.last_seen_sled_slots = current_sled_slots; - } - } -} - -/// Maps an external multicast address to an underlay address in ff04::/64. 
-/// -/// Maps external addresses into [`UNDERLAY_MULTICAST_SUBNET`] (ff04::/64, -/// a subset of the admin-local scope ff04::/16 per RFC 7346) using XOR-fold. This prefix is static -/// for consistency across racks. -/// -/// See [RFC 7346] for IPv6 multicast admin-local scope. -/// -/// # Salt Parameter (Collision Avoidance) -/// -/// The `salt` enables collision avoidance via XOR perturbation. XOR is bijective: -/// distinct salts produce distinct outputs (since `a ⊕ b = a ⊕ c` implies `b = c`), -/// guaranteeing 256 unique addresses per external IP. -/// -/// This is mathematically equivalent to [binary probing] in hash table literature -/// (`h_i(x) := h(x) ⊕ i`), though the domain context differs in that we're mapping -/// into a sparse 2^64 IPv6 address space rather than probing array slots. -/// -/// ```text -/// Salt perturbation example (h = 0xa): -/// ┌──────┬─────────┬────────┐ -/// │ salt │ h ⊕ salt│ output │ -/// ├──────┼─────────┼────────┤ -/// │ 0 │ 0xa ⊕ 0 │ 0xa │ -/// │ 1 │ 0xa ⊕ 1 │ 0xb │ -/// │ 2 │ 0xa ⊕ 2 │ 0x8 │ -/// │ 3 │ 0xa ⊕ 3 │ 0x9 │ -/// │ 4 │ 0xa ⊕ 4 │ 0xe │ -/// │ 5 │ 0xa ⊕ 5 │ 0xf │ -/// │ 6 │ 0xa ⊕ 6 │ 0xc │ -/// │ 7 │ 0xa ⊕ 7 │ 0xd │ -/// └──────┴─────────┴────────┘ -/// Outputs: [a, b, 8, 9, e, f, c, d] — scattered, not sequential -/// ``` -/// -/// On collision (i.e., underlay IP already in use), we increment salt and retry. -/// This stores the successful salt with the group for deterministic -/// reconstruction. -/// -/// # Implementation -/// -/// ```text -/// underlay_ip = ff04:: | ((xor_fold(external_ip) ⊕ salt) & HOST_MASK) -/// ``` -/// -/// - IPv4: embedded directly (32 bits fits in 64-bit host space) -/// - IPv6: XOR upper and lower 64-bit halves to fold 128→64 bits -/// - Salt ∈ [0, 255]: XORed into host bits for collision retry -/// -/// The `& HOST_MASK` guarantees the result stays within ff04::/64, our static -/// underlay subnet. -/// -/// [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346 -/// [binary probing]: https://courses.grainger.illinois.edu/CS473/fa2025/notes/05-hashing.pdf -fn map_external_to_underlay_ip(external_ip: IpAddr, salt: u8) -> IpAddr { - // Derive constants from the default underlay multicast subnet - const HOST_BITS: u32 = 128 - UNDERLAY_MULTICAST_SUBNET.width() as u32; - let prefix_base = - u128::from_be_bytes(UNDERLAY_MULTICAST_SUBNET.addr().octets()); - - map_external_to_underlay_ip_impl(prefix_base, HOST_BITS, external_ip, salt) -} - -/// Core implementation: maps external multicast IP to underlay IPv6 address. -/// -/// Separated for testing purposes. -/// -/// Parameters: -/// - `prefix_base`: Network prefix as u128 (e.g., ff04:: → 0xff04_0000_...) -/// - `host_bits`: Number of host bits (e.g., 64 for a /64 prefix) -/// - `external_ip`: The external multicast address to map -/// - `salt`: XOR perturbation for collision avoidance (0-255) -/// -/// Returns: The mapped underlay IPv6 address -fn map_external_to_underlay_ip_impl( - prefix_base: u128, - host_bits: u32, - external_ip: IpAddr, - salt: u8, -) -> IpAddr { - let host_mask: u128 = - if host_bits >= 128 { u128::MAX } else { (1u128 << host_bits) - 1 }; - - // Derive host value from external IP - let host_value: u128 = match external_ip { - IpAddr::V4(ipv4) => { - // IPv4 (32 bits) fits directly in host space - u128::from(u32::from_be_bytes(ipv4.octets())) - } - IpAddr::V6(ipv6) => { - // XOR-fold 128 bits → host_bits (upper ^ lower). 
- // This ensures different external addresses (even with identical - // lower bits but different scopes) map to different underlay IPs. - let full = u128::from_be_bytes(ipv6.octets()); - if host_bits >= 128 { - full - } else { - (full >> host_bits) ^ (full & host_mask) - } - } - }; - - // XOR salt for collision avoidance retry, masked to stay in host bits. - // The salt is applied after folding, ensuring different salts produce - // different underlay IPs while staying within the prefix. - let salted = (host_value ^ u128::from(salt)) & host_mask; - - // Combine prefix + host (masking guarantees result stays in prefix) - let underlay = prefix_base | salted; - - IpAddr::V6(Ipv6Addr::from(underlay.to_be_bytes())) } impl BackgroundTask for MulticastGroupReconciler { @@ -513,7 +350,23 @@ impl MulticastGroupReconciler { trace!(opctx.log, "starting multicast reconciliation pass"); - self.check_sled_locations_for_cache_invalidation(opctx).await; + // Per-pass client construction policy: + // + // - DPD (dataplane): fail-closed. Required by every step. A + // pass without DPD has nothing useful to do. + // - sled-agent: never fails. The wrapper builds per-sled + // clients on demand, so construction is infallible. + // - MGD MRIB: fail-open. Only three steps are MRIB-coupled + // (member states, active reconciliation, deleting + // reconciliation). Creating-group reconciliation and the two + // cleanup steps run regardless. Subsequent passes retry the + // gated steps when MRIB returns. + // + // The non-gated cleanup steps never touch the dataplane. + // `cleanup_empty_groups` only marks "Deleting", and the terminal + // "Deleting" → "Deleted" transition lives in the gated + // `reconcile_deleting_groups`. A group therefore cannot vanish + // from the reconciler's view while its MRIB route still exists. // Create dataplane client (across switches) once for the entire // reconciliation pass (in case anything has changed) @@ -533,6 +386,35 @@ impl MulticastGroupReconciler { } }; + // Create sled-agent client for OPTE subscriptions and + // M2P/forwarding propagation. + let sled_client = MulticastSledClient::new( + self.datastore.clone(), + self.resolver.clone(), + ); + + // Create MGD MRIB client for multicast route distribution + // via DDM. `mg-lower` syncs MRIB changes to DDM automatically. + // + // Construction failure (e.g., transient DNS resolution returning + // no switch zones) skips MRIB-coupled work this pass but lets + // creating-group and cleanup paths progress. Subsequent passes + // will retry. + let switch_zone_client = match MulticastSwitchZoneClient::new( + self.resolver.clone(), + opctx.log.clone(), + ) + .await + { + Ok(client) => Some(client), + Err(e) => { + let msg = + format!("failed to create multicast MRIB client: {e:#}"); + status.errors.push(msg); + None + } + }; + // Process creating groups match self.reconcile_creating_groups(opctx).await { Ok(count) => status.groups_created += count, @@ -542,9 +424,25 @@ impl MulticastGroupReconciler { } } - // Process member state changes - match self.reconcile_member_states(opctx, &dataplane_client).await { - Ok(count) => status.members_processed += count, + // Process member state changes. The switch-zone client is optional. + // When absent, the per-pass sled-to-port map skips the DDM primary + // path and uses the inventory fallback. DB-only transitions + // ("Joining" → "Left") converge regardless. "Joining" → "Joined" + // transitions for stopped/migrating instances retries on the next pass + // once MGD/DDM are reachable. 
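A condensed sketch of the fail-open gating policy described in the comments above, with hypothetical stand-ins for the per-pass MRIB client and status record:

```rust
// Hypothetical stand-ins for the per-pass MRIB client and status.
struct MribClient;

#[derive(Default)]
struct Status {
    errors: Vec<String>,
    skipped: Vec<String>,
}

fn build_mrib_client() -> Result<MribClient, String> {
    // e.g., transient DNS resolution returning no switch zones
    Err("no switch zones resolved".to_string())
}

fn run_pass(status: &mut Status) {
    // Fail-open: a construction error gates only the MRIB-coupled
    // steps for this pass; the next tick retries.
    let mrib = match build_mrib_client() {
        Ok(client) => Some(client),
        Err(e) => {
            status.errors.push(format!("failed to create MRIB client: {e}"));
            None
        }
    };

    if let Some(_client) = &mrib {
        // ... reconcile_active_groups / reconcile_deleting_groups ...
    } else {
        status.skipped.push("reconcile_active_groups".to_string());
        status.skipped.push("reconcile_deleting_groups".to_string());
    }
}

fn main() {
    let mut status = Status::default();
    run_pass(&mut status);
    assert_eq!(status.skipped.len(), 2);
}
```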
+ match self + .reconcile_member_states( + opctx, + &dataplane_client, + &sled_client, + switch_zone_client.as_ref(), + ) + .await + { + Ok(counts) => { + status.members_processed += counts.processed; + status.ddm_inventory_drift += counts.ddm_inventory_drift; + } Err(e) => { let msg = format!("failed to reconcile member states: {e:#}"); status.errors.push(msg); @@ -573,22 +471,48 @@ impl MulticastGroupReconciler { } } - // Reconcile active groups (verify state, update dataplane as needed) - match self.reconcile_active_groups(opctx, &dataplane_client).await { - Ok(count) => status.groups_verified += count, - Err(e) => { - let msg = format!("failed to reconcile active groups: {e:#}"); - status.errors.push(msg); + // Reconcile active groups + if let Some(switch_zone_client) = &switch_zone_client { + match self + .reconcile_active_groups( + opctx, + &dataplane_client, + &sled_client, + switch_zone_client, + ) + .await + { + Ok(count) => status.groups_verified += count, + Err(e) => { + let msg = + format!("failed to reconcile active groups: {e:#}"); + status.errors.push(msg); + } } + } else { + status.skipped.push("reconcile_active_groups".to_string()); } - // Process deleting groups (DPD cleanup + hard-delete from DB) - match self.reconcile_deleting_groups(opctx, &dataplane_client).await { - Ok(count) => status.groups_deleted += count, - Err(e) => { - let msg = format!("failed to reconcile deleting groups: {e:#}"); - status.errors.push(msg); + // Process deleting groups + if let Some(switch_zone_client) = &switch_zone_client { + match self + .reconcile_deleting_groups( + opctx, + &dataplane_client, + &sled_client, + switch_zone_client, + ) + .await + { + Ok(count) => status.groups_deleted += count, + Err(e) => { + let msg = + format!("failed to reconcile deleting groups: {e:#}"); + status.errors.push(msg); + } } + } else { + status.skipped.push("reconcile_deleting_groups".to_string()); } trace!( @@ -609,11 +533,12 @@ impl MulticastGroupReconciler { #[cfg(test)] mod tests { - use super::*; - use std::collections::HashSet; - use std::net::{Ipv4Addr, Ipv6Addr}; + use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; + use crate::app::multicast::{ + map_external_to_underlay_ip, map_external_to_underlay_ip_impl, + }; use ipnet::Ipv6Net; use omicron_common::address::IPV6_ADMIN_SCOPED_MULTICAST_PREFIX; diff --git a/nexus/src/app/background/tasks/multicast/mrib.rs b/nexus/src/app/background/tasks/multicast/mrib.rs new file mode 100644 index 00000000000..e2e621ad0e7 --- /dev/null +++ b/nexus/src/app/background/tasks/multicast/mrib.rs @@ -0,0 +1,186 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! MRIB route reconciliation for active multicast groups. +//! +//! This diffs the desired switch MRIB state, derived from group, member, and +//! source filter records, against a per-pass snapshot fetched by the +//! caller, then issues add/remove RPCs to converge. Best-effort: +//! failures are logged and retried on the next reconciler pass. 
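Before the implementation, a worked example of the desired-route set described above: for a group with joined members, each pinned source becomes a `Some(addr)` route and an any-source member contributes a `None` wildcard entry (a sketch using plain locals in place of the `SourceFilterState` fields):

```rust
use std::collections::HashSet;
use std::net::{IpAddr, Ipv4Addr};

fn main() {
    // Mirrors SourceFilterState: two pinned sources plus one member
    // that joined without a source list (any-source).
    let specific_sources = vec![
        IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)),
        IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)),
    ];
    let has_any_source_member = true;
    let has_joined = true;

    let desired: HashSet<Option<IpAddr>> = if has_joined {
        specific_sources
            .iter()
            .map(|s| Some(*s))
            .chain(has_any_source_member.then_some(None))
            .collect()
    } else {
        // No joined members: withdraw everything.
        HashSet::new()
    };

    // Three routes: (G, S1), (G, S2), and the any-source (G, *) entry.
    assert_eq!(desired.len(), 3);
    assert!(desired.contains(&None));
}
```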
+
+use std::collections::HashSet;
+use std::net::{IpAddr, Ipv6Addr};
+
+use slog::{debug, warn};
+use slog_error_chain::InlineErrorChain;
+use uuid::Uuid;
+
+use nexus_db_model::{MulticastGroup, MulticastGroupMemberState};
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_db_queries::db::datastore::multicast::members::SourceFilterState;
+use nexus_types::identity::Resource;
+use omicron_common::api::external::DataPageParams;
+use omicron_uuid_kinds::{GenericUuid, MulticastGroupUuid};
+
+use crate::app::multicast::switch_zone::{
+    MribRouteIndex, MulticastSwitchZoneClient,
+};
+
+/// Reconcile MRIB routes for a single active group against the per-pass
+/// switch snapshot. Withdraws routes when no "Joined" members remain so
+/// peer sleds stop sending traffic.
+pub(super) async fn reconcile_group(
+    opctx: &OpContext,
+    datastore: &DataStore,
+    switch_zone_client: &MulticastSwitchZoneClient,
+    mrib_route_index: Option<&MribRouteIndex>,
+    group: &MulticastGroup,
+    source_filter: &SourceFilterState,
+    underlay_group_id: Uuid,
+) {
+    let group_id = MulticastGroupUuid::from_untyped_uuid(group.id());
+
+    let members = match datastore
+        .multicast_group_members_list(
+            opctx,
+            group_id,
+            &DataPageParams::max_page(),
+        )
+        .await
+    {
+        Ok(m) => m,
+        Err(e) => {
+            warn!(
+                opctx.log,
+                "failed to list members for MRIB reconcile, skipping";
+                "group_id" => %group.id(),
+                "error" => InlineErrorChain::new(&e),
+            );
+            return;
+        }
+    };
+    let has_joined =
+        members.iter().any(|m| m.state == MulticastGroupMemberState::Joined);
+
+    let underlay_group = match datastore
+        .underlay_multicast_group_fetch(opctx, underlay_group_id)
+        .await
+    {
+        Ok(g) => g,
+        Err(e) => {
+            warn!(
+                opctx.log,
+                "failed to fetch underlay group for MRIB reconcile, skipping";
+                "group_id" => %group.id(),
+                "underlay_group_id" => %underlay_group_id,
+                "error" => InlineErrorChain::new(&e),
+            );
+            return;
+        }
+    };
+
+    let IpAddr::V6(underlay_ip) = underlay_group.multicast_ip.ip() else {
+        warn!(
+            opctx.log,
+            "underlay multicast group has non-IPv6 address";
+            "group_id" => %group.id(),
+            "underlay_ip" => %underlay_group.multicast_ip.ip(),
+        );
+        return;
+    };
+
+    converge_routes(
+        opctx,
+        switch_zone_client,
+        mrib_route_index,
+        group,
+        source_filter,
+        underlay_ip,
+        has_joined,
+    )
+    .await;
+}
+
+/// Diff the per-pass MRIB snapshot against the desired route set and
+/// issue add/remove RPCs to converge.
+async fn converge_routes(
+    opctx: &OpContext,
+    switch_zone_client: &MulticastSwitchZoneClient,
+    mrib_route_index: Option<&MribRouteIndex>,
+    group: &MulticastGroup,
+    source_filter: &SourceFilterState,
+    underlay_ip: Ipv6Addr,
+    has_joined: bool,
+) {
+    let group_ip = group.multicast_ip.ip();
+    let current = mrib_route_index
+        .and_then(|index| index.get(&group_ip))
+        .cloned()
+        .unwrap_or_default();
+    let current_sources = current.keys().copied().collect::<HashSet<_>>();
+    let desired: HashSet<Option<IpAddr>> = if has_joined {
+        source_filter
+            .specific_sources
+            .iter()
+            .map(|s| Some(*s))
+            .chain(source_filter.has_any_source_member.then_some(None))
+            .collect()
+    } else {
+        HashSet::new()
+    };
+
+    // Ensure desired routes exist.
+ for source in &desired { + let current_switches = current.get(source).cloned().unwrap_or_default(); + if current_switches.len() == switch_zone_client.switch_count() + && current_switches.values().all(|c| *c == underlay_ip) + { + continue; + } + if let Err(e) = + switch_zone_client.add_route(group_ip, underlay_ip, *source).await + { + warn!( + opctx.log, + "failed to ensure MRIB route"; + "group_id" => %group.id(), + "source" => ?source, + "error" => %e, + ); + } + } + + // Remove routes no longer desired. The per-pass snapshot lets us + // reconcile against current switch state without per-group RPCs. + for source in current_sources.difference(&desired) { + if let Err(e) = switch_zone_client.remove_route(group_ip, *source).await + { + warn!( + opctx.log, + "failed to remove stale MRIB route"; + "group_id" => %group.id(), + "source" => ?source, + "error" => %e, + ); + } + } + + // Surface RPF flux for diagnostics. The route lands in `mrib_in` + // after `add_route` but only flows once promoted to `mrib_loc`. + for source in &desired { + if !switch_zone_client + .route_active_on_all_switches(group_ip, *source) + .await + { + debug!( + opctx.log, + "MRIB route not yet RPF-verified on all switches"; + "group_id" => %group.id(), + "group_ip" => %group_ip, + "source" => ?source, + ); + } + } +} diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index d24a401b317..6792a4ff843 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1007,12 +1007,11 @@ impl super::Nexus { { if let (InstanceStateChangeError::SledAgent(inner), Some(vmm)) = (&e, state.vmm()) + && inner.vmm_gone() { - if inner.vmm_gone() { - let _ = self - .mark_vmm_failed(opctx, authz_instance, vmm, inner) - .await; - } + let _ = self + .mark_vmm_failed(opctx, authz_instance, vmm, inner) + .await; } return Err(e); @@ -1097,20 +1096,6 @@ impl super::Nexus { ) .await?; - // Update multicast member state for this instance to "Left" and clear - // `sled_id` - only if multicast is enabled - if self.multicast_enabled() { - self.db_datastore - .multicast_group_members_detach_by_instance( - opctx, - InstanceUuid::from_untyped_uuid(authz_instance.id()), - ) - .await?; - } - - // Activate multicast reconciler to handle switch-level changes - self.background_tasks.task_multicast_reconciler.activate(); - if let Err(e) = self .instance_request_state( opctx, @@ -1122,17 +1107,30 @@ impl super::Nexus { { if let (InstanceStateChangeError::SledAgent(inner), Some(vmm)) = (&e, state.vmm()) + && inner.vmm_gone() { - if inner.vmm_gone() { - let _ = self - .mark_vmm_failed(opctx, authz_instance, vmm, inner) - .await; - } + let _ = self + .mark_vmm_failed(opctx, authz_instance, vmm, inner) + .await; } return Err(e); } + // Detach multicast members (state -> "Left", clear `sled_id`) only + // after sled-agent has acknowledged the Stop request. Doing it + // before the request would tear down M2P/forwarding for a guest + // that is still running if the request fails. 
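A minimal sketch of the stop-path ordering this hunk establishes, with hypothetical helpers standing in for `instance_request_state` and the datastore detach call:

```rust
// Hypothetical stand-ins for the stop-path steps.
#[derive(Debug)]
struct StopError;

fn request_stop() -> Result<(), StopError> {
    // sled-agent acknowledges the Stop request
    Ok(())
}

fn detach_multicast_members() {
    // members -> "Left", sled_id cleared, reconciler activated
}

fn stop_instance() -> Result<(), StopError> {
    // Ask sled-agent first: if this fails, the guest may still be
    // running and its M2P/forwarding state must stay intact.
    request_stop()?;

    // Only after the ack is multicast membership torn down.
    detach_multicast_members();
    Ok(())
}

fn main() {
    stop_instance().unwrap();
}
```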
+ if self.multicast_enabled() { + self.db_datastore + .multicast_group_members_detach_by_instance( + opctx, + InstanceUuid::from_untyped_uuid(authz_instance.id()), + ) + .await?; + self.background_tasks.task_multicast_reconciler.activate(); + } + self.db_datastore .instance_fetch_with_vmm(opctx, &authz_instance) .await diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 40c762b1c85..679a7b9f342 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -33,7 +33,6 @@ use nexus_types::deployment::PendingMgsUpdates; use nexus_types::deployment::ReconfiguratorConfigParam; use omicron_common::address::MGD_PORT; -use omicron_common::address::MGS_PORT; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Error; use omicron_uuid_kinds::OmicronZoneUuid; @@ -1246,6 +1245,15 @@ pub(crate) async fn dpd_clients( } }; + // Per-request bounds so a stalled DPD connection can't hang an RPW + // iteration or saga action indefinitely. Matches the timeout pair on + // the shared Nexus `reqwest_client`. + let reqwest_client = reqwest::ClientBuilder::new() + .connect_timeout(std::time::Duration::from_secs(15)) + .timeout(std::time::Duration::from_secs(15)) + .build() + .map_err(|e| format!("failed to build DPD reqwest client: {e}"))?; + let clients: Vec<(SocketAddrV6, dpd_client::Client)> = dpd_socketaddrs .iter() .map(|socket_addr| { @@ -1256,8 +1264,9 @@ pub(crate) async fn dpd_clients( )), }; - let client = dpd_client::Client::new( + let client = dpd_client::Client::new_with_client( &format!("http://{socket_addr}"), + reqwest_client.clone(), client_state, ); @@ -1325,29 +1334,28 @@ pub(crate) async fn lldpd_clients( Ok(clients) } -/// Look up Dendrite addresses in DNS then determine the switch location of -/// any addresses we're able to resolve the SwitchSlot for. If a switch -/// zone is down, the resolution process will fail and the entry will be -/// missing from the result. +#[derive(Clone, Debug)] +pub(crate) struct SwitchZoneTarget { + pub(crate) target: String, + pub(crate) addr: Ipv6Addr, +} + +/// Look up switch zones in DNS, then determine the switch location of any +/// zones we're able to resolve the `SwitchSlot` for. If a switch zone is down, +/// the resolution process will fail and the entry will be missing from the +/// result. /// /// # Errors -/// If we fail to resolve the ipv6 addresses of the Dendrite service we -/// return an error +/// If we fail to resolve the MGS SRV records for switch zones, return an error. async fn switch_zone_address_mappings( resolver: &internal_dns_resolver::Resolver, log: &slog::Logger, ) -> Result, String> { - let switch_zone_addresses = match resolver - .lookup_all_ipv6(ServiceName::Dendrite) - .await - { - Ok(addrs) => addrs, - Err(e) => { - error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e); - return Err(e.to_string()); - } - }; - Ok(map_switch_zone_addrs(&log, switch_zone_addresses, resolver).await) + Ok(switch_zone_targets(resolver, log) + .await? + .into_iter() + .map(|(slot, endpoint)| (slot, endpoint.addr)) + .collect()) } // TODO: #3596 Allow updating of Nexus from `handoff_to_nexus()` @@ -1359,40 +1367,52 @@ async fn switch_zone_address_mappings( // up switch addresses as a whole, since how DNS is currently setup for // Dendrite is insufficient for what we need. // -/// Query MGS in each switch zone to learn which switch slot is being managed by -/// the services located on a given ipv6 address. 
This information can be used -/// along with the well known port numbers to target a specific switch + service -/// combination. +/// Query MGS in each switch zone to learn which switch slot is managed by each +/// service target. /// /// We return whatever we're able to successfully resolve. In the event of -/// a communication timeout or other failure with MGS, the SwitchSlot -> Ipv6Addr -/// mapping will be missing from the returned HashMap. Callers will need to inspect +/// a communication timeout or other failure with MGS, the corresponding entry +/// will be missing from the returned `HashMap`. Callers will need to inspect /// the contents to ensure what they expect to be there is actually there. -async fn map_switch_zone_addrs( - log: &Logger, - switch_zone_addresses: Vec, +pub(crate) async fn switch_zone_targets( resolver: &internal_dns_resolver::Resolver, -) -> HashMap { + log: &Logger, +) -> Result, String> { use gateway_client::Client as MgsClient; + info!(log, "Determining switch slots managed by switch zones"); - let mut switch_zone_addrs = HashMap::new(); - - for addr in switch_zone_addresses { - let port = match resolver - .lookup_all_socket_v6(ServiceName::ManagementGatewayService) - .await - { - Ok(addrs) => { - let port_map: HashMap = addrs - .into_iter() - .map(|sockaddr| (*sockaddr.ip(), sockaddr.port())) - .collect(); - - *port_map.get(&addr).unwrap_or(&MGS_PORT) + let mgs_targets = match resolver + .lookup_srv(ServiceName::ManagementGatewayService) + .await + { + Ok(targets) => targets, + Err(e) => { + error!(log, "failed to resolve MGS service targets"; "error" => %e); + return Err(e.to_string()); + } + }; + + let mut switch_zone_targets = HashMap::new(); + + for (target, port) in mgs_targets { + let addr = match resolver.ipv6_lookup(&target).await { + Ok(Some(addr)) => addr, + Ok(None) => { + warn!( + log, + "MGS SRV target resolved without an IPv6 address"; + "target" => &target, + ); + continue; } Err(e) => { - error!(log, "failed to resolve MGS addresses"; "error" => %e); - MGS_PORT + warn!( + log, + "failed to resolve IPv6 address for MGS target"; + "target" => &target, + "error" => %e, + ); + continue; } }; @@ -1401,14 +1421,22 @@ async fn map_switch_zone_addrs( log.new(o!("component" => "MgsClient")), ); - info!(log, "determining switch slot managed by switch zone"; "zone_address" => #?addr); + info!( + log, + "determining switch slot managed by switch zone"; + "target" => &target, + "zone_address" => #?addr, + "mgs_port" => port, + ); let switch_slot = match mgs_client.sp_local_switch_id().await { Ok(switch) => { info!( log, "identified switch slot for switch zone"; "slot" => #?switch, - "zone_address" => #?addr + "target" => &target, + "zone_address" => #?addr, + "mgs_port" => port, ); switch.slot } @@ -1416,19 +1444,22 @@ async fn map_switch_zone_addrs( error!( log, "failed to identify switch slot for switch zone"; + "target" => &target, "zone_address" => #?addr, + "mgs_port" => port, "reason" => #?e ); continue; } }; + let endpoint = SwitchZoneTarget { target, addr }; match switch_slot { 0 => { - switch_zone_addrs.insert(SwitchSlot::Switch0, addr); + switch_zone_targets.insert(SwitchSlot::Switch0, endpoint); } 1 => { - switch_zone_addrs.insert(SwitchSlot::Switch1, addr); + switch_zone_targets.insert(SwitchSlot::Switch1, endpoint); } _ => { warn!( @@ -1442,10 +1473,10 @@ async fn map_switch_zone_addrs( info!( log, "completed mapping switch zones to switch slots"; - "mappings" => #?switch_zone_addrs + "mappings" => #?switch_zone_targets ); - 
switch_zone_addrs + Ok(switch_zone_targets) } /// Begin configuring an external HTTP client, returning a diff --git a/nexus/src/app/multicast/dataplane.rs b/nexus/src/app/multicast/dataplane.rs index 5d79df7d078..777f8f9f02e 100644 --- a/nexus/src/app/multicast/dataplane.rs +++ b/nexus/src/app/multicast/dataplane.rs @@ -40,6 +40,7 @@ use std::collections::HashMap; use std::net::IpAddr; +use std::time::Duration; use futures::future::try_join_all; use oxnet::MulticastMac; @@ -113,7 +114,8 @@ trait IntoUnderlayMulticast { impl IntoUnderlayMulticast for IpAddr { fn into_underlay_multicast(self) -> Result { match self { - IpAddr::V6(ipv6) => Ok(UnderlayMulticastIpv6(ipv6)), + IpAddr::V6(ipv6) => UnderlayMulticastIpv6::try_from(ipv6) + .map_err(|e| Error::invalid_request(e.to_string())), IpAddr::V4(_) => Err(Error::invalid_request( "underlay multicast groups must use IPv6 addresses", )), @@ -138,7 +140,7 @@ pub(crate) type MulticastDataplaneResult = Result; /// - Group-level uplink configuration (which front ports to use) /// - Uplink members with [`dpd_client::types::Direction::External`] added to /// underlay groups -/// - Integration with existing `switch_ports_with_uplinks()` for port discovery +/// - Integration with existing `switch_ports_with_uplinks` for port discovery pub(crate) struct MulticastDataplaneClient { dpd_clients: HashMap, log: Logger, @@ -153,6 +155,15 @@ pub(crate) struct GroupUpdateParams<'a> { pub source_filter: &'a SourceFilterState, } +/// Bound DPD client construction. On timeout (or DNS failure) we yield +/// an empty client map rather than failing the pass: group operations +/// skip with no switches, but DB-only member-state transitions +/// ("Joining" → "Left" when the instance is stopped) still proceed. +const DPD_CLIENT_BUILD_TIMEOUT: Duration = + // Caps the internal-DNS retry budget for `_dendrite._tcp` so a DPD + // outage doesn't starve the bg task's idle window. + Duration::from_secs(5); + impl MulticastDataplaneClient { /// Create a new client - builds fresh DPD clients for current switch /// topology. @@ -160,31 +171,72 @@ impl MulticastDataplaneClient { resolver: Resolver, log: Logger, ) -> MulticastDataplaneResult { - let dpd_clients = dpd_clients(&resolver, &log).await.map_err(|e| { - error!( - log, - "failed to build DPD clients"; - "error" => %e - ); - Error::internal_error("failed to build DPD clients") - })?; + let dpd_clients = match tokio::time::timeout( + DPD_CLIENT_BUILD_TIMEOUT, + dpd_clients(&resolver, &log), + ) + .await + { + Ok(Ok(clients)) => clients, + Ok(Err(e)) => { + error!( + log, + "failed to build DPD clients, continuing with empty \ + client map"; + "error" => %e, + ); + HashMap::new() + } + Err(_) => { + error!( + log, + "timed out building DPD clients, continuing with empty \ + client map"; + "timeout" => ?DPD_CLIENT_BUILD_TIMEOUT, + ); + HashMap::new() + } + }; Ok(Self { dpd_clients, log }) } - /// Select a single switch deterministically for read operations. + /// Iterate switches in deterministic (sorted by `SwitchSlot`) order. /// - /// Used when all switches should have identical state and we only need - /// to query one. Selects the first switch in sorted order by location - /// for consistency across invocations. - fn select_one_switch( + /// Used by read paths that need data from any one switch (since all + /// switches hold identical state for that read). 
Callers walk this
+    /// iterator and short-circuit on the first success, falling through
+    /// to subsequent switches on per-switch failure so a single
+    /// unhealthy switch doesn't fail the whole operation.
+    fn switches_in_order(
         &self,
-    ) -> MulticastDataplaneResult<(&SwitchSlot, &dpd_client::Client)> {
-        let mut switches: Vec<_> = self.dpd_clients.iter().collect();
-        switches.sort_by_key(|(loc, _)| *loc);
-        switches
-            .into_iter()
-            .next()
-            .ok_or_else(|| Error::internal_error("no DPD clients available"))
+    ) -> impl Iterator<Item = (&SwitchSlot, &dpd_client::Client)> {
+        let mut entries: Vec<_> = self.dpd_clients.iter().collect();
+        entries.sort_by_key(|(slot, _)| *slot);
+        entries.into_iter()
+    }
+
+    /// Compute DPD source filter from aggregated member source state.
+    ///
+    /// For SSM addresses, always returns specific sources. For ASM addresses,
+    /// returns `None` (any source) if any member omitted sources, otherwise
+    /// returns the union of all member sources.
+    fn compute_sources_for_dpd(
+        external_group_ip: IpAddr,
+        source_filter: &SourceFilterState,
+    ) -> Option<Vec<dpd_client::types::IpSrc>> {
+        if is_ssm_address(external_group_ip)
+            || !source_filter.has_any_source_member
+        {
+            Some(
+                source_filter
+                    .specific_sources
+                    .iter()
+                    .map(|ip| dpd_client::types::IpSrc::Exact(*ip))
+                    .collect(),
+            )
+        } else {
+            None
+        }
+    }
 
     async fn dpd_ensure_underlay_created(
@@ -413,33 +465,9 @@ impl MulticastDataplaneClient {
             inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() },
             vni: Vni::from(u32::from(external_group.vni.0)),
         };
-
         let external_group_ip = external_group.multicast_ip.ip();
-
-        // Source filtering per RFC 4607:
-        // - SSM (232/8, ff3x::/32): always use specific sources. API
-        //   validation prevents SSM joins without sources.
-        // - ASM: use specific sources when all members specify sources,
-        //   otherwise None to allow any source at the switch level.
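The helper above replaces the duplicated SSM/ASM branches removed in this hunk. A worked example of its decision table, using plain types as stand-ins for `SourceFilterState` and `IpSrc::Exact`:

```rust
use std::net::{IpAddr, Ipv4Addr};

// Plain stand-ins for SourceFilterState / IpSrc::Exact.
struct Filter {
    specific_sources: Vec<IpAddr>,
    has_any_source_member: bool,
}

/// Mirrors compute_sources_for_dpd: SSM always pins sources; ASM
/// returns None (any source) when any member omitted sources.
fn sources_for_dpd(is_ssm: bool, f: &Filter) -> Option<Vec<IpAddr>> {
    if is_ssm || !f.has_any_source_member {
        Some(f.specific_sources.clone())
    } else {
        None
    }
}

fn main() {
    let s = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1));
    let pinned =
        Filter { specific_sources: vec![s], has_any_source_member: false };
    let mixed =
        Filter { specific_sources: vec![s], has_any_source_member: true };

    // SSM: always specific, even with an any-source member recorded.
    assert!(sources_for_dpd(true, &mixed).is_some());
    // ASM, all members pinned sources: use the union.
    assert!(sources_for_dpd(false, &pinned).is_some());
    // ASM with an any-source member: open filter at the switch.
    assert!(sources_for_dpd(false, &mixed).is_none());
}
```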
- let sources_dpd = if is_ssm_address(external_group_ip) { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - } else if params.source_filter.has_any_source_member { - None - } else { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - }; + let sources_dpd = Self::compute_sources_for_dpd( + external_group_ip, + params.source_filter, + ); let update_operations = dpd_clients.into_iter().map(|(switch_slot, client)| { @@ -1028,9 +1032,13 @@ impl MulticastDataplaneClient { /// Detect and log cross-switch drift for multicast groups. /// - /// We logs errors if: + /// Detection-only. Logs errors when: /// - Group is present on some switches but missing on others (presence drift) /// - Group has different configurations across switches (config drift) + /// + /// Drift correction is handled separately by the active-group reconciler + /// (`groups.rs::reconcile_active_groups`), which re-pushes the + /// authoritative DB state to all switches on the next pass. fn log_drift_issues<'a>( &self, group_ip: IpAddr, @@ -1077,9 +1085,11 @@ impl MulticastDataplaneClient { /// Fetch external multicast group DPD state for RPW drift detection. /// /// Queries all switches to detect configuration drift. If any switch has - /// different state (missing group, different config), it will return the - /// found state, so the reconciler can initiate an UPDATE - /// saga that will fix all switches atomically. + /// different state (missing group, different config), returns the found + /// state so the reconciler can re-issue the dataplane operations on the + /// next pass and converge to the intended configuration. Drift repair + /// follows the RPW convergence model rather than an atomic cross-switch + /// saga, so callers should expect N-pass convergence on partial failure. pub(crate) async fn fetch_external_group_for_drift_check( &self, group_ip: IpAddr, @@ -1190,63 +1200,65 @@ impl MulticastDataplaneClient { dpd_client::types::BackplaneLink, >, > { - let (switch_slot, client) = self.select_one_switch()?; - - debug!( - self.log, - "fetching backplane map from DPD for topology validation"; - "switch" => ?switch_slot, - "query_scope" => "single_switch", - "dpd_operation" => "fetch_backplane_map" - ); - - match client.backplane_map().await { - Ok(response) => { - let backplane_map_raw = response.into_inner(); + let mut errors: Vec<(SwitchSlot, String)> = Vec::new(); + for (switch_slot, client) in self.switches_in_order() { + debug!( + self.log, + "fetching backplane map from DPD for topology validation"; + "switch" => ?switch_slot, + "dpd_operation" => "fetch_backplane_map" + ); - // Convert HashMap to BTreeMap - // DPD returns string keys like "rear0", "rear1" - parse them to PortId - let backplane_map: std::collections::BTreeMap<_, _> = backplane_map_raw - .into_iter() - .filter_map(|(port_str, link)| { - match dpd_client::types::PortId::try_from(port_str.as_str()) { - Ok(port_id) => Some((port_id, link)), - Err(e) => { - error!( - self.log, - "failed to parse port ID from backplane map"; - "port_str" => %port_str, - "error" => %e, - "dpd_operation" => "fetch_backplane_map" - ); - None + match client.backplane_map().await { + Ok(response) => { + let backplane_map_raw = response.into_inner(); + + // Convert HashMap to BTreeMap. + // DPD returns string keys like "rear0", "rear1"; parse them to PortId. 
+ let backplane_map: std::collections::BTreeMap<_, _> = backplane_map_raw + .into_iter() + .filter_map(|(port_str, link)| { + match dpd_client::types::PortId::try_from(port_str.as_str()) { + Ok(port_id) => Some((port_id, link)), + Err(e) => { + error!( + self.log, + "failed to parse port ID from backplane map"; + "port_str" => %port_str, + "error" => %e, + "dpd_operation" => "fetch_backplane_map" + ); + None + } } - } - }) - .collect(); + }) + .collect(); - debug!( - self.log, - "backplane map fetched from DPD"; - "switch" => ?switch_slot, - "port_count" => backplane_map.len(), - "dpd_operation" => "fetch_backplane_map" - ); - Ok(backplane_map) - } - Err(e) => { - error!( - self.log, - "backplane map fetch failed"; - "switch" => ?switch_slot, - "error" => %e, - "dpd_operation" => "fetch_backplane_map" - ); - Err(Error::internal_error(&format!( - "failed to fetch backplane map from DPD: {e}" - ))) + debug!( + self.log, + "backplane map fetched from DPD"; + "switch" => ?switch_slot, + "port_count" => backplane_map.len(), + "dpd_operation" => "fetch_backplane_map" + ); + return Ok(backplane_map); + } + Err(e) => { + warn!( + self.log, + "backplane map fetch failed on switch, trying next"; + "switch" => ?switch_slot, + "error" => %e, + "dpd_operation" => "fetch_backplane_map" + ); + errors.push((*switch_slot, format!("{e}"))); + } } } + + Err(Error::internal_error(&format!( + "failed to fetch backplane map from any switch: {errors:?}", + ))) } /// Fetch current underlay group members from a single switch. @@ -1261,60 +1273,63 @@ impl MulticastDataplaneClient { &self, underlay_ip: IpAddr, ) -> MulticastDataplaneResult>> { - let (switch_slot, client) = self.select_one_switch()?; - - debug!( - self.log, - "fetching underlay group members from DPD for drift detection"; - "underlay_ip" => %underlay_ip, - "switch" => ?switch_slot, - "dpd_operation" => "fetch_underlay_members" - ); + let mut errors: Vec<(SwitchSlot, String)> = Vec::new(); + for (switch_slot, client) in self.switches_in_order() { + debug!( + self.log, + "fetching underlay group members from DPD for drift detection"; + "underlay_ip" => %underlay_ip, + "switch" => ?switch_slot, + "dpd_operation" => "fetch_underlay_members" + ); - match client - .multicast_group_get_underlay( - &underlay_ip.into_underlay_multicast()?, - ) - .await - { - Ok(response) => { - let members = response.into_inner().members; - debug!( - self.log, - "underlay group members fetched from DPD"; - "underlay_ip" => %underlay_ip, - "switch" => ?switch_slot, - "member_count" => members.len(), - "dpd_operation" => "fetch_underlay_members" - ); - Ok(Some(members)) - } - Err(DpdError::ErrorResponse(resp)) - if resp.status() == reqwest::StatusCode::NOT_FOUND => + match client + .multicast_group_get_underlay( + &underlay_ip.into_underlay_multicast()?, + ) + .await { - debug!( - self.log, - "underlay group not found on switch"; - "underlay_ip" => %underlay_ip, - "switch" => ?switch_slot, - "dpd_operation" => "fetch_underlay_members" - ); - Ok(None) - } - Err(e) => { - error!( - self.log, - "underlay group fetch failed"; - "underlay_ip" => %underlay_ip, - "switch" => ?switch_slot, - "error" => %e, - "dpd_operation" => "fetch_underlay_members" - ); - Err(Error::internal_error(&format!( - "failed to fetch underlay group from DPD: {e}" - ))) + Ok(response) => { + let members = response.into_inner().members; + debug!( + self.log, + "underlay group members fetched from DPD"; + "underlay_ip" => %underlay_ip, + "switch" => ?switch_slot, + "member_count" => members.len(), + 
"dpd_operation" => "fetch_underlay_members" + ); + return Ok(Some(members)); + } + Err(DpdError::ErrorResponse(resp)) + if resp.status() == reqwest::StatusCode::NOT_FOUND => + { + debug!( + self.log, + "underlay group not found on switch"; + "underlay_ip" => %underlay_ip, + "switch" => ?switch_slot, + "dpd_operation" => "fetch_underlay_members" + ); + return Ok(None); + } + Err(e) => { + warn!( + self.log, + "underlay group fetch failed on switch, trying next"; + "underlay_ip" => %underlay_ip, + "switch" => ?switch_slot, + "error" => %e, + "dpd_operation" => "fetch_underlay_members" + ); + errors.push((*switch_slot, format!("{e}"))); + } } } + + Err(Error::internal_error(&format!( + "failed to fetch underlay group from any switch: {errors:?}", + ))) } pub(crate) async fn remove_groups( diff --git a/nexus/src/app/multicast/mod.rs b/nexus/src/app/multicast/mod.rs index 629d1253c89..4a49ec095b2 100644 --- a/nexus/src/app/multicast/mod.rs +++ b/nexus/src/app/multicast/mod.rs @@ -47,7 +47,7 @@ //! //! [`UNDERLAY_MULTICAST_SUBNET`]: omicron_common::address::UNDERLAY_MULTICAST_SUBNET -use std::net::IpAddr; +use std::net::{IpAddr, Ipv6Addr}; use std::sync::Arc; use ref_cast::RefCast; @@ -61,7 +61,7 @@ use nexus_db_queries::db::datastore::multicast::ExternalMulticastGroupWithSource use nexus_db_queries::{authz, db}; use nexus_types::external_api::multicast; use nexus_types::multicast::MulticastGroupCreate; -use omicron_common::address::is_ssm_address; +use omicron_common::address::{UNDERLAY_MULTICAST_SUBNET, is_ssm_address}; use omicron_common::api::external::{ self, CreateResult, DataPageParams, DeleteResult, IdentityMetadataCreateParams, ListResultVec, LookupResult, @@ -70,6 +70,8 @@ use omicron_common::api::external::{ use omicron_uuid_kinds::{GenericUuid, InstanceUuid, MulticastGroupUuid}; pub(crate) mod dataplane; +pub(crate) mod sled; +pub(crate) mod switch_zone; /// Validate that SSM addresses have source IPs. /// @@ -858,6 +860,76 @@ fn generate_group_name_from_ip( }) } +/// Maps an external multicast address to an underlay address in ff04::/64. +/// +/// Maps external addresses into [`UNDERLAY_MULTICAST_SUBNET`] (ff04::/64, +/// a subset of the admin-local scope ff04::/16 per RFC 7346) using XOR-fold. +/// This prefix is static for consistency across racks. +/// +/// See [RFC 7346] for IPv6 multicast admin-local scope. +/// +/// # Salt Parameter (Collision Avoidance) +/// +/// The `salt` enables collision avoidance via XOR perturbation. XOR is +/// bijective: distinct salts produce distinct outputs (since +/// `a ^ b = a ^ c` implies `b = c`), guaranteeing 256 unique addresses +/// per external IP. +/// +/// On collision (underlay IP already in use), the caller increments +/// salt and retries. The successful salt is stored with the group for +/// deterministic reconstruction. +/// +/// # Implementation +/// +/// ```text +/// underlay_ip = ff04:: | ((xor_fold(external_ip) ^ salt) & HOST_MASK) +/// ``` +/// +/// - IPv4: embedded directly (32 bits fits in 64-bit host space) +/// - IPv6: XOR upper and lower 64-bit halves to fold 128 to 64 bits +/// - Salt in [0, 255]: XORed into host bits for collision retry +/// +/// The `& HOST_MASK` guarantees the result stays within ff04::/64. 
+/// +/// [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346 +pub(crate) fn map_external_to_underlay_ip( + external_ip: IpAddr, + salt: u8, +) -> IpAddr { + const HOST_BITS: u32 = 128 - UNDERLAY_MULTICAST_SUBNET.width() as u32; + let prefix_base = + u128::from_be_bytes(UNDERLAY_MULTICAST_SUBNET.addr().octets()); + + map_external_to_underlay_ip_impl(prefix_base, HOST_BITS, external_ip, salt) +} + +/// Core implementation separated for testing with custom prefix/host_bits. +pub(crate) fn map_external_to_underlay_ip_impl( + prefix_base: u128, + host_bits: u32, + external_ip: IpAddr, + salt: u8, +) -> IpAddr { + let host_mask: u128 = + if host_bits >= 128 { u128::MAX } else { (1u128 << host_bits) - 1 }; + + let host_value: u128 = match external_ip { + IpAddr::V4(ipv4) => u128::from(u32::from_be_bytes(ipv4.octets())), + IpAddr::V6(ipv6) => { + let full = u128::from_be_bytes(ipv6.octets()); + if host_bits >= 128 { + full + } else { + (full >> host_bits) ^ (full & host_mask) + } + } + }; + + let salted = (host_value ^ u128::from(salt)) & host_mask; + let underlay = prefix_base | salted; + IpAddr::V6(Ipv6Addr::from(underlay.to_be_bytes())) +} + #[cfg(test)] mod tests { use super::*; @@ -887,4 +959,32 @@ mod tests { 0xff1e, 0, 0, 0, 0, 0, 0, 1 )))); } + + #[test] + fn test_generate_group_name_from_ip() { + let v4 = IpAddr::V4(Ipv4Addr::new(224, 1, 2, 3)); + assert_eq!( + generate_group_name_from_ip(v4).unwrap().as_str(), + "mcast-224-1-2-3" + ); + + let v4_zeros = IpAddr::V4(Ipv4Addr::new(224, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v4_zeros).unwrap().as_str(), + "mcast-224-0-0-1" + ); + + let v6: IpAddr = IpAddr::V6(Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v6).unwrap().as_str(), + "mcast-ff0e-0-0-0-0-0-0-1" + ); + + let v6_ssm: IpAddr = + IpAddr::V6(Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0xabcd)); + assert_eq!( + generate_group_name_from_ip(v6_ssm).unwrap().as_str(), + "mcast-ff3e-0-0-0-0-0-0-abcd" + ); + } } diff --git a/nexus/src/app/multicast/sled.rs b/nexus/src/app/multicast/sled.rs new file mode 100644 index 00000000000..df66c2d2bd3 --- /dev/null +++ b/nexus/src/app/multicast/sled.rs @@ -0,0 +1,595 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Sled-agent multicast operations for OPTE subscriptions, M2P mappings, +//! and forwarding entries. +//! +//! Parallel to [`dataplane`] which handles DPD switch operations, this +//! module manages sled-local multicast state via sled-agent: +//! +//! - **OPTE subscriptions**: Per-VMM multicast group filters on the +//! hosting sled +//! - **M2P mappings**: Overlay multicast IP to underlay IPv6 address +//! translation, installed on all sleds +//! - **Forwarding entries**: Underlay multicast address to switch next-hop, +//! installed on all sleds so OPTE forwards to the switch for replication +//! +//! 
[`dataplane`]: super::dataplane
+
+use std::collections::hash_map::DefaultHasher;
+use std::collections::{BTreeSet, HashMap};
+use std::hash::{Hash, Hasher};
+use std::net::{IpAddr, Ipv6Addr};
+use std::sync::Arc;
+
+use anyhow::Context;
+use futures::future::join_all;
+use omicron_common::api::external;
+use sled_agent_types::early_networking::SwitchSlot;
+use slog::{debug, info, warn};
+
+use nexus_db_model::{
+    MulticastGroup, MulticastGroupMember, MulticastGroupMemberState,
+};
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::deployment::SledFilter;
+use nexus_types::identity::{Asset, Resource};
+use omicron_common::api::external::DataPageParams;
+use omicron_uuid_kinds::{
+    GenericUuid, InstanceUuid, MulticastGroupUuid, SledUuid,
+};
+use sled_agent_client::types::{
+    ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, McastFilterMode,
+    McastForwardingEntry, McastForwardingNextHop, McastReplication,
+    McastSourceFilter,
+};
+
+/// Utility methods for sled-agent multicast operations used by the
+/// background task reconciler.
+///
+/// Groups sled-agent HTTP calls (OPTE subscriptions, M2P mappings,
+/// forwarding entries) behind a single type to keep the reconciler
+/// logic focused on state transitions rather than client construction.
+///
+/// Unlike [`MulticastDataplaneClient`] which pre-builds per-switch
+/// clients, sled clients are constructed on demand since the target
+/// sled set varies per group.
+///
+/// [`MulticastDataplaneClient`]: super::dataplane::MulticastDataplaneClient
+pub(crate) struct MulticastSledClient {
+    datastore: Arc<DataStore>,
+    resolver: internal_dns_resolver::Resolver,
+}
+
+impl MulticastSledClient {
+    pub(crate) fn new(
+        datastore: Arc<DataStore>,
+        resolver: internal_dns_resolver::Resolver,
+    ) -> Self {
+        Self { datastore, resolver }
+    }
+
+    /// Create a sled-agent client for the given sled.
+    ///
+    /// Looks up the sled's address in the database and constructs an HTTP
+    /// client. Follows the same pattern as V2P mapping propagation.
+    async fn sled_client(
+        &self,
+        opctx: &OpContext,
+        sled_id: SledUuid,
+    ) -> Result<Arc<sled_agent_client::Client>, external::Error>
+    {
+        nexus_networking::sled_client(
+            &self.datastore,
+            opctx,
+            sled_id,
+            &opctx.log,
+        )
+        .await
+    }
+
+    /// Build the membership descriptor sent to sled-agent for
+    /// subscribe/unsubscribe calls.
+    fn membership_for(
+        group: &MulticastGroup,
+        member: &MulticastGroupMember,
+    ) -> sled_agent_client::types::InstanceMulticastMembership {
+        sled_agent_client::types::InstanceMulticastMembership {
+            group_ip: group.multicast_ip.ip(),
+            sources: member.source_ips.iter().map(|s| s.ip()).collect(),
+        }
+    }
+
+    /// Subscribe an instance's active VMM OPTE port to a multicast group.
+    ///
+    /// Sled-agent resolves the active Propolis under its per-instance state
+    /// lock and configures OPTE port-level multicast filters. The member's
+    /// per-instance source IPs are passed for SSM filtering. If no active
+    /// VMM is registered the call is a no-op since the OPTE port is gone.
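+    ///
+    /// A minimal call sketch (illustrative only; `group`, `member`, and
+    /// `sled_id` stand for values the reconciler already holds):
+    ///
+    /// ```text
+    /// let sleds = MulticastSledClient::new(datastore, resolver);
+    /// sleds.subscribe_instance(&opctx, &group, &member, sled_id).await?;
+    /// ```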
+    pub(crate) async fn subscribe_instance(
+        &self,
+        opctx: &OpContext,
+        group: &MulticastGroup,
+        member: &MulticastGroupMember,
+        sled_id: SledUuid,
+    ) -> Result<(), anyhow::Error> {
+        let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id);
+
+        let client = self
+            .sled_client(opctx, sled_id)
+            .await
+            .context("failed to create sled-agent client")?;
+
+        let membership = Self::membership_for(group, member);
+
+        client
+            .instance_join_multicast_group(&instance_id, &membership)
+            .await
+            .context("sled-agent instance_join_multicast_group call failed")?;
+
+        debug!(
+            opctx.log,
+            "subscribed instance to multicast group via sled-agent";
+            "member_id" => %member.id,
+            "instance_id" => %instance_id,
+            "sled_id" => %sled_id,
+            "group_ip" => %group.multicast_ip
+        );
+
+        Ok(())
+    }
+
+    /// Unsubscribe an instance's active VMM OPTE port from a multicast group.
+    ///
+    /// Best-effort: if the VMM or sled is already gone, the unsubscribe is
+    /// effectively a no-op because the OPTE port was destroyed.
+    pub(crate) async fn unsubscribe_instance(
+        &self,
+        opctx: &OpContext,
+        group: &MulticastGroup,
+        member: &MulticastGroupMember,
+        sled_id: SledUuid,
+    ) -> Result<(), anyhow::Error> {
+        let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id);
+
+        let client = self
+            .sled_client(opctx, sled_id)
+            .await
+            .context("failed to create sled-agent client")?;
+
+        let membership = Self::membership_for(group, member);
+
+        client
+            .instance_leave_multicast_group(&instance_id, &membership)
+            .await
+            .context("sled-agent instance_leave_multicast_group call failed")?;
+
+        debug!(
+            opctx.log,
+            "unsubscribed instance from multicast group via sled-agent";
+            "member_id" => %member.id,
+            "instance_id" => %instance_id,
+            "sled_id" => %sled_id,
+            "group_ip" => %group.multicast_ip
+        );
+
+        Ok(())
+    }
+
+    /// Propagate M2P mappings and forwarding entries to all VPC-routing sleds.
+    ///
+    /// Performs convergent per-sled propagation: each sled's current state
+    /// is queried and diffed against desired state. New entries are added
+    /// and stale state is removed (member leaves, instance stops). When no
+    /// joined members remain, every sled has stale state and it is cleared.
+    ///
+    /// # Scope
+    ///
+    /// M2P mappings and forwarding entries are pushed to all VPC-routing
+    /// sleds, not just member sleds. Any instance on any sled may send to
+    /// a multicast group address; without the M2P mapping, OPTE's
+    /// overlay layer silently drops the packet. Forwarding entries point
+    /// each sled at a switch, which replicates to member ports via DPD
+    /// multicast group config. Subscriptions (per-port group membership) remain
+    /// member-sled-only.
+    pub(crate) async fn propagate_m2p_and_forwarding(
+        &self,
+        opctx: &OpContext,
+        group: &MulticastGroup,
+    ) -> Result<(), anyhow::Error> {
+        let underlay_ip = self
+            .resolve_underlay_ip(opctx, group)
+            .await
+            .with_context(|| {
+                format!(
+                    "failed to resolve underlay multicast address for group {}",
+                    group.id()
+                )
+            })?;
+
+        let group_ip = group.multicast_ip.ip();
+
+        // Compute desired state from DB, determining which sleds should have
+        // M2P and forwarding entries for this group.
+        let group_id = MulticastGroupUuid::from_untyped_uuid(group.id());
+        let members = self
+            .datastore
+            .multicast_group_members_list(
+                opctx,
+                group_id,
+                &DataPageParams::max_page(),
+            )
+            .await
+            .context("failed to list group members")?;
+
+        let member_sled_ids: BTreeSet<SledUuid> = members
+            .iter()
+            .filter(|m| m.state == MulticastGroupMemberState::Joined)
+            .filter_map(|m| m.sled_id.map(SledUuid::from))
+            .collect();
+
+        // Build desired M2P entry.
+        let desired_m2p =
+            Mcast2PhysMapping { group: group_ip, underlay: underlay_ip };
+
+        // The group is active if any members are "Joined". M2P and
+        // forwarding are pushed to all sleds when active, cleared
+        // from all sleds when inactive.
+        let group_is_active = !member_sled_ids.is_empty();
+
+        // Query all VPC-routing sleds for current state and converge.
+        let all_sleds = self
+            .datastore
+            .sled_list_all_batched(opctx, SledFilter::VpcRouting)
+            .await
+            .context("failed to enumerate sleds")?;
+
+        // Select one of the available switches as the forwarding next hop.
+        //
+        // OPTE treats each next hop as a duplication it performs itself, so
+        // pointing at individual member sleds would cause O(n) copies over
+        // cxgbe per sender.
+        //
+        // A single switch next hop means one copy to the switch, which
+        // replicates to member sled ports via DPD multicast group membership.
+        // ECMP over both switches is the more correct longer-term answer,
+        // but OPTE and mgd lack the tooling to express that today.
+        let switch_zone_addrs = crate::app::switch_zone_address_mappings(
+            &self.resolver,
+            &opctx.log,
+        )
+        .await
+        .map_err(|e| anyhow::anyhow!(e))
+        .context("failed to resolve switch zone addresses")?;
+
+        let switch_ip =
+            select_forwarding_switch_ip(group_id, &switch_zone_addrs)
+                .context("no switch zone found for forwarding next hop")?;
+
+        let convergence_params = GroupConvergenceParams {
+            group_ip,
+            underlay_ip,
+            group_is_active,
+            desired_m2p: &desired_m2p,
+            switch_ip,
+        };
+
+        // Fan out per-sled convergence so a 32-sled rack doesn't pay
+        // N-sequential RPC round-trips. Each sled's RPC is independent,
+        // so we accumulate per-sled failures rather than fail fast.
+        let convergence_params = &convergence_params;
+        let results = join_all(all_sleds.iter().map(|sled| async move {
+            let sled_id: SledUuid = sled.id();
+            let client = match self.sled_client(opctx, sled_id).await {
+                Ok(c) => c,
+                Err(e) => {
+                    warn!(
+                        opctx.log,
+                        "failed to create sled-agent client for \
+                         M2P/forwarding convergence";
+                        "sled_id" => %sled_id,
+                        "error" => %e
+                    );
+                    return Err(());
+                }
+            };
+            if let Err(e) =
+                converge_sled_m2p_and_forwarding(&client, convergence_params)
+                    .await
+            {
+                warn!(
+                    opctx.log,
+                    "failed to converge M2P/forwarding on sled";
+                    "sled_id" => %sled_id,
+                    "group_ip" => %group_ip,
+                    "error" => %e
+                );
+                return Err(());
+            }
+            Ok(())
+        }))
+        .await;
+
+        let failed_sleds = results.iter().filter(|r| r.is_err()).count();
+
+        info!(
+            opctx.log,
+            "converged M2P and forwarding state";
+            "group_id" => %group.id(),
+            "group_ip" => %group_ip,
+            "underlay_ip" => %underlay_ip,
+            "member_sleds" => member_sled_ids.len(),
+            "total_sleds_checked" => all_sleds.len(),
+            "failed_sleds" => failed_sleds
+        );
+
+        if failed_sleds > 0 {
+            anyhow::bail!(
+                "failed to converge M2P/forwarding: \
+                 {failed_sleds} sled convergence failures \
+                 (out of {} sleds)",
+                all_sleds.len()
+            );
+        }
+
+        Ok(())
+    }
+
+    async fn resolve_underlay_ip(
+        &self,
+        opctx: &OpContext,
+        group: &MulticastGroup,
+    ) -> Result<Ipv6Addr, anyhow::Error> {
+        let underlay_group_id = group
+            .underlay_group_id
+            .context("group missing underlay_group_id")?;
+
+        match self
+            .datastore
+            .underlay_multicast_group_fetch(opctx, underlay_group_id)
+            .await
+        {
+            Ok(underlay_group) => match underlay_group.multicast_ip.ip() {
+                IpAddr::V6(v6) => Ok(v6),
+                other => anyhow::bail!(
+                    "underlay multicast address for group {} is {other}, \
+                     expected IPv6",
+                    group.id()
+                ),
+            },
+            Err(external::Error::ObjectNotFound { .. }) => {
+                let salt = group.underlay_salt.map_or(0, |s| *s);
+                match super::map_external_to_underlay_ip(
+                    group.multicast_ip.ip(),
+                    salt,
+                ) {
+                    IpAddr::V6(v6) => Ok(v6),
+                    IpAddr::V4(_) => anyhow::bail!(
+                        "computed IPv4 underlay address for group {}",
+                        group.id()
+                    ),
+                }
+            }
+            Err(e) => Err(e).context("failed to fetch underlay group"),
+        }
+    }
+
+    /// Clear M2P mappings and forwarding entries from all sleds for
+    /// this group.
+    ///
+    /// Delegates to the convergent [`propagate_m2p_and_forwarding`] which
+    /// will detect that no joined members remain and clear stale state
+    /// from all sleds.
+    ///
+    /// [`propagate_m2p_and_forwarding`]: Self::propagate_m2p_and_forwarding
+    pub(crate) async fn clear_m2p_and_forwarding(
+        &self,
+        opctx: &OpContext,
+        group: &MulticastGroup,
+    ) -> Result<(), anyhow::Error> {
+        self.propagate_m2p_and_forwarding(opctx, group).await
+    }
+}
+
+/// Resolved group state used to converge M2P and forwarding on each sled.
+struct GroupConvergenceParams<'a> {
+    group_ip: IpAddr,
+    underlay_ip: Ipv6Addr,
+    group_is_active: bool,
+    desired_m2p: &'a Mcast2PhysMapping,
+    /// Switch zone underlay IP chosen as the forwarding next hop.
+    /// The switch replicates to member sled ports via DPD config.
+    switch_ip: Ipv6Addr,
+}
+
+/// Per-sled convergence of M2P and forwarding state.
+///
+/// # Errors
+///
+/// Returns an error when any sled-agent RPC fails (list, set, or clear).
+/// The caller increments `failed_sleds` and continues to the next sled.
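+///
+/// The M2P half follows this decision table (restating the match in
+/// `converge_m2p` below):
+///
+/// ```text
+/// (group_is_active, has_m2p)        action
+/// (true,  false)                    set_mcast_m2p   (install)
+/// (false, true)                     clear_mcast_m2p (remove stale)
+/// (true,  true) / (false, false)    no-op, already converged
+/// ```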
+async fn converge_sled_m2p_and_forwarding( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + converge_m2p(client, params).await?; + converge_forwarding(client, params).await?; + Ok(()) +} + +/// Converge a single sled's M2P mapping for one group. +/// +/// Sets the mapping when the group is active and missing, clears it +/// when the group is inactive and present. Already-correct state +/// is left alone. +async fn converge_m2p( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_m2p() + .await + .context("failed to list M2P mappings on sled")? + .into_inner(); + + let has_m2p = found.iter().any(|m| { + m.group == params.group_ip && m.underlay == params.underlay_ip + }); + + match (params.group_is_active, has_m2p) { + // Active group missing M2P: install it. + (true, false) => { + client + .set_mcast_m2p(params.desired_m2p) + .await + .context("failed to add M2P mapping to sled")?; + } + // Inactive group has stale M2P: remove it. + (false, true) => { + let clear = ClearMcast2Phys { + group: params.group_ip, + underlay: params.underlay_ip, + }; + client + .clear_mcast_m2p(&clear) + .await + .context("failed to clear stale M2P from sled")?; + } + // Already converged. + _ => {} + } + + Ok(()) +} + +/// Converge a single sled's forwarding entries for one group. +/// +/// When the group is active, this sets a single next hop to the switch +/// zone. The switch replicates to member sled ports via its DPD +/// multicast group membership. When inactive, this clears any stale +/// entries. +async fn converge_forwarding( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_fwd() + .await + .context("failed to list forwarding on sled")? + .into_inner(); + + let current_entry = found.iter().find(|f| f.underlay == params.underlay_ip); + + if !params.group_is_active { + if current_entry.is_some() { + let clear = ClearMcastForwarding { underlay: params.underlay_ip }; + client + .clear_mcast_fwd(&clear) + .await + .context("failed to clear stale forwarding from sled")?; + } + return Ok(()); + } + + let desired_next_hops = vec![McastForwardingNextHop { + next_hop: params.switch_ip, + replication: McastReplication::Underlay, + filter: McastSourceFilter { + mode: McastFilterMode::Exclude, + sources: Vec::new(), + }, + }]; + + let needs_update = match current_entry { + Some(f) => f.next_hops != desired_next_hops, + None => true, + }; + + if needs_update { + // OPTE's set_mcast_fwd handler is additive: it inserts next + // hops but never removes stale ones. Clear first so the + // subsequent set produces an exact replacement. 
+            if current_entry.is_some() {
+                let clear = ClearMcastForwarding { underlay: params.underlay_ip };
+                client
+                    .clear_mcast_fwd(&clear)
+                    .await
+                    .context("failed to clear forwarding before update")?;
+            }
+            let desired_fwd = McastForwardingEntry {
+                underlay: params.underlay_ip,
+                next_hops: desired_next_hops,
+            };
+            client
+                .set_mcast_fwd(&desired_fwd)
+                .await
+                .context("failed to set forwarding on sled")?;
+    }
+
+    Ok(())
+}
+
+fn select_forwarding_switch_ip(
+    group_id: MulticastGroupUuid,
+    switch_zone_addrs: &HashMap<SwitchSlot, Ipv6Addr>,
+) -> Option<Ipv6Addr> {
+    let mut ordered_switches: Vec<_> = switch_zone_addrs.iter().collect();
+    ordered_switches.sort_by_key(|(slot, _)| **slot);
+
+    if ordered_switches.is_empty() {
+        return None;
+    }
+
+    // Hash the group UUID to distribute switch selection across both
+    // switches. Ordering by slot keeps the selection stable across
+    // reconciliation passes and Nexus instances.
+    let mut hasher = DefaultHasher::new();
+    group_id.hash(&mut hasher);
+    let idx = (hasher.finish() as usize) % ordered_switches.len();
+    Some(*ordered_switches[idx].1)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::select_forwarding_switch_ip;
+
+    use std::collections::HashMap;
+    use std::net::Ipv6Addr;
+
+    use omicron_uuid_kinds::{GenericUuid, MulticastGroupUuid};
+    use sled_agent_types::early_networking::SwitchSlot;
+    use uuid::Uuid;
+
+    #[test]
+    fn select_forwarding_switch_ip_returns_none_when_empty() {
+        let group_id = MulticastGroupUuid::from_untyped_uuid(Uuid::new_v4());
+        let switch_zone_addrs = HashMap::new();
+
+        assert_eq!(
+            select_forwarding_switch_ip(group_id, &switch_zone_addrs),
+            None
+        );
+    }
+
+    #[test]
+    fn select_forwarding_switch_ip_is_stable_across_map_order() {
+        let group_id = MulticastGroupUuid::from_untyped_uuid(Uuid::new_v4());
+        let switch0 = Ipv6Addr::LOCALHOST;
+        let switch1 = Ipv6Addr::new(0xfd00, 0, 0, 0, 0, 0, 0, 2);
+
+        let mut first = HashMap::new();
+        first.insert(SwitchSlot::Switch0, switch0);
+        first.insert(SwitchSlot::Switch1, switch1);
+
+        let mut second = HashMap::new();
+        second.insert(SwitchSlot::Switch1, switch1);
+        second.insert(SwitchSlot::Switch0, switch0);
+
+        assert_eq!(
+            select_forwarding_switch_ip(group_id, &first),
+            select_forwarding_switch_ip(group_id, &second)
+        );
+    }
+}
diff --git a/nexus/src/app/multicast/switch_zone.rs b/nexus/src/app/multicast/switch_zone.rs
new file mode 100644
index 00000000000..15d65811212
--- /dev/null
+++ b/nexus/src/app/multicast/switch_zone.rs
@@ -0,0 +1,421 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Switch zone service clients for multicast operations.
+//!
+//! Wraps MGD (for MRIB programming) and DDM (for peer topology)
+//! on the switch zone. Built per reconciliation pass.
+//!
+//! - **MRIB**: Nexus → MGD MRIB → mg-lower → DDM → peer sleds
+//!
+//! - **Peers**: DDM peer info provides live sled-to-port mapping
+
+use std::collections::HashMap;
+use std::net::{IpAddr, Ipv6Addr, SocketAddrV6};
+use std::time::Duration;
+
+use anyhow::anyhow;
+use futures::future::try_join_all;
+use internal_dns_resolver::Resolver;
+use sled_agent_types::early_networking::SwitchSlot;
+use slog::{Logger, debug, warn};
+
+use internal_dns_types::names::ServiceName;
+use mg_admin_client::types::{
+    MribAddStaticRequest, MribDeleteStaticRequest, MulticastRouteKey,
+    MulticastRouteKeyV4, MulticastRouteKeyV6, StaticMulticastRouteInput,
+};
+use omicron_common::address::{DDMD_PORT, MGD_PORT};
+use omicron_ddm_admin_client::types::PeerInfo;
+
+use crate::app::switch_zone_targets;
+
+/// Client for switch zone services used by the multicast reconciler.
+///
+/// Provides access to MGD (MRIB route programming) and DDM (peer
+/// topology for sled-to-port liveness).
+///
+/// Built per reconciliation pass, similar to [`MulticastDataplaneClient`].
+///
+/// Note: per [omicron#10167], system-level networking (uplinkd, system-zone
+/// NAT, BGP, BFD) is migrating from Nexus RPWs to sled-agent reconcilers
+/// that operate based on data in the bootstore. Multicast is
+/// **instance networking** (group state derives from per-instance memberships),
+/// so this client's direct-to-MGD path is intentional and should be preserved
+/// by the migration.
+///
+/// If a future iteration of that migration takes over MRIB writes, the
+/// reconciler logic stays in Nexus and only the wire surface changes
+/// (Nexus calls a sled-agent endpoint that fronts MGD).
+///
+/// [`MulticastDataplaneClient`]: super::dataplane::MulticastDataplaneClient
+/// [omicron#10167]: https://github.com/oxidecomputer/omicron/issues/10167
+pub(crate) struct MulticastSwitchZoneClient {
+    mgd_clients: HashMap<SwitchSlot, mg_admin_client::Client>,
+    ddm_clients: HashMap<SwitchSlot, omicron_ddm_admin_client::Client>,
+    log: Logger,
+}
+
+pub(crate) type MribRouteIndex =
+    HashMap<IpAddr, HashMap<Option<IpAddr>, HashMap<SwitchSlot, Ipv6Addr>>>;
+
+// Mirrors `MulticastDataplaneClient::new`'s timeout.
+const SWITCH_ZONE_BUILD_TIMEOUT: Duration = Duration::from_secs(5);
+
+impl MulticastSwitchZoneClient {
+    /// Build MGD and DDM clients for all switch zones.
+    ///
+    /// Resolves service ports from DNS rather than hardcoding them,
+    /// falling back to the well-known port constants when DNS lookup
+    /// fails. This allows the test harness to run MGD and DDM on
+    /// dynamic ports.
+    ///
+    /// Returns an error when no switch zones resolve, so the reconciler
+    /// retries rather than silently treating writes as no-ops.
+    pub(crate) async fn new(
+        resolver: Resolver,
+        log: Logger,
+    ) -> Result<Self, String> {
+        match tokio::time::timeout(
+            SWITCH_ZONE_BUILD_TIMEOUT,
+            Self::build(resolver, log.clone()),
+        )
+        .await
+        {
+            Ok(result) => result,
+            Err(_) => Err(format!(
+                "timed out building switch-zone clients after \
+                 {SWITCH_ZONE_BUILD_TIMEOUT:?}"
+            )),
+        }
+    }
+
+    async fn build(resolver: Resolver, log: Logger) -> Result<Self, String> {
+        let switch_zones = switch_zone_targets(&resolver, &log).await?;
+
+        if switch_zones.is_empty() {
+            return Err(
+                "no switch zones resolved for multicast operations".to_string()
+            );
+        }
+
+        // Resolve MGD and DDM sockets from DNS, keyed by SRV target. This
+        // preserves distinct switch zones that share an IPv6 address in tests
+        // and differ only by port.
+        let mgd_socket_map =
+            resolve_service_sockets(&resolver, &log, ServiceName::Mgd).await;
+        let ddm_socket_map =
+            resolve_service_sockets(&resolver, &log, ServiceName::Ddm).await;
+
+        let mgd_clients = switch_zones
+            .iter()
+            .map(|(slot, endpoint)| {
+                let socketaddr = mgd_socket_map
+                    .get(&endpoint.target)
+                    .copied()
+                    .unwrap_or_else(|| {
+                        SocketAddrV6::new(endpoint.addr, MGD_PORT, 0, 0)
+                    });
+                (
+                    *slot,
+                    mg_admin_client::Client::new(
+                        &format!("http://{socketaddr}"),
+                        log.clone(),
+                    ),
+                )
+            })
+            .collect();
+
+        let ddm_clients = switch_zones
+            .iter()
+            .filter_map(|(slot, endpoint)| {
+                let socketaddr = ddm_socket_map
+                    .get(&endpoint.target)
+                    .copied()
+                    .unwrap_or_else(|| {
+                        SocketAddrV6::new(endpoint.addr, DDMD_PORT, 0, 0)
+                    });
+                match omicron_ddm_admin_client::Client::new(&log, socketaddr) {
+                    Ok(c) => Some((*slot, c)),
+                    Err(e) => {
+                        warn!(
+                            log,
+                            "failed to build DDM client for switch zone";
+                            "switch" => ?slot,
+                            "error" => %e,
+                        );
+                        None
+                    }
+                }
+            })
+            .collect();
+
+        Ok(Self { mgd_clients, ddm_clients, log })
+    }
+
+    /// Add a multicast route to the MRIB on all switches in parallel.
+    ///
+    /// `mg-lower` watches the MRIB and automatically advertises the
+    /// route via DDM to peer sleds. Short-circuits on the first switch
+    /// failure as the reconciler retries the full set on the next pass.
+    pub(crate) async fn add_route(
+        &self,
+        group_ip: IpAddr,
+        underlay_ip: Ipv6Addr,
+        source: Option<IpAddr>,
+    ) -> Result<(), anyhow::Error> {
+        let route_key = make_route_key(group_ip, source);
+
+        let request = MribAddStaticRequest {
+            routes: vec![StaticMulticastRouteInput {
+                key: route_key,
+                underlay_group: underlay_ip,
+            }],
+        };
+
+        try_join_all(self.mgd_clients.iter().map(|(slot, client)| {
+            let request = &request;
+            async move {
+                client.static_add_mcast_route(request).await.map_err(|e| {
+                    warn!(
+                        self.log,
+                        "mgd static_add_mcast_route failed";
+                        "switch" => ?slot,
+                        "group_ip" => %group_ip,
+                        "error" => %e,
+                    );
+                    anyhow!(
+                        "mgd static_add_mcast_route failed on switch {slot:?}: {e}"
+                    )
+                })?;
+                debug!(
+                    self.log,
+                    "added multicast route to MRIB";
+                    "switch" => ?slot,
+                    "group_ip" => %group_ip,
+                    "underlay_ip" => %underlay_ip,
+                );
+                Ok::<(), anyhow::Error>(())
+            }
+        }))
+        .await?;
+        Ok(())
+    }
+
+    /// Remove a multicast route from the MRIB on all switches in parallel.
+    ///
+    /// `mg-lower` detects the removal and withdraws the DDM
+    /// advertisement from peer sleds. Short-circuits on the first
+    /// switch failure as the reconciler retries on the next pass.
+    pub(crate) async fn remove_route(
+        &self,
+        group_ip: IpAddr,
+        source: Option<IpAddr>,
+    ) -> Result<(), anyhow::Error> {
+        let route_key = make_route_key(group_ip, source);
+
+        let request = MribDeleteStaticRequest { keys: vec![route_key] };
+
+        try_join_all(self.mgd_clients.iter().map(|(slot, client)| {
+            let request = &request;
+            async move {
+                client.static_remove_mcast_route(request).await.map_err(
+                    |e| {
+                        warn!(
+                            self.log,
+                            "mgd static_remove_mcast_route failed";
+                            "switch" => ?slot,
+                            "group_ip" => %group_ip,
+                            "error" => %e,
+                        );
+                        anyhow!(
+                            "mgd static_remove_mcast_route failed on switch {slot:?}: {e}"
+                        )
+                    },
+                )?;
+                debug!(
+                    self.log,
+                    "removed multicast route from MRIB";
+                    "switch" => ?slot,
+                    "group_ip" => %group_ip,
+                );
+                Ok::<(), anyhow::Error>(())
+            }
+        }))
+        .await?;
+        Ok(())
+    }
+
+    /// List static multicast routes from all reachable switches and
+    /// index them by group/source/switch.
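+    ///
+    /// The returned index nests as (shape sketch of `MribRouteIndex`):
+    ///
+    /// ```text
+    /// group IP -> source (None = ASM) -> switch slot -> underlay group
+    /// ```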
+    pub(crate) async fn list_routes_indexed(
+        &self,
+    ) -> Result<MribRouteIndex, anyhow::Error> {
+        let mut index = MribRouteIndex::new();
+
+        for (slot, client) in &self.mgd_clients {
+            match client.static_list_mcast_routes().await {
+                Ok(routes) => {
+                    for route in routes.into_inner() {
+                        let (group_ip, source) = route_identifier(&route.key);
+                        index
+                            .entry(group_ip)
+                            .or_default()
+                            .entry(source)
+                            .or_default()
+                            .insert(*slot, route.underlay_group);
+                    }
+                }
+                Err(e) => {
+                    warn!(
+                        self.log,
+                        "failed to list multicast routes from switch zone";
+                        "switch" => ?slot,
+                        "error" => %e,
+                    );
+                }
+            }
+        }
+
+        Ok(index)
+    }
+
+    pub(crate) fn switch_count(&self) -> usize {
+        self.mgd_clients.len()
+    }
+
+    /// Whether a multicast route is present in `mrib_loc` (RPF-verified)
+    /// on every configured switch.
+    ///
+    /// Returns `false` when the route is missing on any switch, including
+    /// switches that fail the RPC. The reconciler interprets `false` as
+    /// not-yet-forwarding (still in `mrib_in`, de-promoted by the RPF
+    /// revalidator, or simply unreachable) and retries on the next pass.
+    pub(crate) async fn route_active_on_all_switches(
+        &self,
+        group_ip: IpAddr,
+        source: Option<IpAddr>,
+    ) -> bool {
+        let vni = mg_admin_client::types::Vni(u32::from(
+            omicron_common::api::external::Vni::DEFAULT_MULTICAST_VNI,
+        ));
+
+        for (slot, client) in &self.mgd_clients {
+            match client
+                .get_mrib_selected(
+                    None,
+                    Some(&group_ip),
+                    None,
+                    source.as_ref(),
+                    Some(&vni),
+                )
+                .await
+            {
+                Ok(resp) => {
+                    if resp.into_inner().is_empty() {
+                        return false;
+                    }
+                }
+                Err(e) => {
+                    warn!(
+                        self.log,
+                        "mgd get_mrib_selected failed";
+                        "switch" => ?slot,
+                        "group_ip" => %group_ip,
+                        "error" => %e,
+                    );
+                    return false;
+                }
+            }
+        }
+
+        true
+    }
+
+    /// Query DDM peers from all switch zones.
+    ///
+    /// Returns all peers from both switches. A sled connected to both
+    /// switches appears twice with different `if_name` (interface name) values,
+    /// one per switch port.
+    pub(crate) async fn get_ddm_peers(
+        &self,
+    ) -> Result<Vec<PeerInfo>, anyhow::Error> {
+        let mut all_peers = Vec::new();
+
+        for (slot, client) in &self.ddm_clients {
+            match client.get_peers().await {
+                Ok(peers) => {
+                    all_peers.extend(peers.into_values());
+                }
+                Err(e) => {
+                    warn!(
+                        self.log,
+                        "failed to get DDM peers from switch zone";
+                        "switch" => ?slot,
+                        "error" => %e,
+                    );
+                }
+            }
+        }
+
+        Ok(all_peers)
+    }
+}
+
+fn make_route_key(
+    group_ip: IpAddr,
+    source: Option<IpAddr>,
+) -> MulticastRouteKey {
+    let vni = mg_admin_client::types::Vni(u32::from(
+        omicron_common::api::external::Vni::DEFAULT_MULTICAST_VNI,
+    ));
+    match group_ip {
+        IpAddr::V4(v4) => MulticastRouteKey::V4(MulticastRouteKeyV4 {
+            group: v4,
+            source: source.and_then(|s| match s {
+                IpAddr::V4(s4) => Some(s4),
+                _ => None,
+            }),
+            vni,
+        }),
+        IpAddr::V6(v6) => MulticastRouteKey::V6(MulticastRouteKeyV6 {
+            group: v6,
+            source: source.and_then(|s| match s {
+                IpAddr::V6(s6) => Some(s6),
+                _ => None,
+            }),
+            vni,
+        }),
+    }
+}
+
+/// Resolve service sockets from DNS, returning a map of SRV target to socket.
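+///
+/// Fallback sketch: when a switch zone's SRV target is missing from the
+/// map, callers construct the socket from the zone address and the
+/// well-known port themselves, as in `build` above:
+///
+/// ```text
+/// mgd_socket_map.get(&endpoint.target).copied().unwrap_or_else(|| {
+///     SocketAddrV6::new(endpoint.addr, MGD_PORT, 0, 0)
+/// })
+/// ```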
+async fn resolve_service_sockets(
+    resolver: &Resolver,
+    log: &Logger,
+    service: ServiceName,
+) -> HashMap<String, SocketAddrV6> {
+    match resolver.lookup_all_socket_v6_by_target(service).await {
+        Ok(pairs) => pairs.into_iter().collect(),
+        Err(e) => {
+            warn!(
+                log,
+                "failed to resolve service sockets from DNS, using defaults";
+                "service" => ?service,
+                "error" => %e,
+            );
+            HashMap::new()
+        }
+    }
+}
+
+fn route_identifier(key: &MulticastRouteKey) -> (IpAddr, Option<IpAddr>) {
+    match key {
+        MulticastRouteKey::V4(k) => {
+            (IpAddr::V4(k.group), k.source.map(IpAddr::V4))
+        }
+        MulticastRouteKey::V6(k) => {
+            (IpAddr::V6(k.group), k.source.map(IpAddr::V6))
+        }
+    }
+}
diff --git a/nexus/src/app/sagas/multicast_group_dpd_ensure.rs b/nexus/src/app/sagas/multicast_group_dpd_ensure.rs
index 17c1fc2b3a1..48c34e1b95d 100644
--- a/nexus/src/app/sagas/multicast_group_dpd_ensure.rs
+++ b/nexus/src/app/sagas/multicast_group_dpd_ensure.rs
@@ -150,19 +150,21 @@ async fn mgde_fetch_group_data(
         .await
         .map_err(saga_action_failed)?;

-    // Validate groups are in correct state
+    // "Active" is allowed for crash recovery. Rejecting would tear
+    // down correctly-applied DPD state.
     match external_group.state {
-        nexus_db_model::MulticastGroupState::Creating => {}
+        nexus_db_model::MulticastGroupState::Creating
+        | nexus_db_model::MulticastGroupState::Active => {}
         other_state => {
             warn!(
                 osagactx.log(),
-                "external group not in 'Creating' state for DPD";
+                "external group not in 'Creating' or 'Active' state for DPD";
                 "external_group_id" => %params.external_group_id,
                 "external_group_name" => external_group.name().as_str(),
                 "current_state" => ?other_state
             );
             return Err(saga_action_failed(Error::internal_error(&format!(
-                "External group {} is in state {other_state:?}, expected 'Creating'",
+                "External group {} is in state {other_state:?}, expected 'Creating' or 'Active'",
                 params.external_group_id
             ))));
         }
@@ -454,12 +456,16 @@ mod test {
         );
     }

-    /// Test that the saga rejects external groups that are not in "Creating" state.
+    /// Test that the saga accepts "Active" groups (idempotent crash recovery)
+    /// but still rejects groups that are no longer in flight.
     ///
-    /// The saga validates that external groups are in "Creating" state before applying
-    /// DPD configuration. This test verifies that validation works correctly.
+    /// `mgde_fetch_group_data` allows "Creating" and "Active". Re-running the
+    /// saga over a group whose `mgde_update_group_state` already committed
+    /// must succeed through the original DAG so recovery does not roll back
+    /// correctly-applied DPD state. Other states (e.g., "Deleted") are still
+    /// out of scope and must be rejected.
     #[nexus_test(server = crate::Server)]
-    async fn test_saga_rejects_non_creating_state(
+    async fn test_saga_accepts_active_rejects_terminal_state(
         cptestctx: &ControlPlaneTestContext,
     ) {
         let client = &cptestctx.external_client;
@@ -539,19 +545,44 @@ mod test {
             .await
             .expect("Group should transition to Active state");

-        // Try to run saga on Active group - should fail
+        // Re-running the saga on an "Active" group simulates crash-recovery
+        // re-execution: every action is idempotent, so the saga must succeed
+        // through the original DAG rather than triggering rollback that would
+        // tear down correctly-applied DPD state.
        let params = Params {
            serialized_authn: Serialized::for_opctx(&opctx),
            external_group_id: external_group.id(),
            underlay_group_id: underlay_group.id,
        };
+        nexus
+            .sagas
+            .saga_execute::<SagaMulticastGroupDpdEnsure>(params)
+            .await
+            .expect("Saga should re-run idempotently against an Active group");
+
+        // Transition the group to "Deleting" and re-run the saga. The saga
+        // must refuse to run against a group that is no longer in "Creating"
+        // or "Active".
+        let marked = datastore
+            .mark_multicast_group_for_removal_if_no_members(&opctx, group_id)
+            .await
+            .expect("group should mark for removal");
+        assert!(marked, "group should transition to Deleting");
+
+        let params = Params {
+            serialized_authn: Serialized::for_opctx(&opctx),
+            external_group_id: external_group.id(),
+            underlay_group_id: underlay_group.id,
+        };
         let result = nexus
             .sagas
             .saga_execute::<SagaMulticastGroupDpdEnsure>(params)
             .await;
-
-        // Saga should reject Active group
-        assert!(result.is_err(), "Saga should reject group in Active state");
+        assert!(
+            result.is_err(),
+            "Saga should reject group that is no longer in 'Creating' or \
+             'Active'",
+        );
     }
 }
diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs
index 1d0925527a2..596797e9534 100644
--- a/nexus/test-utils/src/lib.rs
+++ b/nexus/test-utils/src/lib.rs
@@ -9,6 +9,7 @@ use omicron_common::api::external::IdentityMetadata;
 use omicron_sled_agent::sim;
 use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition};
 use omicron_uuid_kinds::GenericUuid;
+use std::collections::BTreeMap;
 use std::fmt::Debug;
 use std::net::Ipv6Addr;
 use std::time::Duration;
@@ -20,6 +21,7 @@ pub use sim::TEST_RESERVOIR_RAM;
 pub mod background;
 pub mod db;
 pub mod http_testing;
+pub mod multicast;
 mod nexus_test;
 pub mod resource_helpers;
 pub mod sql;
@@ -117,21 +119,36 @@ async fn wait_for_producer_impl(
         .expect("Failed to find producer within time limit");
 }

-/// Build a DPD client for test validation using the first running dendrite instance
+/// Build a DPD client for `Switch0` in the test fixture.
+///
+/// Deterministic by default. Tests that need to validate state on every
+/// switch in a multi-switch fixture should use [`dpd_clients_by_switch`]
+/// instead and iterate, since each switch independently programs its own
+/// underlay group / NAT / forwarding state.
 pub fn dpd_client<N>(
     cptestctx: &ControlPlaneTestContext<N>,
 ) -> dpd_client::Client {
-    // Get the first available dendrite instance and extract the values we need
-    let dendrite_guard = cptestctx.dendrite.read().unwrap();
-    let (switch_slot, dendrite_instance) = dendrite_guard
-        .iter()
-        .next()
-        .expect("No dendrite instances running for test");
+    use sled_agent_types::early_networking::SwitchSlot;
+    dpd_client_for(cptestctx, SwitchSlot::Switch0)
+}

-    // Copy the values we need while the guard is still alive
-    let switch_slot = *switch_slot;
-    let port = dendrite_instance.port;
-    drop(dendrite_guard);
+/// Build a DPD client targeting a specific switch slot.
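+///
+/// A minimal usage sketch (assuming the standard two-switch fixture):
+///
+/// ```text
+/// let dpd0 = dpd_client_for(cptestctx, SwitchSlot::Switch0);
+/// let dpd1 = dpd_client_for(cptestctx, SwitchSlot::Switch1);
+/// ```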
+pub fn dpd_client_for<N>(
+    cptestctx: &ControlPlaneTestContext<N>,
+    switch_slot: sled_agent_types::early_networking::SwitchSlot,
+) -> dpd_client::Client {
+    let port = {
+        let dendrite = cptestctx.dendrite.read().unwrap();
+        dendrite
+            .get(&switch_slot)
+            .unwrap_or_else(|| {
+                panic!(
+                    "no dendrite instance running for {switch_slot:?} in \
+                     test fixture",
+                )
+            })
+            .port
+    };

     let client_state = dpd_client::ClientState {
         tag: String::from("nexus-test"),
@@ -145,6 +162,40 @@ pub fn dpd_client(
     dpd_client::Client::new(&format!("http://[{addr}]:{port}"), client_state)
 }

+/// Build DPD clients for every switch slot in the test fixture, ordered by
+/// `SwitchSlot`.
+///
+/// Use this when validating a per-switch invariant (e.g., "every switch has
+/// the full underlay-member set"). Iterates the dendrite map deterministically
+/// so log output and assertions are stable across test passes.
+pub fn dpd_clients_by_switch<N>(
+    cptestctx: &ControlPlaneTestContext<N>,
+) -> BTreeMap<sled_agent_types::early_networking::SwitchSlot, dpd_client::Client>
+{
+    let dendrite = cptestctx.dendrite.read().unwrap();
+    dendrite
+        .iter()
+        .map(|(slot, instance)| (*slot, instance.port))
+        .collect::<BTreeMap<_, _>>()
+        .into_iter()
+        .map(|(slot, port)| {
+            let client_state = dpd_client::ClientState {
+                tag: String::from("nexus-test"),
+                log: cptestctx.logctx.log.new(slog::o!(
+                    "component" => "DpdClient",
+                    "switch_slot" => format!("{slot:?}"),
+                )),
+            };
+            let addr = Ipv6Addr::LOCALHOST;
+            let client = dpd_client::Client::new(
+                &format!("http://[{addr}]:{port}"),
+                client_state,
+            );
+            (slot, client)
+        })
+        .collect()
+}
+
 #[cfg(test)]
 mod test {
     use crate::TEST_SUITE_PASSWORD;
diff --git a/nexus/test-utils/src/multicast.rs b/nexus/test-utils/src/multicast.rs
new file mode 100644
index 00000000000..2d9179c95e2
--- /dev/null
+++ b/nexus/test-utils/src/multicast.rs
@@ -0,0 +1,224 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Multicast-specific Nexus integration test helpers.
+//!
+//! Wraps the long-running sim instances exposed by the test starter
+//! (`DdmInstance`) with function-style helpers that synchronize them
+//! against state in the datastore.
+
+use std::collections::{BTreeMap, BTreeSet, HashSet};
+use std::sync::Arc;
+use std::time::Duration;
+
+use nexus_db_queries::context::OpContext;
+use nexus_test_interface::NexusServer;
+use nexus_types::deployment::SledFilter;
+use nexus_types::identity::Asset;
+use nexus_types::inventory::Collection;
+use omicron_test_utils::dev::maghemite::{
+    PeerMap, SimPeerStatus, sim_peer_info,
+};
+use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition};
+use omicron_uuid_kinds::GenericUuid;
+use slog::warn;
+
+use crate::ControlPlaneTestContext;
+
+const READY_POLL_INTERVAL: Duration = Duration::from_millis(100);
+const READY_TIMEOUT: Duration = Duration::from_secs(120);
+
+/// Populate every switch zone's `DdmInstance` peer table from the in-service
+/// sleds recorded in the datastore.
+///
+/// The multicast reconciler prefers DDM peer topology and falls back to
+/// inventory only when DDM is empty or unreachable. Production runs the
+/// real `ddmd`, which populates peers; tests run the in-process
+/// `DdmInstance` simulator, which starts with an empty peer table.
+///
+/// This util synthesizes the production primary path: it waits for
+/// inventory to report SP entries for every in-service sled, looks up
+/// each sled's `sp_slot` from inventory the same way the reconciler's
+/// fallback does (`find_sp_for_sled` matches by serial number), and
+/// injects a peer per switch with the synthetic interface name
+/// `tfportrear<sp_slot>_0` matching `parse_ddm_if_name_to_port`'s
+/// expected format and the rear port the inventory fallback would
+/// resolve to.
+///
+/// Both paths agree on port info by construction. Deriving `sp_slot`
+/// from the same inventory the fallback uses guarantees that toggling
+/// between the primary path and the fallback yields an identical
+/// sled-to-port mapping.
+///
+/// `DdmInstance::set_peers` has replace semantics, so calling this
+/// multiple times always yields a fresh map. Removed sleds drop and
+/// new ones appear.
+///
+/// Tests that explicitly want to exercise the inventory fallback should
+/// follow this call with [`clear_ddm_peers`] (or skip the helper entirely).
+pub async fn populate_ddm_peers<N: NexusServer>(
+    cptestctx: &ControlPlaneTestContext<N>,
+) {
+    let log = &cptestctx.logctx.log;
+    let datastore = cptestctx.server.datastore();
+    let opctx = OpContext::for_tests(log.clone(), datastore.clone());
+
+    let sleds = datastore
+        .sled_list_all_batched(&opctx, SledFilter::InService)
+        .await
+        .expect("failed to list in-service sleds for DDM peer population");
+    let current_ids: BTreeSet<uuid::Uuid> =
+        sleds.iter().map(|sled| sled.id().into_untyped_uuid()).collect();
+
+    // Snapshot the cache without holding the lock across the inventory
+    // wait. On a hit, we're done; on a miss we drop the lock, do the
+    // wait + build, then take the lock again to publish. Concurrent
+    // misses may each build their own (idempotent) `PeerMap`; last
+    // writer wins, which is harmless because builds converge on the
+    // same answer for a given sled-set.
+    let cached = cptestctx.multicast_ddm_peers.lock().unwrap().clone();
+    let peers = match cached {
+        Some((ids, peers)) if ids == current_ids => peers,
+        _ => {
+            // Wait until inventory has both a sled-agent record and an
+            // SP entry for every in-service sled, then capture that
+            // collection so we can resolve `sp_slot` per sled below.
+            let expected_serials: HashSet<String> = sleds
+                .iter()
+                .map(|sled| sled.serial_number().to_string())
+                .collect();
+            let expected_sled_ids: HashSet<uuid::Uuid> =
+                current_ids.iter().copied().collect();
+            let collection = wait_for_inventory_with_sleds(
+                cptestctx,
+                &expected_sled_ids,
+                &expected_serials,
+            )
+            .await;
+
+            // Build the peer map. Match SPs to sleds by serial number
+            // in the same way the reconciler's inventory fallback does.
+            // The synthetic interface name `tfportrear<sp_slot>_0`
+            // round-trips through our parser to the same rear port the
+            // fallback would resolve to.
+            let new_peers: PeerMap = sleds
+                .iter()
+                .map(|sled| {
+                    let sp = collection
+                        .sps
+                        .iter()
+                        .find(|(bb, _)| {
+                            bb.serial_number == sled.serial_number()
+                                && bb.part_number == sled.part_number()
+                        })
+                        .or_else(|| {
+                            collection.sps.iter().find(|(bb, _)| {
+                                bb.serial_number == sled.serial_number()
+                            })
+                        })
+                        .map(|(_, sp)| sp)
+                        .unwrap_or_else(|| {
+                            panic!(
+                                "no inventory SP entry for sled {} (serial \
+                                 {}); inventory subset check should have \
+                                 caught this",
+                                sled.id(),
+                                sled.serial_number(),
+                            )
+                        });
+                    let host = sled.serial_number().to_string();
+                    let if_name = format!("tfportrear{}_0", sp.sp_slot);
+                    (
+                        host.clone(),
+                        sim_peer_info(
+                            sled.ip(),
+                            &host,
+                            &if_name,
+                            0, // kind: 0 = server router
+                            SimPeerStatus::Active,
+                        ),
+                    )
+                })
+                .collect();
+
+            *cptestctx.multicast_ddm_peers.lock().unwrap() =
+                Some((current_ids, new_peers.clone()));
+            new_peers
+        }
+    };
+
+    // Iterate switches in `SwitchSlot` order so log output across test
+    // passes is deterministic. `set_peers` has replace semantics, so
+    // cloning per switch is safe and supports tests that interleave
+    // `clear_ddm_peers`.
+    let switches: BTreeMap<_, _> = cptestctx.ddm.iter().collect();
+    for ddm in switches.values() {
+        ddm.set_peers(peers.clone());
+    }
+}
+
+/// Clear every switch zone's `DdmInstance` peer table.
+///
+/// Typically, use this in tests that exercise `fetch_sled_mapping_from_inventory`.
+///
+/// The reconciler treats an empty DDM peer response as having "no live topology",
+/// which forces the inventory lookup production uses when DDM is genuinely
+/// down.
+pub fn clear_ddm_peers<N>(cptestctx: &ControlPlaneTestContext<N>) {
+    for ddm in cptestctx.ddm.values() {
+        ddm.set_peers(PeerMap::new());
+    }
+}
+
+/// Wait until inventory contains both a sled-agent record *and* an SP entry
+/// for every sled in `expected_sled_ids`, then return the collection.
+///
+/// Both checks are required: `populate_ddm_peers` synthesizes peers from
+/// `collection.sps`, so a sled-agent-only check could let the helper exit
+/// early and panic in the SP lookup if MGS hasn't published the SP yet.
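+///
+/// The wait condition, in sketch form:
+///
+/// ```text
+/// expected_sled_ids ⊆ collection.sled_agents (by sled id)
+/// expected_serials  ⊆ collection.sps         (by baseboard serial)
+/// ```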
+async fn wait_for_inventory_with_sleds<N: NexusServer>(
+    cptestctx: &ControlPlaneTestContext<N>,
+    expected_sled_ids: &HashSet<uuid::Uuid>,
+    expected_serials: &HashSet<String>,
+) -> Arc<Collection> {
+    let log = cptestctx.logctx.log.clone();
+    let server = &cptestctx.server;
+    wait_for_condition::<_, (), _, _>(
+        || async {
+            match server.inventory_collect_and_get_latest_collection().await {
+                Ok(Some(collection)) => {
+                    let inv_sled_ids: HashSet<_> = collection
+                        .sled_agents
+                        .iter()
+                        .map(|sled_agent| {
+                            sled_agent.sled_id.into_untyped_uuid()
+                        })
+                        .collect();
+                    let inv_sp_serials: HashSet<_> = collection
+                        .sps
+                        .keys()
+                        .map(|bb| bb.serial_number.to_string())
+                        .collect();
+
+                    if expected_sled_ids.is_subset(&inv_sled_ids)
+                        && expected_serials.is_subset(&inv_sp_serials)
+                    {
+                        Ok(Arc::new(collection))
+                    } else {
+                        Err(CondCheckError::NotYet)
+                    }
+                }
+                Ok(None) => Err(CondCheckError::NotYet),
+                Err(e) => {
+                    warn!(log, "inventory fetch failed: {e}");
+                    Err(CondCheckError::NotYet)
+                }
+            }
+        },
+        &READY_POLL_INTERVAL,
+        &READY_TIMEOUT,
+    )
+    .await
+    .expect("inventory did not catch up to in-service sleds and SPs")
+}
diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs
index 693aea88732..c5139b7dc42 100644
--- a/nexus/test-utils/src/nexus_test.rs
+++ b/nexus/test-utils/src/nexus_test.rs
@@ -35,10 +35,12 @@ use oximeter_collector::Oximeter;
 use oximeter_producer::Server as ProducerServer;
 use sled_agent_types::early_networking::SwitchSlot;
 use std::collections::BTreeMap;
+use std::collections::BTreeSet;
 use std::collections::HashMap;
-use std::sync::{Arc, RwLock};
+use std::sync::{Arc, Mutex, RwLock};
 use std::time::Duration;
 use transient_dns_server::TransientDnsServer;
+use uuid::Uuid;

 pub struct ControlPlaneBuilder<'a> {
     // required
@@ -117,6 +119,15 @@ pub struct ControlPlaneTestContext<N> {
     /// Ports of stopped dendrite instances (for use by start_dendrite)
     pub stopped_dendrite_ports: RwLock<HashMap<SwitchSlot, u16>>,
     pub mgd: HashMap<SwitchSlot, dev::maghemite::MgdInstance>,
+    pub ddm: HashMap<SwitchSlot, dev::maghemite::DdmInstance>,
+    /// Cache used by [`crate::multicast::populate_ddm_peers`] so the
+    /// inventory collection used to derive `sp_slot` for every sled runs
+    /// once per fixture per stable sled-set instead of on every call.
+    ///
+    /// This is keyed by the in-service sled-id set so the cache rebuilds
+    /// whenever a sled transitions into or out of service.
+    pub multicast_ddm_peers:
+        Mutex<Option<(BTreeSet<Uuid>, dev::maghemite::PeerMap)>>,
     pub external_dns_zone_name: String,
     pub external_dns: TransientDnsServer,
     pub internal_dns: TransientDnsServer,
@@ -320,6 +331,9 @@ impl<N: NexusServer> ControlPlaneTestContext<N> {
         for (_, mut mgd) in self.mgd {
             mgd.cleanup().await.unwrap();
         }
+        for (_, mut ddm) in self.ddm {
+            ddm.cleanup().await;
+        }
         self.logctx.cleanup_successful();
     }
 }
diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs
index d05e24f10a2..81be5dbb10b 100644
--- a/nexus/test-utils/src/starter.rs
+++ b/nexus/test-utils/src/starter.rs
@@ -108,7 +108,7 @@ use std::collections::HashMap;
 use std::fmt::Debug;
 use std::iter::{once, repeat, zip};
 use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV6};
-use std::sync::{Arc, RwLock};
+use std::sync::{Arc, Mutex, RwLock};
 use std::time::Duration;
 use transient_dns_server::TransientDnsServer;
 use uuid::Uuid;
@@ -146,6 +146,7 @@ pub struct ControlPlaneStarter<'a, N: NexusServer> {
     pub gateway: BTreeMap<SwitchSlot, GatewayTestContext>,
     pub dendrite: RwLock<HashMap<SwitchSlot, dev::dendrite::DendriteInstance>>,
     pub mgd: HashMap<SwitchSlot, dev::maghemite::MgdInstance>,
+    pub ddm: HashMap<SwitchSlot, dev::maghemite::DdmInstance>,

     // NOTE: Only exists after starting Nexus, until external Nexus is
     // initialized.
@@ -203,6 +204,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { gateway: BTreeMap::new(), dendrite: RwLock::new(HashMap::new()), mgd: HashMap::new(), + ddm: HashMap::new(), nexus_internal: None, nexus_internal_addr: None, external_dns_zone_name: None, @@ -461,6 +463,17 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { self.config.pkg.mgd.insert(switch_slot, config); } + pub async fn start_ddm(&mut self, switch_slot: SwitchSlot) { + let log = &self.logctx.log; + debug!(log, "Starting DDM sim"; "switch_slot" => ?switch_slot); + + let ddm = dev::maghemite::DdmInstance::start().await.unwrap(); + let port = ddm.port; + self.ddm.insert(switch_slot, ddm); + + debug!(log, "DDM sim port is {port}"); + } + pub async fn record_switch_dns( &mut self, sled_id: SledUuid, @@ -482,6 +495,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { self.dendrite.read().unwrap().get(&switch_slot).unwrap().port, self.gateway.get(&switch_slot).unwrap().port, self.mgd.get(&switch_slot).unwrap().port, + self.ddm.get(&switch_slot).unwrap().port, ) .unwrap() } @@ -1249,6 +1263,8 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { dendrite: RwLock::new(self.dendrite.into_inner().unwrap()), stopped_dendrite_ports: RwLock::new(HashMap::new()), mgd: self.mgd, + ddm: self.ddm, + multicast_ddm_peers: Mutex::new(None), external_dns_zone_name: self.external_dns_zone_name.unwrap(), external_dns: self.external_dns.unwrap(), internal_dns: self.internal_dns.unwrap(), @@ -1290,6 +1306,9 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { for (_, mut mgd) in self.mgd { mgd.cleanup().await.unwrap(); } + for (_, mut ddm) in self.ddm { + ddm.cleanup().await; + } self.logctx.cleanup_successful(); } @@ -1630,6 +1649,12 @@ pub(crate) async fn setup_with_config_impl( builder.start_mgd(SwitchSlot::Switch0).boxed() }), ), + ( + "start_ddm_switch0", + Box::new(|builder| { + builder.start_ddm(SwitchSlot::Switch0).boxed() + }), + ), ( "record_switch_dns", Box::new(|builder| { @@ -1674,6 +1699,12 @@ pub(crate) async fn setup_with_config_impl( builder.start_mgd(SwitchSlot::Switch1).boxed() }), ), + ( + "start_ddm_switch1", + Box::new(|builder| { + builder.start_ddm(SwitchSlot::Switch1).boxed() + }), + ), ( "record_switch_dns", Box::new(|builder| { diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index a32a9b86081..88e775aa266 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -212,7 +212,6 @@ fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # Use shorter TTLs for tests to ensure cache invalidation logic is exercised -multicast_reconciler.sled_cache_ttl_secs = 60 multicast_reconciler.backplane_cache_ttl_secs = 120 trust_quorum.period_secs = 60 attached_subnet_manager.period_secs = 60 diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs index 350757cf1de..714880feb37 100644 --- a/nexus/tests/integration_tests/initialization.rs +++ b/nexus/tests/integration_tests/initialization.rs @@ -158,6 +158,11 @@ async fn test_nexus_boots_before_dendrite() { starter.start_mgd(SwitchSlot::Switch1).await; info!(log, "Started mgd"); + info!(log, "Starting ddm"); + starter.start_ddm(SwitchSlot::Switch0).await; + starter.start_ddm(SwitchSlot::Switch1).await; + info!(log, "Started ddm"); + info!(log, "Populating internal DNS records"); starter .record_switch_dns( @@ -197,6 +202,8 @@ async fn nexus_schema_test_setup( starter.start_dendrite(SwitchSlot::Switch1).await; 
starter.start_mgd(SwitchSlot::Switch0).await; starter.start_mgd(SwitchSlot::Switch1).await; + starter.start_ddm(SwitchSlot::Switch0).await; + starter.start_ddm(SwitchSlot::Switch1).await; starter.populate_internal_dns().await; } diff --git a/nexus/tests/integration_tests/multicast/cache_invalidation.rs b/nexus/tests/integration_tests/multicast/cache_invalidation.rs deleted file mode 100644 index de744c19d9d..00000000000 --- a/nexus/tests/integration_tests/multicast/cache_invalidation.rs +++ /dev/null @@ -1,645 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Integration tests for multicast reconciler cache invalidation. -//! -//! Tests inventory and backplane caches used by the multicast reconciler: -//! -//! - Sled move detection: When a sled moves to a different switch port, the -//! reconciler detects this via inventory and updates DPD port mappings -//! - Cache TTL refresh: Verifies caches are refreshed when TTL expires -//! - Backplane cache expiry: Tests that stale backplane mappings are cleaned up - -use http::{Method, StatusCode}; - -use gateway_client::types::{PowerState, RotState, SpState}; -use nexus_db_lookup::LookupPath; -use nexus_db_queries::context::OpContext; -use nexus_test_utils::resource_helpers::{ - create_default_ip_pools, create_project, -}; -use nexus_test_utils_macros::nexus_test; -use nexus_types::deployment::SledFilter; -use nexus_types::external_api::sled; -use nexus_types::inventory::SpType; -use omicron_nexus::Server; -use omicron_nexus::TestInterfaces; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, MulticastGroupUuid}; - -use super::*; -use crate::integration_tests::instances::instance_wait_for_state; - -/// Test that multicast operations can handle physical sled movement. 
-/// -/// This test simulates a sled being physically moved to a different rack slot: -/// - Create a multicast group and instance, wait for member to join -/// - Verify the member is programmed on the correct rear port (based on original `sp_slot`) -/// - Run reconciler multiple times without inventory change to verify no spurious invalidation -/// - Insert a new inventory collection with a different `sp_slot` for the same sled -/// - Reconciler detects sled location change and invalidates caches automatically -/// - Verify DPD now uses the new rear port matching the new `sp_slot` -#[nexus_test(server = Server)] -async fn test_sled_move_updates_multicast_port_mapping( - cptestctx: &ControlPlaneTestContext, -) { - const PROJECT_NAME: &str = "test-project"; - const GROUP_NAME: &str = "sled-move-test-group"; - const INSTANCE_NAME: &str = "sled-move-test-instance"; - - ensure_multicast_test_ready(cptestctx).await; - - let client = &cptestctx.external_client; - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let log = &cptestctx.logctx.log; - let opctx = OpContext::for_tests(log.clone(), datastore.clone()); - - // Create project and pools in parallel - ops::join3( - create_default_ip_pools(client), - create_project(client, PROJECT_NAME), - create_multicast_ip_pool(client, "sled-move-pool"), - ) - .await; - - // Create instance (no multicast groups at creation - implicit model) - let instance = instance_for_multicast_groups( - cptestctx, - PROJECT_NAME, - INSTANCE_NAME, - true, - &[], - ) - .await; - - // Add instance to multicast group via instance-centric API - multicast_group_attach(&cptestctx, PROJECT_NAME, INSTANCE_NAME, GROUP_NAME) - .await; - wait_for_group_active(client, GROUP_NAME).await; - - let instance_uuid = InstanceUuid::from_untyped_uuid(instance.identity.id); - - // Wait for member to join - wait_for_member_state( - cptestctx, - GROUP_NAME, - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - // Verify initial port mapping (based on current inventory `sp_slot`) - verify_inventory_based_port_mapping(cptestctx, &instance_uuid) - .await - .expect("Should verify initial port mapping"); - - // Run reconciler again without new inventory to establish the - // baseline collection ID in the reconciler. Running it twice ensures the - // first run sets `last_seen_collection_id`, and the second run confirms - // no unnecessary cache invalidation occurs when collection is unchanged. 
-    wait_for_multicast_reconciler(&cptestctx.lockstep_client).await;
-    wait_for_multicast_reconciler(&cptestctx.lockstep_client).await;
-
-    // Verify port mapping is unchanged (no spurious cache invalidation)
-    verify_inventory_based_port_mapping(cptestctx, &instance_uuid)
-        .await
-        .expect("Port mapping should be unchanged when inventory unchanged");
-
-    // Assert that the member is in "Joined" state
-    let members_before = list_multicast_group_members(client, GROUP_NAME).await;
-    assert_eq!(members_before.len(), 1, "should have exactly one member");
-    assert_eq!(
-        members_before[0].state, "Joined",
-        "member should be in Joined state before sled move"
-    );
-
-    // Get the sled this instance is running on
-    let sled_id = nexus
-        .active_instance_info(&instance_uuid, None)
-        .await
-        .expect("Active instance info should be available")
-        .expect("Instance should be on a sled")
-        .sled_id;
-
-    // Get sled baseboard information
-    let sleds = datastore
-        .sled_list_all_batched(&opctx, SledFilter::InService)
-        .await
-        .expect("Should list in-service sleds");
-    let sled = sleds
-        .into_iter()
-        .find(|s| s.id() == sled_id)
-        .expect("Should find sled in database");
-
-    // Get current inventory to see the original sp_slot
-    let original_inventory = datastore
-        .inventory_get_latest_collection(&opctx)
-        .await
-        .expect("Should fetch latest inventory collection")
-        .expect("Inventory collection should exist");
-
-    let original_sp = original_inventory
-        .sps
-        .iter()
-        .find(|(bb, _)| bb.serial_number == sled.serial_number())
-        .map(|(_, sp)| sp)
-        .expect("Should find SP for sled in original inventory");
-
-    let original_slot = original_sp.sp_slot;
-    let sled_serial = sled.serial_number().to_string();
-    let sled_part_number = sled.part_number().to_string();
-
-    // Verify DPD has the original port before the move
-    let dpd = nexus_test_utils::dpd_client(cptestctx);
-    let original_port_id = dpd_client::types::PortId::Rear(
-        dpd_client::types::Rear::try_from(format!("rear{original_slot}"))
-            .expect("Should be valid rear port string"),
-    );
-
-    // Determine a valid target slot by querying DPD's backplane map.
-    // Prefer a different slot if available; otherwise fall back to the same.
-    let backplane = dpd
-        .backplane_map()
-        .await
-        .expect("Should fetch backplane map")
-        .into_inner();
-    let mut valid_slots: Vec<u16> = backplane
-        .keys()
-        .filter_map(|k| {
-            k.strip_prefix("rear").and_then(|s| s.parse::<u16>().ok())
-        })
-        .collect();
-    valid_slots.sort_unstable();
-    valid_slots.dedup();
-    let new_slot = valid_slots
-        .iter()
-        .copied()
-        .find(|s| *s != original_slot)
-        .unwrap_or(original_slot);
-
-    // Build a new inventory collection with the sled in a different slot
-    let mut builder = nexus_inventory::CollectionBuilder::new("sled-move-test");
-    builder.found_sp_state(
-        "test-sp",
-        SpType::Sled,
-        new_slot,
-        SpState {
-            serial_number: sled_serial,
-            model: sled_part_number,
-            power_state: PowerState::A0,
-            revision: 0,
-            base_mac_address: [0; 6],
-            hubris_archive_id: "test-hubris".to_string(),
-            rot: RotState::CommunicationFailed {
-                message: "test-rot-state".to_string(),
-            },
-        },
-    );
-
-    let new_collection = builder.build();
-
-    // Insert the new inventory collection
-    datastore
-        .inventory_insert_collection(&opctx, &new_collection)
-        .await
-        .expect("Should insert new inventory collection");
-
-    // Activate the inventory loader to update the watch channel with the new
-    // collection, then activate the reconciler which will detect the sled
-    // location change and invalidate caches.
- activate_inventory_loader(&cptestctx.lockstep_client).await; - nexus.invalidate_multicast_caches(); - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Verify that DPD now uses the new rear port (matching new `sp_slot`) - // This helper reads the latest inventory and asserts DPD has a member - // on rear{`sp_slot`}, so it will verify the new mapping is right - verify_inventory_based_port_mapping(cptestctx, &instance_uuid) - .await - .expect("Port mapping should be updated after cache invalidation"); - - // Assert that the member is still in "Joined" state after the move - let members_after = list_multicast_group_members(client, GROUP_NAME).await; - assert_eq!(members_after.len(), 1, "should still have exactly one member"); - assert_eq!( - members_after[0].state, "Joined", - "member should still be in Joined state after sled move" - ); - assert_eq!( - members_after[0].instance_id, instance.identity.id, - "member should still reference the same instance" - ); - - // Verify stale port cleanup: fetch DPD state and ensure old port was removed - let members = datastore - .multicast_group_members_list_by_instance( - &opctx, - instance_uuid, - &DataPageParams::max_page(), - ) - .await - .expect("Should list multicast members for instance"); - let member = members - .first() - .expect("Instance should have at least one multicast membership"); - - let external_group = datastore - .multicast_group_fetch( - &opctx, - MulticastGroupUuid::from_untyped_uuid(member.external_group_id), - ) - .await - .expect("Should fetch external multicast group"); - let underlay_group_id = external_group - .underlay_group_id - .expect("External group should have underlay_group_id"); - - let underlay_group = datastore - .underlay_multicast_group_fetch(&opctx, underlay_group_id) - .await - .expect("Should fetch underlay multicast group"); - - let dpd_client = nexus_test_utils::dpd_client(cptestctx); - let underlay_group_response = dpd_client - .multicast_group_get(&underlay_group.multicast_ip.ip()) - .await - .expect("DPD multicast_group_get should succeed") - .into_inner(); - - let dpd_members = match underlay_group_response { - dpd_client::types::MulticastGroupResponse::Underlay { - members, .. - } => members, - dpd_client::types::MulticastGroupResponse::External { .. } => { - panic!("Expected Underlay group, got External"); - } - }; - - // Verify that the old port membership has been removed (stale port cleanup) - let has_old_port_member = dpd_members.iter().any(|m| { - matches!(m.direction, dpd_client::types::Direction::Underlay) - && m.port_id == original_port_id - }); - - assert!( - !has_old_port_member, - "Old underlay member with rear{original_slot} should have been removed after sled move" - ); -} - -/// Test for cache TTL behavior. 
-///
-/// This test verifies that both sled and backplane cache TTL expiry work correctly:
-///
-/// Sled cache TTL with inventory change:
-/// - Start test server with short TTLs (sled=2s, backplane=1s)
-/// - Create multicast group and instance, wait for member to join
-/// - Insert new inventory with different `sp_slot` (simulating sled move)
-/// - Wait for sled cache TTL to expire
-/// - Verify DPD uses the new rear port after reconciler refreshes cache
-///
-/// Backplane cache TTL without change:
-/// - Wait for backplane cache TTL to expire (tests independent expiry)
-/// - Activate reconciler (refreshes expired backplane cache from DPD)
-/// - Verify port mapping still works after cache refresh
-#[tokio::test]
-async fn test_cache_ttl_behavior() {
-    const PROJECT_NAME: &str = "ttl-test-project";
-    const GROUP_NAME: &str = "ttl-test-group";
-    const INSTANCE_NAME: &str = "ttl-test-instance";
-
-    // Start test server with custom config
-    let cptestctx =
-        nexus_test_utils::ControlPlaneBuilder::new("test_cache_ttl_behavior")
-            .customize_nexus_config(&|config| {
-                // Set short cache TTLs for testing
-                config
-                    .pkg
-                    .background_tasks
-                    .multicast_reconciler
-                    .sled_cache_ttl_secs =
-                    chrono::TimeDelta::seconds(2).to_std().unwrap();
-                config
-                    .pkg
-                    .background_tasks
-                    .multicast_reconciler
-                    .backplane_cache_ttl_secs =
-                    chrono::TimeDelta::seconds(1).to_std().unwrap();
-
-                // Ensure multicast is enabled
-                config.pkg.multicast.enabled = true;
-            })
-            .start::<Server>()
-            .await;
-
-    ensure_multicast_test_ready(&cptestctx).await;
-
-    // Local handles for DB and opctx
-    let nexus = &cptestctx.server.server_context().nexus;
-    let datastore = nexus.datastore();
-    let opctx =
-        OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone());
-
-    let client = &cptestctx.external_client;
-
-    // Create project and pools in parallel
-    ops::join3(
-        create_default_ip_pools(client),
-        create_project(client, PROJECT_NAME),
-        create_multicast_ip_pool(client, "ttl-test-pool"),
-    )
-    .await;
-
-    // Create instance (no multicast groups at creation - implicit model)
-    let instance = instance_for_multicast_groups(
-        &cptestctx,
-        PROJECT_NAME,
-        INSTANCE_NAME,
-        true,
-        &[],
-    )
-    .await;
-
-    // Add instance to multicast group via instance-centric API
-    multicast_group_attach(&cptestctx, PROJECT_NAME, INSTANCE_NAME, GROUP_NAME)
-        .await;
-    wait_for_group_active(client, GROUP_NAME).await;
-
-    let instance_uuid = InstanceUuid::from_untyped_uuid(instance.identity.id);
-
-    // Wait for member to join
-    wait_for_member_state(
-        &cptestctx,
-        GROUP_NAME,
-        instance.identity.id,
-        nexus_db_model::MulticastGroupMemberState::Joined,
-    )
-    .await;
-
-    // Verify initial port mapping (this populates both caches)
-    verify_inventory_based_port_mapping(&cptestctx, &instance_uuid)
-        .await
-        .expect("Should verify initial port mapping");
-
-    // Test sled cache TTL with inventory change
-
-    // Get the sled this instance is running on
-    let sled_id = nexus
-        .active_instance_info(&instance_uuid, None)
-        .await
-        .expect("Active instance info should be available")
-        .expect("Instance should be on a sled")
-        .sled_id;
-
-    // Get sled baseboard information
-    let sleds = datastore
-        .sled_list_all_batched(&opctx, SledFilter::InService)
-        .await
-        .expect("Should list in-service sleds");
-    let sled = sleds
-        .into_iter()
-        .find(|s| s.id() == sled_id)
-        .expect("Should find sled in database");
-
-    // Get current inventory to see the original sp_slot
-    let original_inventory = datastore
-        .inventory_get_latest_collection(&opctx)
-        .await
-        .expect("Should fetch latest inventory collection")
-        .expect("Inventory collection should exist");
-
-    let original_sp = original_inventory
-        .sps
-        .iter()
-        .find(|(bb, _)| bb.serial_number == sled.serial_number())
-        .map(|(_, sp)| sp)
-        .expect("Should find SP for sled in original inventory");
-
-    let original_slot = original_sp.sp_slot;
-    let sled_serial = sled.serial_number().to_string();
-    let sled_part_number = sled.part_number().to_string();
-
-    // Determine a valid target slot by querying DPD's backplane map.
-    let dpd = nexus_test_utils::dpd_client(&cptestctx);
-    let backplane = dpd
-        .backplane_map()
-        .await
-        .expect("Should fetch backplane map")
-        .into_inner();
-    let mut valid_slots: Vec<u16> = backplane
-        .keys()
-        .filter_map(|k| {
-            k.strip_prefix("rear").and_then(|s| s.parse::<u16>().ok())
-        })
-        .collect();
-    valid_slots.sort_unstable();
-    valid_slots.dedup();
-    let new_slot = valid_slots
-        .iter()
-        .copied()
-        .find(|s| *s != original_slot)
-        .unwrap_or(original_slot);
-
-    // Build a new inventory collection with the sled in a different slot
-    let mut builder =
-        nexus_inventory::CollectionBuilder::new("ttl-refresh-test");
-    builder.found_sp_state(
-        "test-sp",
-        SpType::Sled,
-        new_slot,
-        SpState {
-            serial_number: sled_serial,
-            model: sled_part_number,
-            power_state: PowerState::A0,
-            revision: 0,
-            base_mac_address: [0; 6],
-            hubris_archive_id: "test-hubris".to_string(),
-            rot: RotState::CommunicationFailed {
-                message: "test-rot-state".to_string(),
-            },
-        },
-    );
-
-    let new_collection = builder.build();
-
-    // Insert the new inventory collection
-    datastore
-        .inventory_insert_collection(&opctx, &new_collection)
-        .await
-        .expect("Should insert new inventory collection");
-
-    // Wait for sled cache TTL to expire (2 seconds)
-    tokio::time::sleep(std::time::Duration::from_millis(2500)).await;
-
-    wait_for_condition_with_reconciler(
-        &cptestctx.lockstep_client,
-        || async {
-            // Try to verify the inventory-based port mapping.
-            // This will succeed once DPD has been updated with the new rear port.
-            match verify_inventory_based_port_mapping(
-                &cptestctx,
-                &instance_uuid,
-            )
-            .await
-            {
-                Ok(()) => Ok(()),
-                Err(_) => {
-                    // Not yet updated; the reconciler needs another cycle.
-                    Err(CondCheckError::<()>::NotYet)
-                }
-            }
-        },
-        &POLL_INTERVAL,
-        &MULTICAST_OPERATION_TIMEOUT,
-    )
-    .await
-    .expect("DPD should update with new rear port after sled cache TTL expiry");
-
-    // Test backplane cache TTL without change
-
-    // Wait for backplane cache TTL to expire (1 second)
-    tokio::time::sleep(std::time::Duration::from_secs(1)).await;
-
-    // Force cache access by activating the reconciler. This will cause the
-    // reconciler to check the backplane cache, find it expired, and refresh
-    // from DPD.
- wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Verify member is still on the right port after backplane cache refresh - verify_inventory_based_port_mapping(&cptestctx, &instance_uuid) - .await - .expect("Port mapping should work after backplane cache TTL expiry"); - - // Verify member is still in "Joined" state after all cache operations - let members = list_multicast_group_members(client, GROUP_NAME).await; - assert_eq!(members.len(), 1, "should still have exactly one member"); - assert_eq!( - members[0].state, "Joined", - "member should remain in Joined state after cache operations" - ); - assert_eq!( - members[0].instance_id, instance.identity.id, - "member should still reference the same instance" - ); - - cptestctx.teardown().await; -} - -/// Verify expunged sleds are excluded from multicast cache after refresh. -#[nexus_test(extra_sled_agents = 1)] -async fn test_sled_expunge_removes_from_multicast_cache( - cptestctx: &ControlPlaneTestContext, -) { - const PROJECT_NAME: &str = "expunge-test-project"; - const GROUP_NAME: &str = "expunge-test-group"; - const INSTANCE_NAME: &str = "expunge-test-instance"; - - ensure_multicast_test_ready(cptestctx).await; - - let client = &cptestctx.external_client; - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); - - // Make the second sled non-provisionable so instances go to the first sled - let (authz_sled, ..) = LookupPath::new(&opctx, datastore) - .sled_id(cptestctx.second_sled_id()) - .lookup_for(nexus_db_queries::authz::Action::Modify) - .await - .expect("lookup authz_sled"); - datastore - .sled_set_provision_policy( - &opctx, - &authz_sled, - nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable, - ) - .await - .expect("set sled provision policy"); - - ops::join3( - create_default_ip_pools(client), - create_project(client, PROJECT_NAME), - create_multicast_ip_pool(client, "expunge-test-pool"), - ) - .await; - - let instance = instance_for_multicast_groups( - cptestctx, - PROJECT_NAME, - INSTANCE_NAME, - true, - &[], - ) - .await; - - multicast_group_attach(&cptestctx, PROJECT_NAME, INSTANCE_NAME, GROUP_NAME) - .await; - wait_for_group_active(client, GROUP_NAME).await; - - let instance_uuid = InstanceUuid::from_untyped_uuid(instance.identity.id); - - wait_for_member_state( - cptestctx, - GROUP_NAME, - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - verify_inventory_based_port_mapping(&cptestctx, &instance_uuid) - .await - .expect("Should verify initial port mapping"); - - let first_sled_id = cptestctx.first_sled_id(); - cptestctx - .lockstep_client - .make_request( - Method::POST, - "/sleds/expunge", - Some(sled::SledSelector { sled: first_sled_id }), - StatusCode::OK, - ) - .await - .expect("Failed to expunge sled"); - - // Wait for instance to fail (instance-watcher marks instances on expunged sleds as "Failed") - instance_wait_for_state(client, instance_uuid, InstanceState::Failed).await; - - // Manually invalidate caches. - // - // Inventory-based invalidation is tested in - // `test_sled_move_updates_multicast_port_mapping`. This test verifies cache - // refresh uses SledFilter::InService, which excludes expunged sleds. 
- nexus.invalidate_multicast_caches(); - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - wait_for_member_state( - cptestctx, - GROUP_NAME, - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, - ) - .await; - - let in_service_sleds = datastore - .sled_list_all_batched(&opctx, SledFilter::InService) - .await - .expect("Failed to list in-service sleds"); - - assert!( - !in_service_sleds.iter().any(|s| s.id() == first_sled_id), - "Expunged sled should not appear in InService sled list" - ); - - let all_sleds = datastore - .sled_list_all_batched(&opctx, SledFilter::All) - .await - .expect("Failed to list all sleds"); - - assert!( - all_sleds.iter().any(|s| s.id() == first_sled_id), - "Expunged sled should still appear in All filter" - ); -} diff --git a/nexus/tests/integration_tests/multicast/failures.rs b/nexus/tests/integration_tests/multicast/failures.rs index a0f70b79320..da0177d57b8 100644 --- a/nexus/tests/integration_tests/multicast/failures.rs +++ b/nexus/tests/integration_tests/multicast/failures.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Integration tests for multicast group failure and recovery scenarios. //! @@ -214,6 +214,15 @@ async fn test_dpd_failure_during_active_state( /// When DPD is unavailable during implicit group deletion: /// - Group stays in Deleting state (cannot complete cleanup) /// - After DPD recovery, deletion completes +/// +/// This also exercises a partial-cleanup retry invariant. The deletion path +/// in `process_deleting_external_group` is sequential (MRIB withdrawal, sled +/// M2P/forwarding clear, DPD cleanup, DB delete) and bails on first failure. +/// With DPD stopped, MGD-side MRIB removal succeeds and DPD removal fails, +/// so the group stays in "Deleting". After DPD recovery the next reconciler +/// pass must re-issue MRIB removals on already-empty routes without erroring, +/// which depends on `mg_admin_client::static_remove_mcast_route` being +/// idempotent (verified at the RDB layer in maghemite). #[nexus_test] async fn test_dpd_failure_during_deleting_state( cptestctx: &ControlPlaneTestContext, @@ -231,13 +240,19 @@ async fn test_dpd_failure_during_deleting_state( ) .await; + // The single converging pass needs inventory (sled→switch port mapping) + // and DPD ready before the dpd-ensure saga runs. + ensure_inventory_ready(cptestctx).await; + ensure_dpd_ready(cptestctx).await; + // Create instance and add to group create_instance(client, project_name, instance_name).await; multicast_group_attach(cptestctx, project_name, instance_name, group_name) .await; - // Wait for group to reach Active state - wait_for_group_active(client, group_name).await; + let active_group = wait_for_group_active(client, group_name).await; + let multicast_ip = active_group.multicast_ip; + assert_mrib_route_exists(cptestctx, multicast_ip).await; // Stop DPD before triggering deletion cptestctx.stop_dendrite(SwitchSlot::Switch0).await; @@ -294,11 +309,22 @@ async fn test_dpd_failure_during_deleting_state( assert_eq!(group.identity.name.as_str(), group_name); } + // Even though the deletion did not complete, MRIB removal + // ran before the DPD step failed. The route must already be gone. 
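// Aside, ahead of the assertion just below: a sketch of the sequential,
// bail-on-first-failure deletion pass described above. All names here
// (including `GroupHandle` and the helpers) are hypothetical; the real
// logic lives in `process_deleting_external_group`.
async fn deleting_pass_sketch(group: &GroupHandle) -> anyhow::Result<()> {
    // 1. MRIB withdrawal via MGD. Must tolerate an already-removed route,
    //    or a retry after a later step fails would short-circuit here.
    mgd_withdraw_mrib_route(group).await?;
    // 2. Clear sled-agent M2P mappings and forwarding state.
    clear_sled_m2p_and_forwarding(group).await?;
    // 3. DPD cleanup. With dendrite stopped, this is the step that fails,
    //    leaving the group in "Deleting" for the next reconciler pass.
    dpd_remove_group(group).await?;
    // 4. Only after every dataplane step succeeds, delete the DB record.
    db_delete_group_record(group).await
}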
+ assert_mrib_route_absent(cptestctx, multicast_ip).await; + // Restart DPD and activate reconciler to complete deletion cptestctx.restart_dendrite(SwitchSlot::Switch0).await; activate_multicast_reconciler(&cptestctx.lockstep_client).await; cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(cptestctx, group_name).await; + + // The second reconciler pass re-issues MRIB removal on already-empty routes. + // If MGD treated the missing route as an error, the pass would short-circuit + // at MRIB and never retry the DPD/DB cleanup, leaving the group stuck in + // a "Deleting" state and `wait_for_group_deleted` above would time out. The + // route must remain absent and not accidentally re-install. + assert_mrib_route_absent(cptestctx, multicast_ip).await; } #[nexus_test] @@ -990,8 +1016,27 @@ async fn test_left_member_waits_for_group_active( put_upsert::<_, MulticastGroupMember>(client, &join_url, &join_params) .await; - // Verify group is stuck in "Creating" (DPD is down) - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + // Join records the member as "Joining". The reconciler demotes to + // "Left" once it observes the stopped instance. This transition is + // independent of DPD, so the assertion can advance even with + // dendrite stopped. + wait_for_condition( + || async { + let members = + list_multicast_group_members(client, group_name).await; + match members.first() { + Some(m) if m.state == "Left" => Ok(()), + _ => Err(CondCheckError::<()>::NotYet), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect( + "member should reach Left after reconciler observes stopped instance", + ); + let group: MulticastGroup = object_get(client, &format!("/v1/multicast-groups/{group_name}")).await; assert_eq!( @@ -999,7 +1044,6 @@ async fn test_left_member_waits_for_group_active( "Group should be stuck in Creating without DPD" ); - // Verify member is in "Left" state (stopped instance) let members = list_multicast_group_members(client, group_name).await; assert_eq!(members.len(), 1); assert_eq!( @@ -1021,19 +1065,15 @@ async fn test_left_member_waits_for_group_active( .unwrap(); instance_wait_for_running_with_simulation(cptestctx, instance_id).await; - // Run reconciler - member should stay in Left because group is not Active - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Verify member stays in "Left" (waiting for group to become Active) - let members_after = list_multicast_group_members(client, group_name).await; - assert_eq!(members_after.len(), 1); - assert_eq!( - members_after[0].state, "Left", - "Member should stay in Left while group is Creating, got: {}", - members_after[0].state - ); + activate_multicast_reconciler(&cptestctx.lockstep_client).await; + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; - // Verify group is still Creating let group_after: MulticastGroup = object_get(client, &format!("/v1/multicast-groups/{group_name}")).await; assert_eq!( diff --git a/nexus/tests/integration_tests/multicast/groups.rs b/nexus/tests/integration_tests/multicast/groups.rs index 9ae620b7dc2..bf502355ffc 100644 --- a/nexus/tests/integration_tests/multicast/groups.rs +++ b/nexus/tests/integration_tests/multicast/groups.rs @@ -358,11 +358,12 @@ async fn test_instance_multicast_endpoints( let instance = create_instance(client, project_name, instance_name).await; let instance_id = 
        InstanceUuid::from_untyped_uuid(instance.identity.id);
 
-    // Simulate and wait for instance to be fully running with sled_id assigned
+    // Simulate and wait for the instance to be fully running. The first
+    // `wait_for_member_state` call for "Joined" below also waits for sled
+    // assignment before asserting.
     let nexus = &cptestctx.server.server_context().nexus;
     instance_simulate(nexus, &instance_id).await;
     instance_wait_for_state(client, instance_id, InstanceState::Running).await;
-    wait_for_instance_sled_assignment(cptestctx, &instance_id).await;
 
     // Case: List instance multicast groups (should be empty initially)
     let instance_groups_url = format!(
@@ -652,6 +653,9 @@ async fn test_instance_deletion_removes_multicast_memberships(
     assert_eq!(members.len(), 1, "Instance should be a member of the group");
     assert_eq!(members[0].instance_id, instance.identity.id);
 
+    // Verify MRIB route exists while group is active with a joined member.
+    assert_mrib_route_exists(cptestctx, multicast_ip).await;
+
     // Case: Instance deletion should clean up multicast memberships
     cleanup_instances(cptestctx, client, project_name, &[instance_name]).await;
 
@@ -666,6 +670,9 @@ async fn test_instance_deletion_removes_multicast_memberships(
     // Wait for reconciler to clean up DPD state (activates reconciler repeatedly until DPD confirms deletion)
     wait_for_group_deleted_from_dpd(cptestctx, multicast_ip).await;
+
+    // Verify MRIB route was withdrawn after group deletion.
+    assert_mrib_route_absent(cptestctx, multicast_ip).await;
 }
 
 /// Test that the multicast_ip field is correctly populated in MulticastGroupMember API responses.
@@ -1067,6 +1074,42 @@ async fn test_ssm_source_ip_behavior(cptestctx: &ControlPlaneTestContext) {
         "DPD external group sources should be union of all member sources"
     );
 
+    // Case: (S,G) source-set narrowing on member detach.
+    // As specific-source members leave, the DPD union must shrink to the
+    // remaining members' sources. SSM groups never wildcard, so the DPD
+    // source list stays `Some(...)` throughout.
+    multicast_group_detach(
+        client,
+        project_name,
+        instance_names[2],
+        ssm_union_ip,
+    )
+    .await;
+    let mut expected_after_inst3 = vec![source1, source2];
+    expected_after_inst3.sort();
+    wait_for_dpd_source_filter(
+        cptestctx,
+        multicast_ip,
+        Some(expected_after_inst3),
+        "DPD union should shrink to {source1, source2} after inst-3 detaches",
+    )
+    .await;
+
+    multicast_group_detach(
+        client,
+        project_name,
+        instance_names[1],
+        ssm_union_ip,
+    )
+    .await;
+    wait_for_dpd_source_filter(
+        cptestctx,
+        multicast_ip,
+        Some(vec![source1]),
+        "DPD union should shrink to {source1} after inst-2 detaches",
+    )
+    .await;
+
     // Case: IPv6 source with IPv4 group should fail
     let ipv4_ssm_ip = "232.1.0.20";
     let ipv6_source: IpAddr = "2001:db8::1".parse().unwrap();
@@ -1226,6 +1269,244 @@ async fn test_ssm_source_ip_behavior(cptestctx: &ControlPlaneTestContext) {
     }
 }
 
+/// Read the DPD external group source filter as a sorted
+/// [`Option<Vec<IpAddr>>`].
+///
+/// `None` indicates DPD-level source filtering is disabled (the (*,G) case
+/// produced by [`compute_sources_for_dpd`] when any ASM member has empty
+/// `source_ips`). `Some(sorted)` is the (S,G) union written by Nexus.
+///
+/// `IpSrc::Any` entries are filtered out automatically. Nexus only emits
+/// `IpSrc::Exact` today.
+async fn dpd_external_source_filter(
+    cptestctx: &ControlPlaneTestContext,
+    multicast_ip: IpAddr,
+) -> Option<Vec<IpAddr>> {
+    let dpd_response = dpd_client(cptestctx)
+        .multicast_group_get(&multicast_ip)
+        .await
+        .expect("DPD should have external group")
+        .into_inner();
+    match dpd_response {
+        dpd_types::MulticastGroupResponse::External { sources, .. } => sources
+            .map(|srcs| {
+                let mut ips: Vec<IpAddr> = srcs
+                    .iter()
+                    .filter_map(|src| match src {
+                        dpd_types::IpSrc::Exact(ip) => Some(*ip),
+                        dpd_types::IpSrc::Any => None,
+                    })
+                    .collect();
+                ips.sort();
+                ips
+            }),
+        dpd_types::MulticastGroupResponse::Underlay { .. } => {
+            panic!("Expected External group from DPD, got Underlay")
+        }
+    }
+}
+
+/// Activate the multicast reconciler and poll DPD until the external group's
+/// source filter matches `expected`. Use this after an attach/detach where
+/// the test asserts on DPD's converged (S,G) / (*,G) state.
+async fn wait_for_dpd_source_filter(
+    cptestctx: &ControlPlaneTestContext,
+    multicast_ip: IpAddr,
+    expected: Option<Vec<IpAddr>>,
+    msg: &str,
+) {
+    activate_then_wait_for_condition(
+        &cptestctx.lockstep_client,
+        || async {
+            let actual =
+                dpd_external_source_filter(cptestctx, multicast_ip).await;
+            if actual == expected {
+                Ok(())
+            } else {
+                Err(CondCheckError::<()>::NotYet)
+            }
+        },
+        &Duration::from_millis(100),
+        &Duration::from_secs(30),
+    )
+    .await
+    .unwrap_or_else(|err| {
+        panic!(
+            "{msg}: expected {expected:?}, last observed mismatch ({err:?})",
+        )
+    });
+}
+
+/// Test ASM source-filter transitions across (*,G) and (S,G).
+///
+/// Source filtering for an ASM group is the union of all members' source IPs,
+/// unless any member has empty `source_ips`, in which case the switch-level
+/// filter is disabled (DPD `sources = None`, i.e. (*,G)). This test exercises
+/// the transitions on a single group:
+///
+/// 1. Specific-only union grows. Members join with disjoint sources.
+/// 2. Widen (S,G) -> (*,G). An any-source member joins, and DPD `sources`
+///    becomes `None`.
+/// 3. Two-any-source aggregation. A second any-source member joins, then the
+///    first leaves. DPD must remain `None` to prove `has_any_source_member`
+///    is OR-aggregated across live members rather than a stuck flag.
+/// 4. Narrow (*,G) -> (S,G). The last any-source member detaches, and DPD
+///    `sources` returns to the remaining specific-source union.
+/// 5. Specific union shrinks. A specific-source member detaches, and the DPD
+///    union contracts.
+#[nexus_test]
+async fn test_asm_source_filter_transitions(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let client = &cptestctx.external_client;
+    let project_name = "asm-src-transitions";
+
+    ops::join3(
+        create_project(&client, project_name),
+        create_default_ip_pools(&client),
+        create_multicast_ip_pool_with_range(
+            &client,
+            "asm-transitions-pool",
+            (224, 2, 0, 0),
+            (224, 2, 0, 100),
+        ),
+    )
+    .await;
+
+    // Instance names encode each member's role in the test sequence. The
+    // `specific-` prefix denotes a member that subscribes to a single source
+    // (an (S,G) contributor). The `any-source-` prefix denotes a member that
+    // subscribes to all sources, which forces the group's switch-level filter
+    // off (turning the group into (*,G)). The trailing letter or digit
+    // distinguishes peers that play the same role: two specific-source
+    // members exercise union growth and shrink, and two any-source members
+    // exercise OR-aggregation of `has_any_source_member` across live members.
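// Aside: a sketch of the union rule the five steps below exercise. The
// member representation is illustrative; the real computation is
// `compute_sources_for_dpd`, referenced in the helper docs above.
fn sources_for_dpd_sketch(
    member_sources: &[Vec<IpAddr>], // one source list per live member
) -> Option<Vec<IpAddr>> {
    // Any member with an empty source list is any-source: switch-level
    // filtering is disabled and the group is programmed as (*,G).
    if member_sources.iter().any(|sources| sources.is_empty()) {
        return None;
    }
    // Otherwise DPD gets the sorted, deduplicated (S,G) union.
    let mut union: Vec<IpAddr> =
        member_sources.iter().flatten().copied().collect();
    union.sort();
    union.dedup();
    Some(union)
}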
+ let specific_a = "asm-specific-source-a"; + let specific_b = "asm-specific-source-b"; + let any_source_1 = "asm-any-source-1"; + let any_source_2 = "asm-any-source-2"; + let instance_names = [specific_a, specific_b, any_source_1, any_source_2]; + for name in &instance_names { + create_instance(client, project_name, name).await; + } + + let group_ip_str = "224.2.0.10"; + let group_ip: IpAddr = group_ip_str.parse().unwrap(); + let source_a = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)); + let source_b = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)); + + // Step 1: specific-source-a joins. DPD union is {source_a}. + multicast_group_attach_with_sources( + cptestctx, + project_name, + specific_a, + group_ip_str, + Some(vec![source_a]), + ) + .await; + wait_for_group_active(client, group_ip_str).await; + wait_for_dpd_source_filter( + cptestctx, + group_ip, + Some(vec![source_a]), + "DPD should program (S,G) with the only specific source", + ) + .await; + + // specific-source-b joins. DPD union grows to {source_a, source_b}. + multicast_group_attach_with_sources( + cptestctx, + project_name, + specific_b, + group_ip_str, + Some(vec![source_b]), + ) + .await; + let mut expected_specific = vec![source_a, source_b]; + expected_specific.sort(); + wait_for_dpd_source_filter( + cptestctx, + group_ip, + Some(expected_specific.clone()), + "DPD union should grow to include both specific sources", + ) + .await; + + // Step 2: any-source-1 joins. DPD widens (S,G) to (*,G). + multicast_group_attach_with_sources( + cptestctx, + project_name, + any_source_1, + group_ip_str, + None, + ) + .await; + wait_for_dpd_source_filter( + cptestctx, + group_ip, + None, + "DPD source filter should be disabled once any member is any-source", + ) + .await; + + // Step 3: any-source-2 joins, then any-source-1 detaches. + // `has_any_source_member` must remain true through this swap. If it were + // a stuck "ever-set" flag, removing any-source-1 would still keep DPD as + // `None` for the wrong reason. Conversely, if it were last-writer-wins, + // removing the original any-source member would incorrectly narrow back + // to (S,G). + multicast_group_attach_with_sources( + cptestctx, + project_name, + any_source_2, + group_ip_str, + None, + ) + .await; + wait_for_dpd_source_filter( + cptestctx, + group_ip, + None, + "DPD should stay disabled with two any-source members", + ) + .await; + + multicast_group_detach(client, project_name, any_source_1, group_ip_str) + .await; + wait_for_dpd_source_filter( + cptestctx, + group_ip, + None, + "DPD should stay disabled while any-source-2 is still any-source", + ) + .await; + + // Step 4: any-source-2 detaches. DPD narrows (*,G) to (S,G) back to the + // remaining specific-source union. + multicast_group_detach(client, project_name, any_source_2, group_ip_str) + .await; + wait_for_dpd_source_filter( + cptestctx, + group_ip, + Some(expected_specific), + "DPD source filter should re-enable with the remaining specific union", + ) + .await; + + // Step 5: specific-source-b detaches. DPD union contracts to {source_a}. + multicast_group_detach(client, project_name, specific_b, group_ip_str) + .await; + wait_for_dpd_source_filter( + cptestctx, + group_ip, + Some(vec![source_a]), + "DPD union should contract after the second specific member leaves", + ) + .await; + + cleanup_instances(cptestctx, client, project_name, &instance_names).await; + wait_for_group_deleted(cptestctx, group_ip_str).await; +} + /// Test default pool behavior when no pool is specified on member join. 
/// /// When a member joins a group without specifying a pool: diff --git a/nexus/tests/integration_tests/multicast/instances.rs b/nexus/tests/integration_tests/multicast/instances.rs index 245e284248e..139cc1487fb 100644 --- a/nexus/tests/integration_tests/multicast/instances.rs +++ b/nexus/tests/integration_tests/multicast/instances.rs @@ -21,10 +21,12 @@ //! - Instance reconfigure adding SSM: Must specify sources for new SSM groups //! - SSM sources are per-member (S,G subscription model) +use std::collections::{BTreeMap, BTreeSet}; use std::net::IpAddr; use http::{Method, StatusCode}; +use nexus_db_model::MulticastGroupMemberState; use nexus_db_queries::context::OpContext; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::resource_helpers::{ @@ -136,7 +138,7 @@ async fn test_multicast_lifecycle(cptestctx: &ControlPlaneTestContext) { "group-lifecycle-1", instances[0].identity.id, // Instance is stopped, so should be "Left" - nexus_db_model::MulticastGroupMemberState::Left, + MulticastGroupMemberState::Left, ) .await; @@ -162,16 +164,15 @@ async fn test_multicast_lifecycle(cptestctx: &ControlPlaneTestContext) { ) .await; - // Verify both instances are attached to group-lifecycle-2 - for i in 0..2 { - wait_for_member_state( - cptestctx, - "group-lifecycle-2", - instances[i + 1].identity.id, - nexus_db_model::MulticastGroupMemberState::Left, // Stopped instances - ) + // Verify both instances are attached to group-lifecycle-2 (Left state + // because the instances are stopped). + let expected_left: Vec<_> = (0..2) + .map(|i| { + (instances[i + 1].identity.id, MulticastGroupMemberState::Left) + }) + .collect(); + wait_for_members_state(cptestctx, "group-lifecycle-2", &expected_left) .await; - } // Multi-group attachment (instance to multiple groups) // Attach instance-multi-groups to group-lifecycle-3 (implicitly creates the group) @@ -204,7 +205,7 @@ async fn test_multicast_lifecycle(cptestctx: &ControlPlaneTestContext) { cptestctx, group_name, instances[3].identity.id, - nexus_db_model::MulticastGroupMemberState::Left, // Stopped instance + MulticastGroupMemberState::Left, // Stopped instance ) .await; } @@ -377,7 +378,7 @@ async fn test_multicast_group_attach_conflicts( } #[nexus_test] -async fn test_multicast_group_attach_limits( +async fn test_multicast_group_attach_multiple( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; @@ -390,14 +391,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Group names for implicit groups (implicitly created when first member joins) - let group_names = [ - "limit-test-group-0", - "limit-test-group-1", - "limit-test-group-2", - "limit-test-group-3", - "limit-test-group-4", - ]; + let group_names = + ["limit-test-group-0", "limit-test-group-1", "limit-test-group-2"]; // Create instance first (groups will be implicitly created when attached) let instance = instance_for_multicast_groups( @@ -409,8 +404,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Attach instance to 3 groups (implicitly creates each group) - let multicast_group_names = &group_names[0..3]; + // Attach instance to multiple groups (implicitly creates each group) + let multicast_group_names = &group_names; for group_name in multicast_group_names { multicast_group_attach( cptestctx, @@ -431,7 +426,7 @@ async fn test_multicast_group_attach_limits( cptestctx, group_name, instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, + MulticastGroupMemberState::Left, ) 
.await; } @@ -530,17 +525,13 @@ async fn test_multicast_concurrent_operations( ) .await; - // Verify all members reached correct state despite concurrent operations - for instance in instances.iter() { - wait_for_member_state( - cptestctx, - "concurrent-test-group", - instance.identity.id, - // create_instance() starts instances, so they should be Joined - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - } + // Verify all members reached correct state despite concurrent operations. + // create_instance() starts instances, so they should all be Joined. + let expected: Vec<_> = instances + .iter() + .map(|i| (i.identity.id, MulticastGroupMemberState::Joined)) + .collect(); + wait_for_members_state(cptestctx, "concurrent-test-group", &expected).await; // Verify final member count matches expected (all 4 instances) let members = @@ -585,29 +576,23 @@ async fn test_multicast_concurrent_operations( // Wait for final state to be consistent (should still have 2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; - // Concurrent operations during reconciler processing - - // Start a member addition and immediately follow with another operation - // This tests handling of operations that arrive while reconciler is processing - let rapid_ops_future = async { - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "concurrent-instance-3", - "concurrent-test-group", - ) - .await; - // Don't wait for reconciler; immediately do another operation - multicast_group_detach( - client, - PROJECT_NAME, - "concurrent-instance-4", - "concurrent-test-group", - ) - .await; - }; - - rapid_ops_future.await; + // Back-to-back operations without waiting for reconciler between them. + // Tests that the reconciler handles state changes that arrive while it + // is still processing a previous batch. + multicast_group_attach( + cptestctx, + PROJECT_NAME, + "concurrent-instance-3", + "concurrent-test-group", + ) + .await; + multicast_group_detach( + client, + PROJECT_NAME, + "concurrent-instance-4", + "concurrent-test-group", + ) + .await; // Wait for system to reach consistent final state (should have 2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; @@ -616,16 +601,12 @@ async fn test_multicast_concurrent_operations( let post_rapid_members = list_multicast_group_members(client, "concurrent-test-group").await; - // Wait for all remaining members to reach "Joined" state - for member in &post_rapid_members { - wait_for_member_state( - cptestctx, - "concurrent-test-group", - member.instance_id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - } + // Wait for all remaining members to reach "Joined" state. 
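// Aside: the `wait_for_members_state` helper used just below is not shown
// in this diff; a plausible shape, assuming the API member view renders
// state as a string and `MulticastGroupMemberState` implements `Display`:
async fn wait_for_members_state_sketch(
    cptestctx: &ControlPlaneTestContext,
    group: &str,
    expected: &[(Uuid, MulticastGroupMemberState)],
) {
    let client = &cptestctx.external_client;
    wait_for_condition(
        || async {
            let members = list_multicast_group_members(client, group).await;
            // Every expected (instance, state) pair must be observed.
            let all_match = expected.iter().all(|(id, state)| {
                members.iter().any(|m| {
                    m.instance_id == *id && m.state == state.to_string()
                })
            });
            if all_match {
                Ok(())
            } else {
                Err(CondCheckError::<()>::NotYet)
            }
        },
        &POLL_INTERVAL,
        &MULTICAST_OPERATION_TIMEOUT,
    )
    .await
    .expect("members did not converge to the expected states");
}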
+ let expected: Vec<_> = post_rapid_members + .iter() + .map(|m| (m.instance_id, MulticastGroupMemberState::Joined)) + .collect(); + wait_for_members_state(cptestctx, "concurrent-test-group", &expected).await; // Cleanup and delete instances (group is implicitly deleted when last member removed) cleanup_instances(cptestctx, client, PROJECT_NAME, &instance_names).await; @@ -698,7 +679,7 @@ async fn test_multicast_member_cleanup_instance_never_started( cptestctx, group_name, instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, + MulticastGroupMemberState::Left, ) .await; @@ -820,16 +801,15 @@ async fn test_multicast_migration_scenarios( cptestctx, group1_name, instance1.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, + MulticastGroupMemberState::Joined, ) .await; - // Verify DPD before migration - let dpd_client = nexus_test_utils::dpd_client(cptestctx); - dpd_client - .multicast_group_get(&multicast_ip) - .await - .expect("Group should exist in DPD before migration"); + for (slot, dpd) in nexus_test_utils::dpd_clients_by_switch(cptestctx) { + dpd.multicast_group_get(&multicast_ip).await.unwrap_or_else(|e| { + panic!("{slot:?}: group should exist in DPD before migration: {e}") + }); + } // Migrate instance let source_sled = nexus @@ -879,22 +859,106 @@ async fn test_multicast_migration_scenarios( .sled_id; assert_eq!(post_sled, target_sled, "Instance should be on target sled"); - wait_for_multicast_reconciler(lockstep_client).await; wait_for_member_state( cptestctx, group1_name, instance1.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, + MulticastGroupMemberState::Joined, ) .await; verify_inventory_based_port_mapping(cptestctx, &instance1_id) .await .expect("Port mapping should be updated"); - dpd_client - .multicast_group_get(&multicast_ip) + for (slot, dpd) in nexus_test_utils::dpd_clients_by_switch(cptestctx) { + dpd.multicast_group_get(&multicast_ip).await.unwrap_or_else(|e| { + panic!("{slot:?}: group should exist in DPD after migration: {e}") + }); + } + + // Verify sled-agent state after migration: the target sled should + // have the VMM subscription and M2P mapping. The source sled should + // not have any subscription for the old propolis. + { + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => { + panic!("Expected IPv6 underlay address, got {other}") + } + }; + + // Target sled should have the VMM subscription after the + // reconciler pushes it via verify_members. Poll because the + // reconciler may still be propagating state to the sled-agent. 
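// Aside: `activate_then_wait_for_condition`, used just below, is assumed
// here to mean "kick the reconciler before each poll", roughly:
//
//     loop {
//         activate_multicast_reconciler(&cptestctx.lockstep_client).await;
//         if check().await.is_ok() { break; }  // or give up at the timeout
//         tokio::time::sleep(POLL_INTERVAL).await;
//     }
//
// so every check observes a fresh reconciler pass rather than waiting on
// the background task's own activation period.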
+ let target_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == target_sled) + .unwrap() + .sled_agent(); + + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let groups = + target_agent.instance_multicast_groups.lock().unwrap(); + let has_sub = groups.get(&instance1_id).map_or(false, |g| { + g.iter().any(|m| m.group_ip == multicast_ip) + }); + if has_sub { Ok(()) } else { Err(CondCheckError::NotYet::<()>) } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) .await - .expect("Group should exist in DPD after migration"); + .expect( + "Target sled should have instance subscription after migration", + ); + + // Target sled should have M2P mapping. + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let m2p = target_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) + .await + .expect("Target sled should have M2P mapping after migration"); + + // TODO: assert the source sled no longer holds a multicast + // subscription for the old propolis_id. On real hardware, + // VMM teardown (release_opte_ports -> PortTicket::release_inner) + // clears it. The sim does not model per-propolis cleanup on + // unregister for any of the networking maps (external_ips, + // attached_subnets, multicast_groups). + } // Case: Concurrent migrations @@ -911,7 +975,9 @@ async fn test_multicast_migration_scenarios( group2_name, ) .await; + wait_for_group_active(client, group2_name).await; + multicast_group_attach( cptestctx, project_name, @@ -925,21 +991,18 @@ async fn test_multicast_migration_scenarios( .map(|i| InstanceUuid::from_untyped_uuid(i.identity.id)) .collect(); - // Start all instances via simulation - for &instance_id in &instance_ids { + // Start all instances via simulation in parallel. + ops::join_all(instance_ids.iter().map(|&instance_id| async move { instance_simulate(nexus, &instance_id).await; instance_wait_for_state(client, instance_id, InstanceState::Running) .await; - } - for inst in &instances { - wait_for_member_state( - cptestctx, - group2_name, - inst.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - } + })) + .await; + let expected_joined: Vec<_> = instances + .iter() + .map(|inst| (inst.identity.id, MulticastGroupMemberState::Joined)) + .collect(); + wait_for_members_state(cptestctx, group2_name, &expected_joined).await; // Get source/target sleds for each instance let mut source_sleds = Vec::new(); @@ -974,30 +1037,39 @@ async fn test_multicast_migration_scenarios( r.expect("Migration should initiate"); } - // Complete all migrations - for (i, &instance_id) in instance_ids.iter().enumerate() { - let info = nexus - .active_instance_info(&instance_id, None) - .await - .unwrap() - .unwrap(); - vmm_simulate_on_sled( - cptestctx, - nexus, - source_sleds[i], - info.propolis_id, - ) - .await; - vmm_simulate_on_sled( - cptestctx, - nexus, - target_sleds[i], - info.dst_propolis_id.unwrap(), - ) - .await; - instance_wait_for_state(client, instance_id, InstanceState::Running) + // Complete all migrations in parallel. 
+ ops::join_all(instance_ids.iter().enumerate().map(|(i, &instance_id)| { + let source_sled = source_sleds[i]; + let target_sled = target_sleds[i]; + async move { + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .unwrap(); + vmm_simulate_on_sled( + cptestctx, + nexus, + source_sled, + info.propolis_id, + ) .await; - } + vmm_simulate_on_sled( + cptestctx, + nexus, + target_sled, + info.dst_propolis_id.unwrap(), + ) + .await; + instance_wait_for_state( + client, + instance_id, + InstanceState::Running, + ) + .await; + } + })) + .await; // Verify all on target sleds for (i, &instance_id) in instance_ids.iter().enumerate() { @@ -1015,8 +1087,6 @@ async fn test_multicast_migration_scenarios( ); } - wait_for_multicast_reconciler(lockstep_client).await; - let post_members = list_multicast_group_members(client, group2_name).await; assert_eq!( post_members.len(), @@ -1024,15 +1094,12 @@ async fn test_multicast_migration_scenarios( "Both members should persist after concurrent migration" ); - for inst in &instances { - wait_for_member_state( - cptestctx, - group2_name, - inst.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) + let post_migration_joined: Vec<_> = instances + .iter() + .map(|inst| (inst.identity.id, MulticastGroupMemberState::Joined)) + .collect(); + wait_for_members_state(cptestctx, group2_name, &post_migration_joined) .await; - } // Cleanup cleanup_instances( @@ -1630,18 +1697,31 @@ async fn test_ssm_without_sources_fails_create_and_reconfigure( /// /// This tests the invariant that `multicast_group_member_delete_by_group_and_instance` /// filters by both `group_id` and `instance_id`, not just `group_id`. This is -/// important for saga undo correctness: if Instance B's create saga fails after -/// joining a group, the undo must not affect Instance A's existing membership +/// important for saga undo correctness: if instance B's create saga fails after +/// joining a group, the undo must not affect instance A's existing membership /// in the same group. -#[nexus_test] +/// +/// This also verifies the shared-sled underlay invariant. Underlay membership +/// is port-scoped, not member-scoped, as members on the same sled share a +/// single rear-port entry in the underlay group. To exercise this, the test +/// forces a multi-sled layout via migration: +/// +/// - instances A and B sit on the same sled (sharing one rear port). +/// - instance C sits on the other sled (its own rear port). +/// +/// Deleting instance B must leave the rear-port set in the underlay group +/// unchanged, since A still needs the shared port and C still needs its own. 
+#[nexus_test(extra_sled_agents = 1)] async fn test_instance_delete_preserves_other_memberships( cptestctx: &ControlPlaneTestContext, ) { + ensure_multicast_test_ready(cptestctx).await; + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; let project_name = "delete-preserve-project"; let group_name = "delete-preserve-group"; - // Setup: create project and multicast pool ops::join3( create_default_ip_pools(client), create_project(client, project_name), @@ -1654,53 +1734,194 @@ async fn test_instance_delete_preserves_other_memberships( ) .await; - // Create Instance A and join it to the multicast group - create_instance(client, project_name, "instance-a").await; - multicast_group_attach(cptestctx, project_name, "instance-a", group_name) - .await; + let available_sleds = + [cptestctx.first_sled_id(), cptestctx.second_sled_id()]; + + // Bring up A, B, C as "Running" with the group attached. + let instances = ["instance-a", "instance-b", "instance-c"].iter().map( + |name| async move { + let inst = instance_for_multicast_groups( + cptestctx, + project_name, + name, + true, + &[group_name], + ) + .await; + let id = InstanceUuid::from_untyped_uuid(inst.identity.id); + instance_simulate(nexus, &id).await; + instance_wait_for_state(client, id, InstanceState::Running).await; + (inst, id) + }, + ); + let started: Vec<(Instance, InstanceUuid)> = ops::join_all(instances).await; + let (instance_a, instance_a_uuid) = &started[0]; + let (_instance_b, instance_b_uuid) = &started[1]; + let (instance_c, instance_c_uuid) = &started[2]; + wait_for_group_active(client, group_name).await; + let initial_joined: Vec<_> = started + .iter() + .map(|(inst, _)| (inst.identity.id, MulticastGroupMemberState::Joined)) + .collect(); + wait_for_members_state(cptestctx, group_name, &initial_joined).await; + + // Pick a "shared" sled (where A and B will live) and a "solo" sled + // (where C will live), based on A's current placement. + let shared_sled = nexus + .active_instance_info(instance_a_uuid, None) + .await + .unwrap() + .expect("instance A should be on a sled") + .sled_id; + let solo_sled = *available_sleds + .iter() + .find(|&&s| s != shared_sled) + .expect("two distinct sleds expected"); + + migrate_instance_to(cptestctx, *instance_b_uuid, shared_sled).await; + migrate_instance_to(cptestctx, *instance_c_uuid, solo_sled).await; + + // After migration, the reconciler must observe each member's new + // sled_id before the rear-port snapshot. We explicitly + // poll until the DB row matches the post-migration placement for + // every member. 
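// Aside: `wait_for_member_sled_ids`, called just below, is likewise not
// shown in this diff. It is assumed to mirror the `wait_for_members_state`
// sketch earlier, with the predicate comparing each member's recorded sled
// placement, e.g.
//     m.instance_id == *id && m.sled_id == Some(sled.into_untyped_uuid())
// instead of the member-state string.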
+ let expected_placement = [ + (instance_a.identity.id, shared_sled), + (instance_b_uuid.into_untyped_uuid(), shared_sled), + (instance_c.identity.id, solo_sled), + ]; + wait_for_member_sled_ids(cptestctx, group_name, &expected_placement).await; - // Verify Instance A is a member let members_before = list_multicast_group_members(client, group_name).await; - assert_eq!(members_before.len(), 1, "Instance A should be a member"); - let instance_a_id = members_before[0].instance_id; + assert_eq!( + members_before.len(), + 3, + "all three instances should be members" + ); - // Create Instance B and join it to the same group - create_instance(client, project_name, "instance-b").await; - multicast_group_attach(cptestctx, project_name, "instance-b", group_name) - .await; + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + let underlay_admin_ip = + fetch_underlay_admin_ip(cptestctx, multicast_ip).await; + + // Each switch independently programs the full set of rear ports for the + // group's underlay members. Read every switch's underlay group so a + // missing-on-one-switch fanout regression is caught. + let collect_rear_ports_by_switch = || { + let admin_ip = underlay_admin_ip.clone(); + let dpd_clients = nexus_test_utils::dpd_clients_by_switch(cptestctx); + async move { + let mut by_switch: BTreeMap<_, BTreeSet<_>> = BTreeMap::new(); + for (slot, dpd) in dpd_clients { + let resp = dpd + .multicast_group_get_underlay(&admin_ip) + .await + .expect("underlay group should exist in DPD") + .into_inner(); + // Key on (port_id, link_id) so breakout-link members are + // distinguished and any link/direction drift would be + // visible as a set difference. + let ports: BTreeSet<_> = resp + .members + .into_iter() + .filter(|m| { + matches!(m.port_id, dpd_client::types::PortId::Rear(_)) + && m.direction + == dpd_client::types::Direction::Underlay + }) + .map(|m| (m.port_id, m.link_id)) + .collect(); + by_switch.insert(slot, ports); + } + by_switch + } + }; - // Verify both instances are now members - let members_with_b = list_multicast_group_members(client, group_name).await; - assert_eq!(members_with_b.len(), 2, "Both instances should be members"); + // The DB sled_id update precedes DPD programming, and switches are + // updated independently per pass. Poll until every switch has both + // rear-port entries (shared sled + solo sled) before snapshotting. + let rear_ports_before = wait_for_condition( + || async { + let by_switch = collect_rear_ports_by_switch().await; + if by_switch.values().all(|ports| ports.len() == 2) { + Ok(by_switch) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "underlay group did not converge to one rear-port entry per \ + occupied sled (shared by A+B, plus C's own) on every switch: \ + {e:?}" + ) + }); - // Delete Instance B, only removing B's membership, not A's + // Delete instance B. A still occupies the shared sled, so the + // shared rear port must remain; C's separate rear port must also + // remain. 
cleanup_instances(cptestctx, client, project_name, &["instance-b"]).await; - // Verify that Instance A's membership must still exist - let members_after_b_delete = + let members_after_b = list_multicast_group_members(client, group_name).await; - - assert_eq!( - members_after_b_delete.len(), - 1, - "Instance A's membership should survive Instance B's deletion" + assert_eq!(members_after_b.len(), 2, "A and C must survive B's deletion"); + let remaining_instance_ids: BTreeSet<_> = + members_after_b.iter().map(|m| m.instance_id).collect(); + assert!( + remaining_instance_ids.contains(&instance_a.identity.id), + "A's membership must remain" ); - assert_eq!( - members_after_b_delete[0].instance_id, instance_a_id, - "The remaining member should be Instance A" + assert!( + remaining_instance_ids.contains(&instance_c.identity.id), + "C's membership must remain" ); - // Verify the group is still active (not deleted due to last member leaving) let group = get_multicast_group(client, group_name).await; - assert_eq!( - group.state, "Active", - "Group should still be active since Instance A is still a member" - ); + assert_eq!(group.state, "Active"); + + // Per-switch DPD updates lag the member-list change. Poll until + // every switch returns to its pre-delete state: A still on the + // shared sled, C on its own. + let rear_ports_after = wait_for_condition( + || async { + let by_switch = collect_rear_ports_by_switch().await; + if by_switch == rear_ports_before { + Ok(by_switch) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "per-switch rear-port set diverged from pre-deletion snapshot \ + (expected {rear_ports_before:?}): {e:?}" + ) + }); + assert_eq!(rear_ports_after, rear_ports_before); - // Cleanup: delete Instance A, which should trigger group deletion - cleanup_instances(cptestctx, client, project_name, &["instance-a"]).await; + assert_mrib_route_exists(cptestctx, multicast_ip).await; + + // Cleanup: deleting A and C drops both rear ports and tears down + // the group. + cleanup_instances( + cptestctx, + client, + project_name, + &["instance-a", "instance-c"], + ) + .await; wait_for_group_deleted(cptestctx, group_name).await; + wait_for_group_deleted_from_dpd(cptestctx, multicast_ip).await; + assert_mrib_route_absent(cptestctx, multicast_ip).await; } /// Test IPv6 multicast group lifecycle: create, start, stop, delete. 
@@ -1785,16 +2006,14 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { .expect("Start should succeed"); instance_simulate(nexus, &instance_id).await; instance_wait_for_state(client, instance_id, InstanceState::Running).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_joined = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, + MulticastGroupMemberState::Joined, ) .await; - assert_eq!(member_joined.state, "Joined"); // Stop the instance - member should transition to "Left" let stop_url = @@ -1811,16 +2030,14 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { instance_simulate(nexus, &instance_id).await; instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_left = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, + MulticastGroupMemberState::Left, ) .await; - assert_eq!(member_left.state, "Left"); // Delete the instance - this should delete the group since it's the only member cleanup_instances(cptestctx, client, project_name, &["ipv6-instance"]) @@ -1884,7 +2101,7 @@ async fn test_group_with_all_members_left(cptestctx: &ControlPlaneTestContext) { cptestctx, group_name, instance1.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, + MulticastGroupMemberState::Joined, ) .await; @@ -1915,7 +2132,7 @@ async fn test_group_with_all_members_left(cptestctx: &ControlPlaneTestContext) { cptestctx, group_name, instance2.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, + MulticastGroupMemberState::Joined, ) .await; @@ -1937,21 +2154,14 @@ async fn test_group_with_all_members_left(cptestctx: &ControlPlaneTestContext) { instance_wait_for_state(client, id, InstanceState::Stopped).await; } - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify both members are "Left" - wait_for_member_state( + wait_for_members_state( cptestctx, group_name, - instance1.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, - ) - .await; - wait_for_member_state( - cptestctx, - group_name, - instance2.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, + &[ + (instance1.identity.id, MulticastGroupMemberState::Left), + (instance2.identity.id, MulticastGroupMemberState::Left), + ], ) .await; @@ -1977,13 +2187,12 @@ async fn test_group_with_all_members_left(cptestctx: &ControlPlaneTestContext) { instance_simulate(nexus, &id1).await; instance_wait_for_state(client, id1, InstanceState::Running).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; wait_for_member_state( cptestctx, group_name, instance1.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, + MulticastGroupMemberState::Joined, ) .await; diff --git a/nexus/tests/integration_tests/multicast/mod.rs b/nexus/tests/integration_tests/multicast/mod.rs index cc3c947008c..8daf3bb12de 100644 --- a/nexus/tests/integration_tests/multicast/mod.rs +++ b/nexus/tests/integration_tests/multicast/mod.rs @@ -16,8 +16,7 @@ use std::future::Future; use std::net::IpAddr; -use std::sync::{Arc, Mutex}; -use std::time::{Duration, Instant}; +use std::time::Duration; use dropshot::test_util::ClientTestContext; use http::{Method, StatusCode}; @@ -42,13 +41,16 @@ use nexus_types::external_api::multicast::{ 
MulticastGroupJoinSpec, MulticastGroupMember, }; use nexus_types::identity::{Asset, Resource}; +use nexus_types::internal_api::params::InstanceMigrateRequest; use omicron_common::api::external::{ ByteCount, DataPageParams, Hostname, IdentityMetadataCreateParams, Instance, InstanceCpuCount, InstanceState, }; use omicron_nexus::TestInterfaces; use omicron_test_utils::dev::poll::{self, CondCheckError, wait_for_condition}; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, MulticastGroupUuid}; +use omicron_uuid_kinds::{ + GenericUuid, InstanceUuid, MulticastGroupUuid, SledUuid, +}; use crate::integration_tests::instances as instance_helpers; use sled_agent_client::TestInterfaces as SledAgentTestInterfaces; @@ -59,7 +61,6 @@ pub(crate) type ControlPlaneTestContext = mod api; mod authorization; -mod cache_invalidation; mod enablement; mod failures; mod groups; @@ -69,6 +70,7 @@ mod pool_selection; // Timeout constants for test operations const POLL_INTERVAL: Duration = Duration::from_millis(50); +const POLL_TIMEOUT: Duration = Duration::from_secs(30); const MULTICAST_OPERATION_TIMEOUT: Duration = Duration::from_secs(120); /// Generic helper for PUT upsert requests that return 201 Created. @@ -211,6 +213,11 @@ pub(crate) async fn create_multicast_ip_pool_v6( pool } +/// The reconciler can take longer than the default 10s timeout under +/// parallel test load, especially after the CRDB graceful-shutdown +/// change (eb8ae2f8f). 30s matches other heavy background task timeouts. +const RECONCILER_ACTIVATION_TIMEOUT: Duration = Duration::from_secs(30); + /// Waits for the multicast group reconciler to complete. /// /// This wraps wait_background_task with the correct task name. @@ -231,35 +238,24 @@ pub(crate) async fn wait_for_multicast_reconciler( pub(crate) async fn activate_multicast_reconciler( lockstep_client: &ClientTestContext, ) -> nexus_lockstep_client::types::BackgroundTask { - nexus_test_utils::background::activate_background_task( + nexus_test_utils::background::activate_background_task_with_timeout( lockstep_client, "multicast_reconciler", + RECONCILER_ACTIVATION_TIMEOUT, ) .await } -/// Activates the inventory loader and waits for it to complete. +/// Activate the multicast reconciler once, then poll `condition` until it +/// holds (or `timeout` elapses). /// -/// This ensures the watch channel has the latest inventory collection from the database. -pub(crate) async fn activate_inventory_loader( - lockstep_client: &ClientTestContext, -) -> nexus_lockstep_client::types::BackgroundTask { - nexus_test_utils::background::activate_background_task( - lockstep_client, - "inventory_loader", - ) - .await -} - -/// Wait for a condition to be true, activating the reconciler periodically. -/// -/// This is like `wait_for_condition` but activates the multicast reconciler -/// periodically (not on every poll) to drive state changes. We activate the -/// reconciler every 500ms. -/// -/// Useful for tests that need to wait for reconciler-driven state changes -/// (e.g., member state transitions). -pub(crate) async fn wait_for_condition_with_reconciler( +/// For tests that expect convergence in a single reconciler pass. We +/// poll after the activation to absorb read-after-write visibility lag +/// (DB commits, sled-agent state propagation), not to wait for further +/// reconciler iterations. If `condition` only holds after multiple +/// passes, the test author must orchestrate explicitly: activate per +/// step and assert intermediate state between steps. 
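+///
+/// Illustrative call shape (the `converged()` check is a stand-in for a
+/// real condition; see the sled-agent tests for concrete callers):
+///
+/// ```ignore
+/// activate_then_wait_for_condition(
+///     &cptestctx.lockstep_client,
+///     || async {
+///         if converged().await {
+///             Ok(())
+///         } else {
+///             Err(CondCheckError::<()>::NotYet)
+///         }
+///     },
+///     &POLL_INTERVAL,
+///     &MULTICAST_OPERATION_TIMEOUT,
+/// )
+/// .await
+/// .expect("should converge within one reconciler pass");
+/// ```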
+pub(crate) async fn activate_then_wait_for_condition( lockstep_client: &ClientTestContext, condition: F, poll_interval: &Duration, @@ -269,37 +265,8 @@ where F: Fn() -> Fut, Fut: Future>>, { - // Activate reconciler less frequently than we check the condition - // This reduces overhead while still driving state changes forward - const RECONCILER_ACTIVATION_INTERVAL: Duration = Duration::from_millis(500); - - let last_reconciler_activation = Arc::new(Mutex::new(Instant::now())); - - // First, wait for any already-activated reconciler run to complete. - // This tests explicit activation paths (saga completions, etc.). - wait_for_multicast_reconciler(lockstep_client).await; - - wait_for_condition( - || async { - // Only activate reconciler if enough time has passed - let now = Instant::now(); - let should_activate = { - let last = last_reconciler_activation.lock().unwrap(); - now.duration_since(*last) >= RECONCILER_ACTIVATION_INTERVAL - }; - - if should_activate { - // Use activate to drive progress - activate_multicast_reconciler(lockstep_client).await; - *last_reconciler_activation.lock().unwrap() = now; - } - - condition().await - }, - poll_interval, - timeout, - ) - .await + activate_multicast_reconciler(lockstep_client).await; + wait_for_condition(condition, poll_interval, timeout).await } /// Ensure inventory collection has completed with SP data for all sleds. @@ -307,8 +274,8 @@ where /// This function verifies that inventory has SP data for EVERY in-service sled, /// not just that inventory completed. /// -/// This is required for multicast member operations which map `sled_id` → `sp_slot` -/// → switch ports via inventory. +/// This is required for multicast member operations which map `sled_id` to +/// `sp_slot` to switch ports via inventory. pub(crate) async fn ensure_inventory_ready( cptestctx: &ControlPlaneTestContext, ) { @@ -358,9 +325,8 @@ pub(crate) async fn ensure_inventory_ready( let mut missing_sleds = Vec::new(); for sled in &sleds { let has_sp = inventory.sps.iter().any(|(bb, _)| { - (bb.serial_number == sled.serial_number() - && bb.part_number == sled.part_number()) - || bb.serial_number == sled.serial_number() + bb.serial_number == sled.serial_number() + && bb.part_number == sled.part_number() }); if !has_sp { @@ -385,8 +351,8 @@ pub(crate) async fn ensure_inventory_ready( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(500), // Check every 500ms - &Duration::from_secs(120), // Wait up to 120s + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, ) .await { @@ -448,8 +414,8 @@ pub(crate) async fn ensure_dpd_ready(cptestctx: &ControlPlaneTestContext) { } } }, - &Duration::from_millis(200), // Check every 200ms - &Duration::from_secs(30), // Wait up to 30 seconds for switches + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await { @@ -552,11 +518,20 @@ pub(crate) async fn wait_for_group_active( .await } -/// Wait for a specific member to reach the expected state -/// (e.g., Joined, Joining, Left). +/// Wait for a multicast group member to reach the expected state. +/// +/// Ensures inventory and DPD are ready, drives one reconciler activation, +/// then asserts the member is observable in `expected_state`. If the state +/// does not match after the pass, fails loudly rather than retrying via +/// reactivation. +/// +/// We poll briefly after the pass to absorb DB read-after-write lag, +/// not to wait for further reconciler iterations. 
/// 
-/// For "Joined" state, this function uses `wait_for_condition_with_reconciler`
-/// to ensure the reconciler processes member state transitions.
+/// Tests that genuinely need multi-step convergence (e.g., recovery from
+/// an injected external failure) must orchestrate explicitly: drive each
+/// step with `activate_multicast_reconciler` and assert the intermediate
+/// state between steps.
 pub(crate) async fn wait_for_member_state(
     cptestctx: &ControlPlaneTestContext,
     group_name: &str,
@@ -567,92 +542,138 @@
     let lockstep_client = &cptestctx.lockstep_client;
     let expected_state_as_str = expected_state.to_string();
 
-    // For "Joined" state, ensure instance has a sled_id assigned
-    // (no need to check inventory again since ensure_inventory_ready() already
-    // verified all sleds have SP data at test setup)
+    // "Joined" requires the dataplane: the reconciler resolves
+    // sled -> port and programs DPD before that transition. Pre-populate
+    // DDM peers and wait for DPD readiness before polling for it.
+    //
+    // "Joining" and "Left" converge from DB-only transitions, so don't
+    // gate those: failure-mode tests rely on being able to wait on them
+    // while DPD is stopped.
     if expected_state == nexus_db_model::MulticastGroupMemberState::Joined {
+        nexus_test_utils::multicast::populate_ddm_peers(cptestctx).await;
+        ensure_dpd_ready(cptestctx).await;
         let instance_uuid = InstanceUuid::from_untyped_uuid(instance_id);
         wait_for_instance_sled_assignment(cptestctx, &instance_uuid).await;
     }
 
+    // Drive one converging pass. This explicit activation guarantees a
+    // fresh pass runs after this point regardless of whether the API call
+    // that triggered the test already activated the reconciler.
+    activate_multicast_reconciler(lockstep_client).await;
+
+    // Verify the post-pass state. Treat read-after-write visibility lag as
+    // `NotYet`, but treat any *other* observed state as a permanent failure.
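+    // (`check_member` below returns the matching member row on success,
+    // so callers can make further assertions on it.)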
let check_member = || async { let members = list_multicast_group_members(client, group_name).await; - - // If we're looking for "Joined" state, we need to ensure the member exists first - // and then wait for the reconciler to process it - if expected_state == nexus_db_model::MulticastGroupMemberState::Joined { - if let Some(member) = - members.iter().find(|m| m.instance_id == instance_id) - { - match member.state.as_str() { - "Joined" => Ok(member.clone()), - "Joining" => { - // Member exists and is in transition - wait a bit more - Err(CondCheckError::NotYet) - } - "Left" => { - // Member in Left state, reconciler needs to process instance start - wait more - Err(CondCheckError::NotYet) - } - other_state => Err(CondCheckError::Failed(format!( - "Member {instance_id} in group {group_name} has unexpected state '{other_state}', expected 'Left', 'Joining' or 'Joined'" - ))), - } - } else { - // Member doesn't exist yet - wait for it to be created - Err(CondCheckError::NotYet) - } - } else { - // For other states, just look for exact match - if let Some(member) = - members.iter().find(|m| m.instance_id == instance_id) - { - if member.state == expected_state_as_str { - Ok(member.clone()) - } else { - Err(CondCheckError::NotYet) - } - } else { - Err(CondCheckError::NotYet) + match members.iter().find(|m| m.instance_id == instance_id) { + Some(member) if member.state == expected_state_as_str => { + Ok(member.clone()) } + Some(member) => Err(CondCheckError::Failed(format!( + "member {instance_id} in group {group_name} reached state \ + '{}' after one reconciler pass, expected '{expected_state_as_str}'", + member.state + ))), + None => Err(CondCheckError::NotYet), } }; - // Use reconciler-activating wait for "Joined" state - let res = if expected_state - == nexus_db_model::MulticastGroupMemberState::Joined + match wait_for_condition( + check_member, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await { - wait_for_condition_with_reconciler( - lockstep_client, - check_member, - &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, - ) - .await - } else { - wait_for_condition( - check_member, - &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, - ) - .await - }; - - match res { Ok(member) => member, Err(poll::Error::TimedOut(elapsed)) => { panic!( - "member {instance_id} in group {group_name} did not reach state '{expected_state_as_str}' within {elapsed:?}", + "member {instance_id} in group {group_name} did not appear within {elapsed:?}", ); } Err(poll::Error::PermanentError(err)) => { panic!( - "failed waiting for member {instance_id} in group {group_name} to reach state '{expected_state_as_str}': {err:?}", + "reconciler did not converge member {instance_id} in group \ + {group_name} to '{expected_state_as_str}': {err}", ); } } } +/// Wait for a batch of multicast group members to reach their respective +/// expected states after a single reconciler pass. +/// +/// Like [`wait_for_member_state`] but checks multiple members after +/// one shared reconciler pass. Panics if any member ends up in an +/// unexpected state. 
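+///
+/// For example, `test_group_with_all_members_left` asserts both of its
+/// members converged to "Left" in one pass:
+///
+/// ```ignore
+/// wait_for_members_state(
+///     cptestctx,
+///     group_name,
+///     &[
+///         (instance1.identity.id, MulticastGroupMemberState::Left),
+///         (instance2.identity.id, MulticastGroupMemberState::Left),
+///     ],
+/// )
+/// .await;
+/// ```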
+pub(crate) async fn wait_for_members_state( + cptestctx: &ControlPlaneTestContext, + group_name: &str, + expected: &[(Uuid, nexus_db_model::MulticastGroupMemberState)], +) -> Vec { + let client = &cptestctx.external_client; + let lockstep_client = &cptestctx.lockstep_client; + + let joined_instances: Vec = expected + .iter() + .filter(|(_, state)| { + *state == nexus_db_model::MulticastGroupMemberState::Joined + }) + .map(|(id, _)| InstanceUuid::from_untyped_uuid(*id)) + .collect(); + + if !joined_instances.is_empty() { + nexus_test_utils::multicast::populate_ddm_peers(cptestctx).await; + ensure_dpd_ready(cptestctx).await; + for instance_uuid in &joined_instances { + wait_for_instance_sled_assignment(cptestctx, instance_uuid).await; + } + } + + activate_multicast_reconciler(lockstep_client).await; + + let check = || async { + let members = list_multicast_group_members(client, group_name).await; + let mut resolved = Vec::with_capacity(expected.len()); + for (instance_id, expected_state) in expected { + let expected_str = expected_state.to_string(); + match members.iter().find(|m| m.instance_id == *instance_id) { + Some(member) if member.state == expected_str => { + resolved.push(member.clone()); + } + Some(member) => { + return Err(CondCheckError::Failed(format!( + "member {instance_id} in group {group_name} reached \ + state '{}' after one reconciler pass, expected \ + '{expected_str}'", + member.state + ))); + } + None => return Err(CondCheckError::NotYet), + } + } + Ok(resolved) + }; + + match wait_for_condition( + check, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + { + Ok(members) => members, + Err(poll::Error::TimedOut(elapsed)) => panic!( + "members in group {group_name} did not all appear within \ + {elapsed:?} (expected {expected:?})", + ), + Err(poll::Error::PermanentError(err)) => panic!( + "reconciler did not converge members in group {group_name} \ + (expected {expected:?}): {err}", + ), + } +} + /// Wait for an instance to have a sled_id assigned. /// /// This is a stricter check than `instance_wait_for_vmm_registration` - it ensures @@ -1055,7 +1076,12 @@ pub(crate) async fn wait_for_member_count( } } -/// Wait for a multicast group to be deleted (returns 404). +/// Wait for a multicast group to be fully deleted (returns 404). +/// +/// Drives one reconciler activation, which runs `process_deleting_group_inner` +/// end-to-end (M2P/forwarding clear, DPD removal, underlay delete, member +/// delete, group row delete) for groups in "Deleting". Polling around the +/// API check is only for read-after-write visibility. 
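+///
+/// Panics if the group is still visible once the pass and the poll
+/// timeout have elapsed.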
pub(crate) async fn wait_for_group_deleted( cptestctx: &ControlPlaneTestContext, group_name: &str, @@ -1063,25 +1089,25 @@ pub(crate) async fn wait_for_group_deleted( let client = &cptestctx.external_client; let lockstep_client = &cptestctx.lockstep_client; - match wait_for_condition_with_reconciler( - lockstep_client, - || async { - let group_url = mcast_group_url(group_name); - match NexusRequest::object_get(client, &group_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - { - Ok(response) => { - if response.status == StatusCode::NOT_FOUND { - Ok(()) - } else { - Err(CondCheckError::<()>::NotYet) - } - } - Err(_) => Ok(()), // Assume 404 or similar error means deleted - } - }, + activate_multicast_reconciler(lockstep_client).await; + + let check = || async { + let group_url = mcast_group_url(group_name); + let response = NexusRequest::new( + RequestBuilder::new(client, Method::GET, &group_url) + .expect_status(Some(StatusCode::NOT_FOUND)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await; + match response { + Ok(_) => Ok(()), + Err(_) => Err(CondCheckError::<()>::NotYet), + } + }; + + match wait_for_condition( + check, &POLL_INTERVAL, &MULTICAST_OPERATION_TIMEOUT, ) @@ -1089,7 +1115,10 @@ pub(crate) async fn wait_for_group_deleted( { Ok(_) => {} Err(poll::Error::TimedOut(elapsed)) => { - panic!("group {group_name} was not deleted within {elapsed:?}",); + panic!( + "group {group_name} was not deleted within {elapsed:?} after \ + one reconciler pass", + ); } Err(poll::Error::PermanentError(err)) => { panic!( @@ -1099,10 +1128,11 @@ pub(crate) async fn wait_for_group_deleted( } } -/// Wait for a multicast group to be deleted from DPD (dataplane) with reconciler activation. +/// Wait for a multicast group to be removed from DPD (dataplane). /// -/// This function waits for the DPD to report that the multicast group no longer exists -/// (returns 404), while periodically activating the reconciler to drive the cleanup process. +/// Drives one reconciler activation, which runs `process_deleting_group_inner` +/// (DPD `remove_groups` by tag) for groups in "Deleting". Polling around the +/// DPD GET is only for read-after-write visibility. 
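+///
+/// Panics if DPD still reports the group once the pass and the poll
+/// timeout have elapsed.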
pub(crate) async fn wait_for_group_deleted_from_dpd( cptestctx: &ControlPlaneTestContext, multicast_ip: std::net::IpAddr, @@ -1110,17 +1140,17 @@ pub(crate) async fn wait_for_group_deleted_from_dpd( let lockstep_client = &cptestctx.lockstep_client; let dpd_client = nexus_test_utils::dpd_client(cptestctx); - match wait_for_condition_with_reconciler( - lockstep_client, - || async { - match dpd_client.multicast_group_get(&multicast_ip).await { - Ok(_) => { - // Group still exists in DPD - not yet deleted - Err(CondCheckError::<()>::NotYet) - } - Err(_) => Ok(()), // Group doesn't exist - deleted - } - }, + activate_multicast_reconciler(lockstep_client).await; + + let check = || async { + match dpd_client.multicast_group_get(&multicast_ip).await { + Ok(_) => Err(CondCheckError::<()>::NotYet), + Err(_) => Ok(()), + } + }; + + match wait_for_condition( + check, &POLL_INTERVAL, &MULTICAST_OPERATION_TIMEOUT, ) @@ -1129,7 +1159,8 @@ pub(crate) async fn wait_for_group_deleted_from_dpd( Ok(_) => {} Err(poll::Error::TimedOut(elapsed)) => { panic!( - "group with IP {multicast_ip} was not deleted from DPD within {elapsed:?}", + "group with IP {multicast_ip} was not deleted from DPD within \ + {elapsed:?} after one reconciler pass", ); } Err(poll::Error::PermanentError(err)) => { @@ -1347,6 +1378,183 @@ pub(crate) async fn cleanup_instances( ops::join_all(delete_futures).await; } +/// Wait until each listed member's stored `sled_id` matches the expected +/// post-migration sled. +/// +/// [`wait_for_member_state`] for "Joined" is satisfied as soon as the +/// member is in "Joined", which can happen with the *pre-migration* +/// `sled_id` still recorded if the reconciler has not yet re-observed +/// the new active VMM. +/// Tests that snapshot dataplane state immediately after migration must +/// wait until the DB row reflects the new placement. +/// +/// Drives one reconciler activation. The members reconciler detects the +/// `member.sled_id != live_vmm.sled_id` skew and runs `handle_sled_migration` +/// inline (`members.rs:704-713`), so the row is settled by the time this +/// returns. Polling around the read is only for read-after-write visibility. 
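+///
+/// Illustrative call shape (ids and sleds here are placeholders; the
+/// delete-preserves test passes its `expected_placement` array):
+///
+/// ```ignore
+/// wait_for_member_sled_ids(
+///     cptestctx,
+///     group_name,
+///     &[(instance_a_id, shared_sled), (instance_c_id, solo_sled)],
+/// )
+/// .await;
+/// ```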
+pub(crate) async fn wait_for_member_sled_ids( + cptestctx: &ControlPlaneTestContext, + group_name: &str, + expected: &[(Uuid, SledUuid)], +) { + let lockstep_client = &cptestctx.lockstep_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_id = { + let group = + get_multicast_group(&cptestctx.external_client, group_name).await; + group.identity.id + }; + + activate_multicast_reconciler(lockstep_client).await; + + let check = || async { + let members = datastore + .multicast_group_members_list( + &opctx, + MulticastGroupUuid::from_untyped_uuid(group_id), + &DataPageParams::max_page(), + ) + .await + .map_err(|e| { + CondCheckError::Failed(format!("list members failed: {e}")) + })?; + + for (instance_id, expected_sled) in expected { + let member = members + .iter() + .find(|m| m.parent_id == *instance_id) + .ok_or(CondCheckError::NotYet)?; + let sled_id = member.sled_id.ok_or(CondCheckError::NotYet)?; + if sled_id.into_untyped_uuid() != expected_sled.into_untyped_uuid() + { + return Err(CondCheckError::Failed(format!( + "member for instance {instance_id} reached sled_id \ + {sled_id:?} after one reconciler pass, expected \ + {expected_sled:?}" + ))); + } + } + Ok::<_, CondCheckError>(()) + }; + + wait_for_condition(check, &POLL_INTERVAL, &MULTICAST_OPERATION_TIMEOUT) + .await + .unwrap_or_else(|e| { + panic!( + "members in group {group_name} did not reach expected sled \ + assignments {expected:?}: {e:?}" + ) + }); +} + +/// Migrate an instance to a specific target sled. +/// +/// No-op if the instance is already on `target_sled`. Otherwise drives +/// the standard request-then-simulate-source-then-simulate-target sequence +/// used by other integration tests, returning when the instance has +/// reached `Running` on the target. +pub(crate) async fn migrate_instance_to( + cptestctx: &ControlPlaneTestContext, + instance_id: InstanceUuid, + target_sled: SledUuid, +) { + let client = &cptestctx.external_client; + let lockstep_client = &cptestctx.lockstep_client; + let nexus = &cptestctx.server.server_context().nexus; + + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("instance should be on a sled"); + if info.sled_id == target_sled { + return; + } + let source_sled = info.sled_id; + + let migrate_url = format!("/instances/{instance_id}/migrate"); + NexusRequest::new( + RequestBuilder::new(lockstep_client, Method::POST, &migrate_url) + .body(Some(&InstanceMigrateRequest { dst_sled_id: target_sled })) + .expect_status(Some(StatusCode::OK)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("should initiate migration"); + + let info = + nexus.active_instance_info(&instance_id, None).await.unwrap().unwrap(); + let src_propolis = info.propolis_id; + let dst_propolis = info.dst_propolis_id.unwrap(); + + instance_helpers::vmm_simulate_on_sled( + cptestctx, + nexus, + source_sled, + src_propolis, + ) + .await; + instance_helpers::instance_wait_for_state( + client, + instance_id, + InstanceState::Migrating, + ) + .await; + + instance_helpers::vmm_simulate_on_sled( + cptestctx, + nexus, + target_sled, + dst_propolis, + ) + .await; + instance_helpers::instance_wait_for_state( + client, + instance_id, + InstanceState::Running, + ) + .await; +} + +/// Resolve the underlay admin-local IPv6 address for a multicast group +/// given its external multicast IP. 
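+///
+/// Panics if the group has no underlay group recorded or if the underlay
+/// address is not an admin-local IPv6 address.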
+pub(crate) async fn fetch_underlay_admin_ip( + cptestctx: &ControlPlaneTestContext, + external_multicast_ip: IpAddr, +) -> dpd_client::types::UnderlayMulticastIpv6 { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, external_multicast_ip) + .await + .expect("should look up external multicast group by IP"); + let underlay_group_id = external_group + .underlay_group_id + .expect("external group should have underlay_group_id"); + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("should fetch underlay multicast group"); + + match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => { + dpd_client::types::UnderlayMulticastIpv6::try_from(v6) + .expect("underlay IP should be admin-local IPv6") + } + IpAddr::V4(other) => { + panic!("expected IPv6 underlay address, got {other}") + } + } +} + /// Stop multiple instances, poking the simulated sled-agent while waiting. pub(crate) async fn stop_instances( cptestctx: &ControlPlaneTestContext, @@ -1533,3 +1741,121 @@ pub(crate) mod ops { tokio::join!(op1, op2, op3, op4) } } + +/// Assert that *every* mgd in the fixture has an MRIB route for `group_ip`. +/// +/// Iterates every switch zone present in `cptestctx.mgd`, so multi-switch +/// fixtures (`extra_sled_agents > 0`) catch a route that is programmed only +/// on a subset of switches. +pub(crate) async fn assert_mrib_route_exists( + cptestctx: &nexus_test_utils::ControlPlaneTestContext< + omicron_nexus::Server, + >, + group_ip: IpAddr, +) { + for_each_mgd(cptestctx, |slot, mgd_client| async move { + wait_for_condition::<_, (), _, _>( + || async { + let routes = mgd_client + .static_list_mcast_routes() + .await + .unwrap() + .into_inner(); + if routes + .iter() + .any(|r| mrib_route_matches_group(&r.key, group_ip)) + { + Ok(()) + } else { + Err(CondCheckError::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("mgd on {slot:?} never had a route for {group_ip}: {e:?}") + }); + }) + .await; +} + +/// Assert that *no* mgd in the fixture has an MRIB route for `group_ip`. +pub(crate) async fn assert_mrib_route_absent( + cptestctx: &nexus_test_utils::ControlPlaneTestContext< + omicron_nexus::Server, + >, + group_ip: IpAddr, +) { + for_each_mgd(cptestctx, |slot, mgd_client| async move { + wait_for_condition::<_, (), _, _>( + || async { + let routes = mgd_client + .static_list_mcast_routes() + .await + .unwrap() + .into_inner(); + if routes + .iter() + .any(|r| mrib_route_matches_group(&r.key, group_ip)) + { + Err(CondCheckError::NotYet) + } else { + Ok(()) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("mgd on {slot:?} still had a route for {group_ip}: {e:?}") + }); + }) + .await; +} + +/// Run `f` against every mgd client in the fixture, in `SwitchSlot` order. 
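+///
+/// Panics if the fixture has no mgd instances, so a misconfigured fixture
+/// fails loudly instead of passing vacuously.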
+async fn for_each_mgd( + cptestctx: &nexus_test_utils::ControlPlaneTestContext< + omicron_nexus::Server, + >, + f: F, +) where + F: Fn( + sled_agent_types::early_networking::SwitchSlot, + mg_admin_client::Client, + ) -> Fut, + Fut: Future, +{ + assert!( + !cptestctx.mgd.is_empty(), + "multicast MRIB assertions require at least one mgd in the test \ + fixture", + ); + let switches: std::collections::BTreeMap<_, _> = + cptestctx.mgd.iter().collect(); + for (slot, mgd) in switches { + let mgd_client = mg_admin_client::Client::new( + &format!("http://[::1]:{}", mgd.port), + cptestctx.logctx.log.clone(), + ); + f(*slot, mgd_client).await; + } +} + +fn mrib_route_matches_group( + key: &mg_admin_client::types::MulticastRouteKey, + group_ip: IpAddr, +) -> bool { + match (key, group_ip) { + (mg_admin_client::types::MulticastRouteKey::V4(k), IpAddr::V4(ip)) => { + k.group == ip + } + (mg_admin_client::types::MulticastRouteKey::V6(k), IpAddr::V6(ip)) => { + k.group == ip + } + _ => false, + } +} diff --git a/nexus/tests/integration_tests/multicast/networking_integration.rs b/nexus/tests/integration_tests/multicast/networking_integration.rs index 3b28892ef82..218897f56b6 100644 --- a/nexus/tests/integration_tests/multicast/networking_integration.rs +++ b/nexus/tests/integration_tests/multicast/networking_integration.rs @@ -8,10 +8,14 @@ //! //! - External IPs: Instances with ephemeral/floating IPs can join multicast groups //! - Floating IP attach/detach: Multicast membership unaffected by IP changes +//! - Sled-agent M2P/forwarding propagation on member join and group deletion +//! - Per-VMM multicast subscriptions via sled-agent -use std::time::Duration; +use std::net::IpAddr; use http::{Method, StatusCode}; +use nexus_db_lookup::LookupPath; +use nexus_db_queries::context::OpContext; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::resource_helpers::create_floating_ip; use nexus_test_utils::resource_helpers::{ @@ -30,6 +34,7 @@ use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, Instance, InstanceCpuCount, NameOrId, }; +use omicron_nexus::TestInterfaces; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; @@ -109,8 +114,6 @@ async fn test_multicast_external_ip_scenarios( instance_wait_for_running_with_simulation(cptestctx, instance_uuid) .await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Add instance to multicast group via instance-centric API multicast_group_attach( cptestctx, @@ -181,9 +184,6 @@ async fn test_multicast_external_ip_scenarios( ); object_delete(client, &external_ip_detach_url).await; - // Wait for operations to settle - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify multicast membership is still intact after external IP removal let members_after_detach = list_multicast_group_members(client, group_name).await; @@ -255,8 +255,6 @@ async fn test_multicast_external_ip_scenarios( instance_wait_for_running_with_simulation(cptestctx, instance_uuid) .await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Add instance to multicast group via instance-centric API multicast_group_attach( cptestctx, @@ -302,9 +300,6 @@ async fn test_multicast_external_ip_scenarios( .await .unwrap(); - // Wait for dataplane configuration to settle - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify multicast state is preserved let members_with_ip = 
list_multicast_group_members(client, group_name).await; @@ -336,9 +331,6 @@ async fn test_multicast_external_ip_scenarios( ); object_delete(client, &external_ip_detach_url).await; - // Wait for operations to settle - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify multicast state is still preserved let members_without_ip = list_multicast_group_members(client, group_name).await; @@ -418,8 +410,6 @@ async fn test_multicast_external_ip_scenarios( instance_wait_for_running_with_simulation(cptestctx, instance_uuid) .await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify external IP was allocated at creation let external_ips_after_start = fetch_instance_external_ips(client, instance_name, project_name) @@ -537,7 +527,6 @@ async fn test_multicast_with_floating_ip_basic( let instance_id = instance.identity.id; let instance_uuid = InstanceUuid::from_untyped_uuid(instance_id); - wait_for_instance_sled_assignment(cptestctx, &instance_uuid).await; instance_wait_for_running_with_simulation(cptestctx, instance_uuid).await; // Ensure multicast test prerequisites (inventory + DPD) are ready @@ -546,24 +535,7 @@ async fn test_multicast_with_floating_ip_basic( // Add instance to multicast group via instance-centric API multicast_group_attach(cptestctx, project_name, instance_name, group_name) .await; - // Group activation is reconciler-driven; explicitly drive it to avoid flakes. - wait_for_condition_with_reconciler( - &cptestctx.lockstep_client, - || async { - let group = get_multicast_group(client, group_name).await; - if group.state == "Active" { - Ok(()) - } else { - Err(CondCheckError::::NotYet) - } - }, - &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, - ) - .await - .unwrap_or_else(|e| { - panic!("group {group_name} did not reach Active state in time: {e:?}") - }); + wait_for_group_active(client, group_name).await; // Wait for multicast member to reach "Joined" state wait_for_member_state( @@ -637,13 +609,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance did not show floating IP {} as attached within 30s: {e:?}", + "instance did not show floating IP {} as attached within {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -694,13 +666,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance still showed floating IP {} as attached after 30s: {e:?}", + "instance still showed floating IP {} as attached after {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -713,3 +685,995 @@ async fn test_multicast_with_floating_ip_basic( cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(cptestctx, group_name).await; } + +/// Verify that when an instance joins a multicast group, the reconciler +/// pushes M2P mappings, forwarding entries, and per-VMM subscriptions +/// to the sim sled-agent. Also verify cleanup on instance deletion. 
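+///
+/// Steps: create and start an instance, attach it to a group, verify
+/// M2P, forwarding, and the per-VMM subscription on the sim sled-agent,
+/// stop the instance (member goes to "Left"; M2P and forwarding are
+/// cleared), then delete it (group and MRIB route are torn down).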
+#[nexus_test] +async fn test_multicast_sled_agent_m2p_and_subscriptions( + cptestctx: &nexus_test_utils::ControlPlaneTestContext< + omicron_nexus::Server, + >, +) { + let client = &cptestctx.external_client; + let project_name = "sled-agent-mcast-project"; + let group_name = "sled-agent-mcast-group"; + let instance_name = "sled-agent-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "sled-agent-mcast-pool", + (224, 150, 0, 1), + (224, 150, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + // Create and start an instance. + let instance_params = InstanceCreate { + identity: IdentityMetadataCreateParams { + name: instance_name.parse().unwrap(), + description: "Instance for sled-agent multicast test".to_string(), + }, + ncpus: InstanceCpuCount::try_from(1).unwrap(), + memory: ByteCount::from_gibibytes_u32(1), + hostname: instance_name.parse().unwrap(), + user_data: vec![], + ssh_public_keys: None, + network_interfaces: InstanceNetworkInterfaceAttachment::DefaultIpv4, + external_ips: vec![], + multicast_groups: vec![], + disks: vec![], + boot_disk: None, + cpu_platform: None, + start: true, + auto_restart_policy: Default::default(), + anti_affinity_groups: Vec::new(), + }; + + let instance_url = format!("/v1/instances?project={project_name}"); + let instance: Instance = + object_create(client, &instance_url, &instance_params).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + // Attach instance to a multicast group. + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + // "Joined" convergence drives the DDM primary path via the + // `populate_ddm_peers` precondition baked into `wait_for_member_state`. + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay multicast IPv6 address for verification. + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Verify MRIB route was programmed on mgd. + assert_mrib_route_exists(cptestctx, multicast_ip).await; + + // Verify M2P mapping on the sim sled-agent. + let sled_agent = cptestctx.first_sled_agent(); + { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + assert!( + m2p.contains(&(multicast_ip, underlay_ipv6)), + "Sled-agent should have M2P mapping ({multicast_ip}, \ + {underlay_ipv6}), got: {m2p:?}" + ); + } + + // Verify forwarding entries on the sim sled-agent. 
+ // The forwarding entry points at a switch for replication. + { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + assert!( + fwd.contains_key(&underlay_ipv6), + "Sled-agent should have forwarding entry for {underlay_ipv6}, \ + got: {fwd:?}" + ); + let next_hops = &fwd[&underlay_ipv6]; + assert_eq!( + next_hops.len(), + 1, + "Should have 1 next_hop (a switch), got: {next_hops:?}" + ); + } + + // Verify per-VMM multicast subscription on the sim sled-agent. + { + let groups = sled_agent.instance_multicast_groups.lock().unwrap(); + let instance_groups = groups + .get(&instance_id) + .expect("Sled-agent should have multicast groups for instance"); + + assert!( + instance_groups.iter().any(|m| m.group_ip == multicast_ip), + "Instance should be subscribed to multicast group \ + {multicast_ip}, got: {instance_groups:?}" + ); + } + + // Stop the instance. The member transitions "Joined" -> "Left". + let stop_url = + format!("/v1/instances/{instance_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance"); + + wait_for_instance_stopped(cptestctx, client, instance_id, instance_name) + .await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; + + // Per-VMM subscription cleanup after stop is not asserted here. + // In production, destroying the VMM tears down the OPTE port, which + // implicitly removes multicast subscriptions. The reconciler's + // unsubscribe path correctly skips when the propolis_id is gone + // (matching production semantics where the port no longer exists). + // + // V2P follows the same pattern: sled-agent cleanup is keyed by + // network identity, not VMM identity. + + // M2P and forwarding should be cleared since there are no "Joined" + // members remaining. + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + if !m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("M2P should be cleared when no Joined members remain"); + + // Forwarding should also be cleared when no "Joined" members remain. + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if !fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Forwarding should be cleared when no Joined members remain"); + + // Delete the instance, which should trigger group deletion. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; + + // Verify M2P and forwarding are cleared. + { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + assert!( + !m2p.contains(&(multicast_ip, underlay_ipv6)), + "M2P mapping should be cleared after group deletion, got: {m2p:?}" + ); + } + { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + assert!( + !fwd.contains_key(&underlay_ipv6), + "Forwarding entry should be cleared after group deletion, \ + got: {fwd:?}" + ); + } + + // Verify MRIB route was withdrawn after group deletion. 
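+    // (checked on every switch's mgd, mirroring the existence check above)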
+ assert_mrib_route_absent(cptestctx, multicast_ip).await; +} + +/// Verify M2P and forwarding entries propagate to all sleds, not just the +/// hosting sled. Analogous to `test_instance_v2p_mappings` which verifies +/// V2P mappings on all sleds. +/// +/// Also verifies cleanup: after instance deletion, M2P and forwarding +/// entries are removed from every sled. +#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_multi_sled_m2p_propagation( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let project_name = "multi-sled-mcast-project"; + let group_name = "multi-sled-mcast-group"; + let instance_name = "multi-sled-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "multi-sled-mcast-pool", + (224, 160, 0, 1), + (224, 160, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + // Collect all sled agents (2 total: 1 default + 1 extra). + // We use extra_sled_agents = 1 (not 2) because the gateway sim only + // provides SP data for the two well-known sled UUIDs. A 3rd sled with + // a random UUID would have no SP entry, causing inventory readiness + // to time out. Two sleds is sufficient to verify cross-sled propagation. + let all_sled_agents: Vec<_> = + cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect(); + assert_eq!(all_sled_agents.len(), 2, "expected 2 sled agents"); + + // Create and start an instance. + let instance = instance_for_multicast_groups( + cptestctx, + project_name, + instance_name, + true, + &[], + ) + .await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + // Attach to a multicast group. + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay IPv6 address for verification. + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Look up the hosting sled for subscription verification. + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let hosting_sled_id = info.sled_id; + + // Verify MRIB route was programmed. + assert_mrib_route_exists(cptestctx, multicast_ip).await; + + // M2P and forwarding are pushed to all sleds (like V2P). 
Any
+    // instance on any sled may send to a multicast group; without the
+    // M2P mapping OPTE's overlay layer silently drops the packet.
+    // Forwarding entries let sender sleds replicate to member sleds.
+    for (i, sled_agent) in cptestctx.sled_agents.iter().enumerate() {
+        let agent = sled_agent.sled_agent();
+
+        // Wait for M2P on every sled. The reconciler may need an
+        // additional pass after the member reaches "Joined": during
+        // reconcile_member_states, propagate_m2p_and_forwarding may
+        // see member_sleds=0 (member still "Joining" in DB), so the
+        // actual push happens in reconcile_active_groups or the next
+        // full pass.
+        activate_then_wait_for_condition(
+            &cptestctx.lockstep_client,
+            || async {
+                let m2p = agent.m2p_mappings.lock().unwrap();
+                if m2p.contains(&(multicast_ip, underlay_ipv6)) {
+                    Ok(())
+                } else {
+                    Err(CondCheckError::NotYet::<()>)
+                }
+            },
+            &POLL_INTERVAL,
+            &MULTICAST_OPERATION_TIMEOUT,
+        )
+        .await
+        .unwrap_or_else(|e| {
+            panic!("Sled {i} should have M2P mapping within timeout: {e:?}")
+        });
+
+        // Verify forwarding on every sled. A sender on any sled needs a
+        // replication next hop for the group, so the entry must exist
+        // everywhere, member-hosting or not; the assertion below checks
+        // its shape.
+        activate_then_wait_for_condition(
+            &cptestctx.lockstep_client,
+            || async {
+                let fwd = agent.mcast_fwd.lock().unwrap();
+                if fwd.contains_key(&underlay_ipv6) {
+                    Ok(())
+                } else {
+                    Err(CondCheckError::NotYet::<()>)
+                }
+            },
+            &POLL_INTERVAL,
+            &MULTICAST_OPERATION_TIMEOUT,
+        )
+        .await
+        .unwrap_or_else(|e| {
+            panic!(
+                "Sled {i} should have forwarding entry within timeout: {e:?}"
+            )
+        });
+
+        let fwd = agent.mcast_fwd.lock().unwrap();
+        let next_hops = &fwd[&underlay_ipv6];
+        // Every sled gets a single next hop pointing at a switch.
+        // The switch replicates to member sled ports via DPD config.
+        assert_eq!(
+            next_hops.len(),
+            1,
+            "Sled {i} should have 1 next_hop (a switch), \
+             got: {next_hops:?}"
+        );
+    }
+
+    // Verify per-VMM subscription on the hosting sled only.
+    // Subscriptions are member-sled-only (not all sleds).
+    let hosting_agent = cptestctx
+        .sled_agents
+        .iter()
+        .find(|sa| sa.sled_agent_id() == hosting_sled_id)
+        .unwrap()
+        .sled_agent();
+
+    activate_then_wait_for_condition(
+        &cptestctx.lockstep_client,
+        || async {
+            let groups =
+                hosting_agent.instance_multicast_groups.lock().unwrap();
+            match groups.get(&instance_id) {
+                Some(instance_groups)
+                    if instance_groups
+                        .iter()
+                        .any(|m| m.group_ip == multicast_ip) =>
+                {
+                    Ok(())
+                }
+                _ => Err(CondCheckError::NotYet::<()>),
+            }
+        },
+        &POLL_INTERVAL,
+        &MULTICAST_OPERATION_TIMEOUT,
+    )
+    .await
+    .unwrap_or_else(|e| {
+        panic!(
+            "VMM should be subscribed to {multicast_ip} within timeout: {e:?}"
+        )
+    });
+
+    // Delete the instance, which triggers group deletion.
+    cleanup_instances(cptestctx, client, project_name, &[instance_name]).await;
+    wait_for_group_deleted(cptestctx, group_name).await;
+
+    // Verify MRIB route removed after group deletion.
+    assert_mrib_route_absent(cptestctx, multicast_ip).await;
+
+    // Verify cleanup on every sled: M2P and forwarding removed.
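+    // Both maps are checked in one condition, so a sled that clears only
+    // one of them still fails the poll below.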
+ for (i, sled_agent) in all_sled_agents.iter().enumerate() { + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if !m2p.contains(&(multicast_ip, underlay_ipv6)) + && !fwd.contains_key(&underlay_ipv6) + { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} M2P/forwarding not cleaned up within timeout: {e:?}" + ) + }); + } +} + +/// Verify cross-sled forwarding when members exist on both sleds. +/// +/// With one member on sled A and another on sled B, each sled's forwarding +/// entry should list the other sled as its sole next hop (self-exclusion). +/// This exercises the `.filter(|(id, _)| *id != sled_id)` logic in +/// `converge_forwarding`. +#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_cross_sled_forwarding( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + let project_name = "bidir-fwd-project"; + let group_name = "bidir-fwd-group"; + let instance_a_name = "bidir-instance-a"; + let instance_b_name = "bidir-instance-b"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "bidir-fwd-pool", + (224, 170, 0, 1), + (224, 170, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + let sled_a_id = cptestctx.sled_agents[0].sled_agent_id(); + let sled_b_id = cptestctx.sled_agents[1].sled_agent_id(); + + // Pin instance A to sled A by making sled B non-provisionable. + { + let (authz_sled, ..) = LookupPath::new(&opctx, datastore) + .sled_id(sled_b_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled B"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled, + nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable, + ) + .await + .expect("set sled B non-provisionable"); + } + + let instance_a = instance_for_multicast_groups( + cptestctx, + project_name, + instance_a_name, + true, + &[], + ) + .await; + let instance_a_id = InstanceUuid::from_untyped_uuid(instance_a.identity.id); + instance_wait_for_running_with_simulation(cptestctx, instance_a_id).await; + + // Verify instance A landed on sled A. + let info_a = nexus + .active_instance_info(&instance_a_id, None) + .await + .unwrap() + .expect("instance A should be running"); + assert_eq!(info_a.sled_id, sled_a_id, "instance A should be on sled A"); + + // Swap provisionability: sled A non-provisionable, sled B provisionable. + { + let (authz_sled_a, ..) = LookupPath::new(&opctx, datastore) + .sled_id(sled_a_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled A"); + let (authz_sled_b, ..) 
= LookupPath::new(&opctx, datastore) + .sled_id(sled_b_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled B"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled_a, + nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable, + ) + .await + .expect("set sled A non-provisionable"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled_b, + nexus_types::external_api::sled::SledProvisionPolicy::Provisionable, + ) + .await + .expect("set sled B provisionable"); + } + + let instance_b = instance_for_multicast_groups( + cptestctx, + project_name, + instance_b_name, + true, + &[], + ) + .await; + + let instance_b_id = InstanceUuid::from_untyped_uuid(instance_b.identity.id); + instance_wait_for_running_with_simulation(cptestctx, instance_b_id).await; + + // Verify instance B landed on sled B. + let info_b = nexus + .active_instance_info(&instance_b_id, None) + .await + .unwrap() + .expect("instance B should be running"); + + assert_eq!(info_b.sled_id, sled_b_id, "instance B should be on sled B"); + + // Both instances join the same multicast group. + multicast_group_attach( + cptestctx, + project_name, + instance_a_name, + group_name, + ) + .await; + + multicast_group_attach( + cptestctx, + project_name, + instance_b_name, + group_name, + ) + .await; + + wait_for_group_active(client, group_name).await; + + // Wait for both members to reach "Joined". + for instance in [&instance_a, &instance_b] { + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + } + + // Resolve underlay IPv6 for forwarding assertions. + let group_view = get_multicast_group(client, group_name).await; + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, group_view.multicast_ip) + .await + .expect("lookup group by IP"); + + let underlay_group = datastore + .underlay_multicast_group_fetch( + &opctx, + external_group + .underlay_group_id + .expect("active group should have underlay_group_id"), + ) + .await + .expect("fetch underlay group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Verify MRIB route was programmed for the group. + assert_mrib_route_exists(cptestctx, group_view.multicast_ip).await; + + // Wait for forwarding entries on both sleds, then verify each sled's + // forwarding lists exactly the other sled (not itself). + let agent_a = cptestctx.sled_agents[0].sled_agent(); + let agent_b = cptestctx.sled_agents[1].sled_agent(); + + for (label, agent) in [("sled A", &agent_a), ("sled B", &agent_b)] { + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let fwd = agent.mcast_fwd.lock().unwrap(); + match fwd.get(&underlay_ipv6) { + Some(hops) if hops.len() == 1 => Ok(()), + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("{label} should have exactly 1 forwarding next_hop: {e:?}") + }); + } + + // Cleanup. + cleanup_instances( + cptestctx, + client, + project_name, + &[instance_a_name, instance_b_name], + ) + .await; + wait_for_group_deleted(cptestctx, group_name).await; + + // Verify MRIB route removed after group deletion. + assert_mrib_route_absent(cptestctx, group_view.multicast_ip).await; +} + +/// Verify multicast state is re-established after simulated cold start. 
+/// Analogous to `test_instance_start_creates_networking_state` which tests +/// V2P re-establishment after forcibly clearing sled-agent state. +/// +/// Steps: a) create instance, b) join multicast, c) stop instance, +/// d) forcibly clear all sim sled-agent multicast state, e) restart +/// instance, f) verify M2P, forwarding, and per-VMM subscriptions are +/// re-established. +#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_cold_start_reestablishment( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let project_name = "cold-start-mcast-project"; + let group_name = "cold-start-mcast-group"; + let instance_name = "cold-start-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "cold-start-mcast-pool", + (224, 170, 0, 1), + (224, 170, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + let all_sled_agents: Vec<_> = + cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect(); + + // Create and start an instance, join a multicast group. + let instance = instance_for_multicast_groups( + cptestctx, + project_name, + instance_name, + true, + &[], + ) + .await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay IPv6. + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Verify MRIB route was programmed. + assert_mrib_route_exists(cptestctx, multicast_ip).await; + + // M2P and forwarding are pushed to all sleds. Verify at least the + // hosting sled has M2P before we clear state. + let pre_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let pre_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == pre_info.sled_id) + .unwrap() + .sled_agent(); + + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let m2p = pre_hosting_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Hosting sled M2P should exist before cold start simulation"); + + // Stop the instance. 
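A note on the polling idiom used just above (and throughout these tests): `activate_then_wait_for_condition` kicks the relevant background task through the lockstep client and then polls a predicate until it holds or times out. A minimal sketch of the pattern, assuming omicron's `wait_for_condition` test helper; the `activate_then_wait` wrapper and its activation step are illustrative, not the real helper:

    use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
    use std::future::Future;
    use std::time::Duration;

    // Sketch: poke the background task (elided), then poll until the
    // predicate returns Ok(()) or the overall timeout expires.
    async fn activate_then_wait<F, Fut>(check: F)
    where
        F: Fn() -> Fut,
        Fut: Future<Output = Result<(), CondCheckError<()>>>,
    {
        // ... activate the reconciler via the lockstep client here ...
        wait_for_condition(
            check,
            &Duration::from_millis(50), // poll interval (illustrative)
            &Duration::from_secs(60),   // overall timeout (illustrative)
        )
        .await
        .expect("condition should hold before the timeout");
    }

The stop request itself follows.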
+ let stop_url = + format!("/v1/instances/{instance_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance"); + + wait_for_instance_stopped(cptestctx, client, instance_id, instance_name) + .await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; + + // Forcibly clear all sim sled-agent multicast state, simulating a cold + // start where sled-agents lose in-memory state. + for sled_agent in &all_sled_agents { + sled_agent.m2p_mappings.lock().unwrap().clear(); + sled_agent.mcast_fwd.lock().unwrap().clear(); + sled_agent.instance_multicast_groups.lock().unwrap().clear(); + } + + // Restart the instance. + let start_url = + format!("/v1/instances/{instance_name}/start?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &start_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should start instance"); + + // Use `try_instance_simulate` here instead of `instance_wait_for_running_with_simulation` + // because the old VMM may still be draining from the sim collection after + // the stop. `instance_simulate` would panic if it pokes a VMM that was just + // removed; `try_instance_simulate` handles that gracefully. + wait_for_condition( + || async { + let _ = + instance_helpers::try_instance_simulate(nexus, &instance_id) + .await; + + let url = format!("/v1/instances/{instance_id}"); + let instance: Instance = NexusRequest::object_get(client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .map_err(|_| CondCheckError::<()>::NotYet)? + .parsed_body() + .map_err(|_| CondCheckError::<()>::NotYet)?; + + if instance.runtime.run_state == InstanceState::Running { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Instance should reach Running after restart"); + + // Wait for the reconciler to re-establish multicast state. + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Verify MRIB route re-established after cold start. + assert_mrib_route_exists(cptestctx, multicast_ip).await; + + // Verify M2P and forwarding re-established on all sleds. 
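These assertions read the simulated sled-agent's multicast state directly. A sketch of the shapes the assertions imply; the field and type names here are assumptions inferred from `.contains(&(ip, v6))`, `.contains_key(..)`, and `hops.len()` above, with `NextHop` standing in for the real next-hop type:

    use std::collections::{HashMap, HashSet};
    use std::net::{IpAddr, Ipv6Addr};

    struct NextHop; // stand-in for the forwarding next-hop type
    struct SimMcastState {
        // (overlay group IP, underlay IPv6) pairs pushed to each sled.
        m2p: HashSet<(IpAddr, Ipv6Addr)>,
        // underlay IPv6 -> replication next hops for that group.
        fwd: HashMap<Ipv6Addr, Vec<NextHop>>,
    }

The loop below checks that every sled regained both pieces of state after the restart.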
+ for (i, sled_agent) in all_sled_agents.iter().enumerate() { + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("Sled {i} M2P not re-established within timeout: {e:?}") + }); + + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} forwarding not re-established within timeout: {e:?}" + ) + }); + } + + // Verify per-VMM subscription on the hosting sled (new propolis_id + // since restart creates a new VMM). + let post_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Restarted instance should have active info"); + + let post_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == post_info.sled_id) + .unwrap() + .sled_agent(); + + activate_then_wait_for_condition( + &cptestctx.lockstep_client, + || async { + let groups = + post_hosting_agent.instance_multicast_groups.lock().unwrap(); + match groups.get(&instance_id) { + Some(instance_groups) + if instance_groups + .iter() + .any(|m| m.group_ip == multicast_ip) => + { + Ok(()) + } + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Instance should be subscribed to {multicast_ip} after restart: \ + {e:?}" + ) + }); + + // Cleanup. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; + + // Verify MRIB route removed after group deletion. + assert_mrib_route_absent(cptestctx, multicast_ip).await; +} diff --git a/nexus/types/src/deployment/execution/dns.rs b/nexus/types/src/deployment/execution/dns.rs index 009377fd8d9..3730576eda2 100644 --- a/nexus/types/src/deployment/execution/dns.rs +++ b/nexus/types/src/deployment/execution/dns.rs @@ -158,6 +158,7 @@ pub fn blueprint_internal_dns_config( overrides.dendrite_port(scrimlet.id()), overrides.mgs_port(scrimlet.id()), overrides.mgd_port(scrimlet.id()), + overrides.ddm_port(scrimlet.id()), )?; } diff --git a/nexus/types/src/deployment/execution/overridables.rs b/nexus/types/src/deployment/execution/overridables.rs index 881a7c49bdd..7dc3ae0bf4d 100644 --- a/nexus/types/src/deployment/execution/overridables.rs +++ b/nexus/types/src/deployment/execution/overridables.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
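The `Overridables` additions below extend the existing Dendrite/MGS/MGD pattern to DDM: an explicit per-sled override wins, otherwise the well-known `DDMD_PORT` constant applies. A hedged usage sketch, with `Overridables` as defined in the hunk below; it assumes the struct implements `Default` and that the `SledUuid`s are in scope, which is illustrative rather than taken from this change:

    use omicron_common::address::DDMD_PORT;
    use omicron_uuid_kinds::SledUuid;

    // `Overridables` as defined in the hunk below.
    fn demo(mut overrides: Overridables, sled_a: SledUuid, sled_b: SledUuid) {
        overrides.override_ddm_port(sled_a, 12345);
        assert_eq!(overrides.ddm_port(sled_a), 12345); // explicit override wins
        assert_eq!(overrides.ddm_port(sled_b), DDMD_PORT); // fallback constant
    }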
+use omicron_common::address::DDMD_PORT;
 use omicron_common::address::DENDRITE_PORT;
 use omicron_common::address::Ipv6Subnet;
 use omicron_common::address::MGD_PORT;
@@ -29,6 +30,8 @@ pub struct Overridables {
     pub mgs_ports: BTreeMap<SledUuid, u16>,
     /// map: sled id -> TCP port on which that sled's MGD is listening
     pub mgd_ports: BTreeMap<SledUuid, u16>,
+    /// map: sled id -> TCP port on which that sled's DDM is listening
+    pub ddm_ports: BTreeMap<SledUuid, u16>,
     /// map: sled id -> IP address of the sled's switch zone
     pub switch_zone_ips: BTreeMap<SledUuid, Ipv6Addr>,
 }
@@ -67,6 +70,16 @@ impl Overridables {
         self.mgd_ports.get(&sled_id).copied().unwrap_or(MGD_PORT)
     }
 
+    /// Specify the TCP port on which this sled's DDM is listening
+    pub fn override_ddm_port(&mut self, sled_id: SledUuid, port: u16) {
+        self.ddm_ports.insert(sled_id, port);
+    }
+
+    /// Returns the TCP port on which this sled's DDM is listening
+    pub fn ddm_port(&self, sled_id: SledUuid) -> u16 {
+        self.ddm_ports.get(&sled_id).copied().unwrap_or(DDMD_PORT)
+    }
+
     /// Specify the IP address of this switch zone
     pub fn override_switch_zone_ip(
         &mut self,
diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs
index c1089e85e20..43653d0878f 100644
--- a/nexus/types/src/internal_api/background.rs
+++ b/nexus/types/src/internal_api/background.rs
@@ -167,6 +167,14 @@ pub struct MulticastGroupReconcilerStatus {
     pub members_deleted: usize,
     /// Number of empty groups marked for deletion (implicit deletion).
     pub empty_groups_marked: usize,
+    /// Reconciliation steps skipped this pass because their downstream
+    /// client was unavailable. Distinguishes "no work needed" (counters
+    /// at 0, `skipped` empty) from "work was deferred" (counters at 0,
+    /// step name in `skipped`).
+    pub skipped: Vec<String>,
+    /// Number of sleds whose DDM port mapping disagreed with inventory.
+    /// DDM wins (live state); a non-zero count surfaces inventory lag.
+    pub ddm_inventory_drift: usize,
     /// Errors that occurred during reconciliation operations.
pub errors: Vec, } diff --git a/openapi/sled-agent/sled-agent-35.0.0-93533c.json.gitstub b/openapi/sled-agent/sled-agent-35.0.0-93533c.json.gitstub new file mode 100644 index 00000000000..00b46848648 --- /dev/null +++ b/openapi/sled-agent/sled-agent-35.0.0-93533c.json.gitstub @@ -0,0 +1 @@ +5579a6d72e5f6be577d2b17ba940ccc0de10decd:openapi/sled-agent/sled-agent-35.0.0-93533c.json diff --git a/openapi/sled-agent/sled-agent-35.0.0-93533c.json b/openapi/sled-agent/sled-agent-36.0.0-7b7885.json similarity index 96% rename from openapi/sled-agent/sled-agent-35.0.0-93533c.json rename to openapi/sled-agent/sled-agent-36.0.0-7b7885.json index b03e32d920e..0a2d8b93339 100644 --- a/openapi/sled-agent/sled-agent-35.0.0-93533c.json +++ b/openapi/sled-agent/sled-agent-36.0.0-7b7885.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "35.0.0" + "version": "36.0.0" }, "paths": { "/artifacts": { @@ -287,6 +287,76 @@ } } }, + "/instances/{instance_id}/multicast-group": { + "put": { + "operationId": "instance_join_multicast_group", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/InstanceUuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstanceMulticastMembership" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "operationId": "instance_leave_multicast_group", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/InstanceUuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstanceMulticastMembership" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/inventory": { "get": { "summary": "Fetch basic information about this sled", @@ -389,6 +459,162 @@ } } }, + "/networking/mcast-fwd": { + "get": { + "summary": "List multicast forwarding entries present on this sled.", + "operationId": "list_mcast_fwd", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_McastForwardingEntry", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set multicast forwarding entries for an underlay address.", + "operationId": "set_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear multicast forwarding entries for an underlay address.", + "operationId": "clear_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcastForwarding" 
+ } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/networking/mcast-m2p": { + "get": { + "summary": "List M2P mappings present on this sled.", + "operationId": "list_mcast_m2p", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_Mcast2PhysMapping", + "type": "array", + "items": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set a multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "set_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear a multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "clear_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcast2Phys" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/omicron-config": { "put": { "operationId": "omicron_config_put", @@ -2293,76 +2519,6 @@ } } }, - "/vmms/{propolis_id}/multicast-group": { - "put": { - "operationId": "vmm_join_multicast_group", - "parameters": [ - { - "in": "path", - "name": "propolis_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/PropolisUuid" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - }, - "delete": { - "operationId": "vmm_leave_multicast_group", - "parameters": [ - { - "in": "path", - "name": "propolis_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/PropolisUuid" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/vmms/{propolis_id}/state": { "get": { "operationId": "vmm_get_state", @@ -3992,6 +4148,40 @@ } ] }, + "ClearMcast2Phys": { + "description": "Clear a mapping from an overlay multicast group to an underlay multicast address.", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address. 
See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "ClearMcastForwarding": { + "description": "Clear all forwarding entries for an underlay multicast address.", + "type": "object", + "properties": { + "underlay": { + "description": "Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "underlay" + ] + }, "CombineError": { "type": "string", "enum": [ @@ -5781,35 +5971,6 @@ "src_propolis_addr" ] }, - "InstanceMulticastBody": { - "description": "Request body for multicast group operations.", - "oneOf": [ - { - "type": "object", - "properties": { - "join": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "join" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "leave": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "leave" - ], - "additionalProperties": false - } - ] - }, "InstanceMulticastMembership": { "description": "Represents a multicast group membership for an instance.\n\nIntroduced in v7.", "type": "object", @@ -6631,6 +6792,151 @@ "minimum": 1, "maximum": 32 }, + "Mcast2PhysMapping": { + "description": "Mapping from an overlay multicast group to an underlay multicast address.\n\nThe underlay address must be within the underlay multicast subnet (ff04::/64). This invariant is enforced by mapping in Nexus, not validated at this layer.", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address (ff04::/64).", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "McastFilterMode": { + "description": "Filter mode for multicast source filtering.", + "oneOf": [ + { + "description": "Accept only packets from listed sources (SSM).", + "type": "string", + "enum": [ + "include" + ] + }, + { + "description": "Accept packets from all sources except those listed. With an empty sources list this is any-source multicast (ASM).", + "type": "string", + "enum": [ + "exclude" + ] + } + ] + }, + "McastForwardingEntry": { + "description": "Forwarding entry for an underlay multicast address, specifying which next hops should receive replicated packets.", + "type": "object", + "properties": { + "next_hops": { + "description": "Next hops with replication and source filter configuration.", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingNextHop" + } + }, + "underlay": { + "description": "Underlay IPv6 multicast address. 
See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "next_hops", + "underlay" + ] + }, + "McastForwardingNextHop": { + "description": "A forwarding next hop with replication mode and aggregated source filter.", + "type": "object", + "properties": { + "filter": { + "description": "Aggregated source filter for this destination.", + "allOf": [ + { + "$ref": "#/components/schemas/McastSourceFilter" + } + ] + }, + "next_hop": { + "description": "Unicast IPv6 address of the destination sled.", + "type": "string", + "format": "ipv6" + }, + "replication": { + "description": "Replication mode for this next hop.", + "allOf": [ + { + "$ref": "#/components/schemas/McastReplication" + } + ] + } + }, + "required": [ + "filter", + "next_hop", + "replication" + ] + }, + "McastReplication": { + "description": "Replication mode for multicast forwarding.", + "oneOf": [ + { + "description": "Replicate to front panel ports (egress to external networks).", + "type": "string", + "enum": [ + "external" + ] + }, + { + "description": "Replicate to sled underlay ports.", + "type": "string", + "enum": [ + "underlay" + ] + }, + { + "description": "Replicate to both external and underlay ports.", + "type": "string", + "enum": [ + "both" + ] + } + ] + }, + "McastSourceFilter": { + "description": "Source filter for multicast forwarding.", + "type": "object", + "properties": { + "mode": { + "description": "Filter mode.", + "allOf": [ + { + "$ref": "#/components/schemas/McastFilterMode" + } + ] + }, + "sources": { + "description": "Source addresses to include or exclude.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + } + }, + "required": [ + "mode", + "sources" + ] + }, "Measurement": { "description": "An RoT provided measurement which represents a digest of some component in the trusted computing base (TCB) for the attestor.", "oneOf": [ diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index 68fb2ddf57b..e0196fa33ef 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-35.0.0-93533c.json \ No newline at end of file +sled-agent-36.0.0-7b7885.json \ No newline at end of file diff --git a/package-manifest.toml b/package-manifest.toml index d873d147789..b25b9c75a1b 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -683,10 +683,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "4d1f20f793da102b29b914569725ebc9fdf746dd" +source.commit = "c3c3032f8bdc91d6faf2b36e05b8375a0980765c" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt -source.sha256 = "2cb4a97731d55bea78b83aabbba9a43602419e49a9d3eeb214b745463388ff60" +source.sha256 = "751f94de83cf95d2215f3d910dc49bd5c90c18ec6680a9616755bd91fca3a2b1" output.type = "tarball" [package.mg-ddm] @@ -699,10 +699,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "4d1f20f793da102b29b914569725ebc9fdf746dd" +source.commit = "c3c3032f8bdc91d6faf2b36e05b8375a0980765c" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "932cc6149eb87ee9c01226a49708b34fea0479c357f1b19d779f96be40a4c729" +source.sha256 = "4c9b6cf597ec6e26c4f99de82b71482b25cabcd9dd23ccbe87229a997fb6c368" output.type = "zone" output.intermediate_only = true @@ -714,10 +714,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "4d1f20f793da102b29b914569725ebc9fdf746dd" +source.commit = "c3c3032f8bdc91d6faf2b36e05b8375a0980765c" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "dd07d2ea491842cce28fd4eabc0f957f7672a75a8e4d92c31630d4332cb40ebd" +source.sha256 = "f65bf058322013feb2b5771e24046b0c6953d4e9324f8f48374caf7565845851" output.type = "zone" output.intermediate_only = true @@ -765,8 +765,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "1ddaa5d6b101fbaa2c29eca847111cbef1a272ad" -source.sha256 = "d899f9a761bb04bc9b9c88995883196dd691b758de547f7b1836344db5bd5080" +source.commit = "e10e4f5a993fe950ab1b478abb5dcbfa7aa92091" +source.sha256 = "bf93c4d2c6139dca1bf0abab39be25b20b434d998212d08fd6b2df7b015af268" output.type = "zone" output.intermediate_only = true @@ -792,8 +792,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "1ddaa5d6b101fbaa2c29eca847111cbef1a272ad" -source.sha256 = "bb0f7930f3af6679c552e3299cdac82a46866f7f3f38b665c02d5f02464ab7b3" +source.commit = "e10e4f5a993fe950ab1b478abb5dcbfa7aa92091" +source.sha256 = "841a17b2ccfc3e020c1f581f610b852339b038f250b4a3918adc3f34b87d295b" output.type = "zone" output.intermediate_only = true @@ -812,8 +812,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "1ddaa5d6b101fbaa2c29eca847111cbef1a272ad" -source.sha256 = "9e2c578302c3c11763a2a17e6d0b7a65b811ad2458b8a85c65b48fcec0133ab3" +source.commit = "e10e4f5a993fe950ab1b478abb5dcbfa7aa92091" +source.sha256 = "5ae4ab1df725365a5399d295eab84f7b4f21b8157e549d6e85c1811817156d2f" output.type = "zone" output.intermediate_only = true diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index f261f89db87..fdc8ae39200 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -19,6 +19,10 @@ use omicron_common::api::internal::{ SledIdentifiers, VirtualNetworkInterfaceHost, }, }; +use sled_agent_types_versions::latest::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use sled_agent_types_versions::{ latest, v1, v4, v6, v7, v9, v10, v11, v12, v14, v16, v17, v18, v20, v22, v24, v25, v26, v28, v29, v30, v31, v33, @@ -38,6 +42,7 @@ api_versions!([ // | example for the next person. 
 //  v
 // (next_int, IDENT),
+    (36, MCAST_M2P_FORWARDING),
     (35, INLINE_ROUTER_PEER_IP_ADDR),
     (34, MODIFY_SVCS_TYPES),
     (33, BOOTSTORE_SERVICE_NAT),
 ]);
@@ -631,25 +636,57 @@ pub trait SledAgentApi {
     ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
 
     #[endpoint {
+        method = PUT,
+        path = "/instances/{instance_id}/multicast-group",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn instance_join_multicast_group(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<InstancePathParam>,
+        body: TypedBody<InstanceMulticastMembership>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    #[endpoint {
+        method = DELETE,
+        path = "/instances/{instance_id}/multicast-group",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn instance_leave_multicast_group(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<InstancePathParam>,
+        body: TypedBody<InstanceMulticastMembership>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    /// Join a multicast group.
+    //
+    // Deprecated. This was keyed by the active VMM's Propolis ID, while
+    // newer versions use an instance-scoped endpoint.
+    #[endpoint {
+        operation_id = "vmm_join_multicast_group",
         method = PUT,
         path = "/vmms/{propolis_id}/multicast-group",
-        versions = VERSION_MULTICAST_SUPPORT..,
+        versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING,
     }]
-    async fn vmm_join_multicast_group(
+    async fn vmm_join_multicast_group_v7(
         rqctx: RequestContext<Self::Context>,
-        path_params: Path<VmmPathParam>,
-        body: TypedBody<InstanceMulticastBody>,
+        path_params: Path<VmmPathParam>,
+        body: TypedBody<v7::instance::InstanceMulticastBody>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
 
+    /// Leave a multicast group.
+    //
+    // Deprecated. This was keyed by the active VMM's Propolis ID, while
+    // newer versions use an instance-scoped endpoint.
     #[endpoint {
+        operation_id = "vmm_leave_multicast_group",
         method = DELETE,
         path = "/vmms/{propolis_id}/multicast-group",
-        versions = VERSION_MULTICAST_SUPPORT..,
+        versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING,
     }]
-    async fn vmm_leave_multicast_group(
+    async fn vmm_leave_multicast_group_v7(
         rqctx: RequestContext<Self::Context>,
-        path_params: Path<VmmPathParam>,
-        body: TypedBody<InstanceMulticastBody>,
+        path_params: Path<VmmPathParam>,
+        body: TypedBody<v7::instance::InstanceMulticastBody>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
 
     #[endpoint {
@@ -808,6 +845,70 @@ pub trait SledAgentApi {
         rqctx: RequestContext<Self::Context>,
     ) -> Result<HttpResponseOk<Vec<VirtualNetworkInterfaceHost>>, HttpError>;
 
+    /// Set a multicast-to-physical (M2P) mapping in OPTE.
+    #[endpoint {
+        method = PUT,
+        path = "/networking/mcast-m2p",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn set_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<Mcast2PhysMapping>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    /// Clear a multicast-to-physical (M2P) mapping in OPTE.
+    #[endpoint {
+        method = DELETE,
+        path = "/networking/mcast-m2p",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn clear_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<ClearMcast2Phys>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    /// Set multicast forwarding entries for an underlay address.
+    #[endpoint {
+        method = PUT,
+        path = "/networking/mcast-fwd",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn set_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<McastForwardingEntry>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    /// Clear multicast forwarding entries for an underlay address.
+    #[endpoint {
+        method = DELETE,
+        path = "/networking/mcast-fwd",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn clear_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<ClearMcastForwarding>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
+    /// List M2P mappings present on this sled.
+    #[endpoint {
+        method = GET,
+        path = "/networking/mcast-m2p",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn list_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<Mcast2PhysMapping>>, HttpError>;
+
+    /// List multicast forwarding entries present on this sled.
+    #[endpoint {
+        method = GET,
+        path = "/networking/mcast-fwd",
+        versions = VERSION_MCAST_M2P_FORWARDING..,
+    }]
+    async fn list_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<McastForwardingEntry>>, HttpError>;
+
     #[endpoint {
         method = POST,
         path = "/switch-ports",
diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs
index 2eb525d3e53..0c49cf4c67d 100644
--- a/sled-agent/src/http_entrypoints.rs
+++ b/sled-agent/src/http_entrypoints.rs
@@ -45,12 +45,17 @@ use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope;
 use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody;
 use sled_agent_types::instance::SledVmmState;
 use sled_agent_types::instance::{
-    InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody,
-    VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam,
-    VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody,
-    VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam,
+    InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership,
+    InstancePathParam, VmmIssueDiskSnapshotRequestBody,
+    VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse,
+    VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse,
+    VpcPathParam,
 };
 use sled_agent_types::inventory::{Inventory, OmicronSledConfig};
+use sled_agent_types::multicast::{
+    ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping,
+    McastForwardingEntry,
+};
 use sled_agent_types::probes::ProbeSet;
 use sled_agent_types::rot::{
     Attestation, CertificateChain, MeasurementLog, Nonce, RotPathParams,
@@ -79,7 +84,7 @@ use trust_quorum_types::messages::{
 use trust_quorum_types::status::{CommitStatus, CoordinatorStatus, NodeStatus};
 
 // Fixed identifiers for prior versions only
-use sled_agent_types_versions::{v1, v20, v25, v26, v30, v33};
+use sled_agent_types_versions::{v1, v7, v20, v25, v26, v30, v33};
 use sled_diagnostics::{
     SledDiagnosticsCommandHttpOutput, SledDiagnosticsQueryOutput,
 };
@@ -707,33 +712,96 @@ impl SledAgentApi for SledAgentImpl {
             .await
     }
 
-    async fn vmm_join_multicast_group(
+    async fn instance_join_multicast_group(
         rqctx: RequestContext<Self::Context>,
-        path_params: Path<VmmPathParam>,
-        body: TypedBody<InstanceMulticastBody>,
+        path_params: Path<InstancePathParam>,
+        body: TypedBody<InstanceMulticastMembership>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
         let sa = rqctx.context();
-        let id = path_params.into_inner().propolis_id;
-        let body_args = body.into_inner();
+        let instance_id = path_params.into_inner().instance_id;
+        let membership = body.into_inner();
         sa.latencies()
             .instrument_dropshot_handler(&rqctx, async {
-                sa.instance_join_multicast_group(id, &body_args).await?;
+                sa.instance_join_multicast_group(instance_id, &membership)
+                    .await?;
                 Ok(HttpResponseUpdatedNoContent())
             })
             .await
     }
 
-    async fn vmm_leave_multicast_group(
+    async fn instance_leave_multicast_group(
         rqctx: RequestContext<Self::Context>,
-        path_params: Path<VmmPathParam>,
-        body: TypedBody<InstanceMulticastBody>,
+        path_params: Path<InstancePathParam>,
+        body: TypedBody<InstanceMulticastMembership>,
     ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
         let sa = rqctx.context();
-        let id = path_params.into_inner().propolis_id;
-        let body_args = body.into_inner();
+        let instance_id = path_params.into_inner().instance_id;
+        let membership = body.into_inner();
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                sa.instance_leave_multicast_group(instance_id, &membership)
+                    .await?;
+                Ok(HttpResponseUpdatedNoContent())
+            })
+            .await
+    }
+
+    async fn vmm_join_multicast_group_v7(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<VmmPathParam>,
+        body: TypedBody<v7::instance::InstanceMulticastBody>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        let propolis_id = path_params.into_inner().propolis_id;
+        let membership = match body.into_inner() {
+            v7::instance::InstanceMulticastBody::Join(m) => m,
+            v7::instance::InstanceMulticastBody::Leave(_) => {
+                return Err(HttpError::for_bad_request(
+                    None,
+                    "Join endpoint cannot process Leave operations".to_string(),
+                ));
+            }
+        };
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                let Some(instance_id) =
+                    sa.instance_id_for_propolis(propolis_id).await?
+                else {
+                    // No registered VMM means no OPTE port to update.
+                    return Ok(HttpResponseUpdatedNoContent());
+                };
+                sa.instance_join_multicast_group(instance_id, &membership)
+                    .await?;
+                Ok(HttpResponseUpdatedNoContent())
+            })
+            .await
+    }
+
+    async fn vmm_leave_multicast_group_v7(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<VmmPathParam>,
+        body: TypedBody<v7::instance::InstanceMulticastBody>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        let propolis_id = path_params.into_inner().propolis_id;
+        let membership = match body.into_inner() {
+            v7::instance::InstanceMulticastBody::Leave(m) => m,
+            v7::instance::InstanceMulticastBody::Join(_) => {
+                return Err(HttpError::for_bad_request(
+                    None,
+                    "Leave endpoint cannot process Join operations".to_string(),
+                ));
+            }
+        };
         sa.latencies()
             .instrument_dropshot_handler(&rqctx, async {
-                sa.instance_leave_multicast_group(id, &body_args).await?;
+                let Some(instance_id) =
+                    sa.instance_id_for_propolis(propolis_id).await?
+                else {
+                    return Ok(HttpResponseUpdatedNoContent());
+                };
+                sa.instance_leave_multicast_group(instance_id, &membership)
+                    .await?;
                 Ok(HttpResponseUpdatedNoContent())
             })
             .await
@@ -932,6 +1000,86 @@ impl SledAgentApi for SledAgentImpl {
             .await
     }
 
+    async fn set_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<Mcast2PhysMapping>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        let body_args = body.into_inner();
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                sa.set_mcast_m2p(&body_args).await.map_err(Error::from)?;
+                Ok(HttpResponseUpdatedNoContent())
+            })
+            .await
+    }
+
+    async fn clear_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<ClearMcast2Phys>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        let body_args = body.into_inner();
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                sa.clear_mcast_m2p(&body_args).await.map_err(Error::from)?;
+                Ok(HttpResponseUpdatedNoContent())
+            })
+            .await
+    }
+
+    async fn set_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<McastForwardingEntry>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        let body_args = body.into_inner();
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                sa.set_mcast_fwd(&body_args).await.map_err(Error::from)?;
+                Ok(HttpResponseUpdatedNoContent())
+            })
+            .await
+    }
+
+    async fn clear_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<ClearMcastForwarding>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        let body_args = body.into_inner();
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                sa.clear_mcast_fwd(&body_args).await.map_err(Error::from)?;
+                Ok(HttpResponseUpdatedNoContent())
+            })
+            .await
+    }
+
+    async fn list_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<Mcast2PhysMapping>>, HttpError> {
+        let sa = rqctx.context();
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                let m2p = sa.list_mcast_m2p().await.map_err(Error::from)?;
+                Ok(HttpResponseOk(m2p))
+            })
+            .await
+    }
+
+    async fn list_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<McastForwardingEntry>>, HttpError> {
+        let sa = rqctx.context();
+        sa.latencies()
+            .instrument_dropshot_handler(&rqctx, async {
+                let fwd = sa.list_mcast_fwd().await.map_err(Error::from)?;
+                Ok(HttpResponseOk(fwd))
+            })
+            .await
+    }
+
     async fn uplink_ensure(
         rqctx: RequestContext<Self::Context>,
         body: TypedBody<SwitchPorts>,
diff --git a/sled-agent/src/instance.rs
b/sled-agent/src/instance.rs
index a009e7f3843..bf70e3e9448 100644
--- a/sled-agent/src/instance.rs
+++ b/sled-agent/src/instance.rs
@@ -2325,7 +2325,11 @@ impl InstanceRunner {
         // for them.
         let mut opte_ports = Vec::with_capacity(self.requested_nics.len());
         let mut opte_port_names = Vec::with_capacity(self.requested_nics.len());
+        let mcast_cfg = self.multicast_group_cfgs();
         for nic in self.requested_nics.iter() {
+            // Multicast subscriptions target the primary NIC only.
+            // See the TODO on ensure_multicast_groups.
+            let groups: &[_] = if nic.primary { &mcast_cfg } else { &[] };
             let port = self.port_manager.create_port(PortCreateParams {
                 nic,
                 external_ips: &self.external_ips,
@@ -2337,6 +2341,7 @@ impl InstanceRunner {
                     .copied()
                     .map(Into::into)
                     .collect(),
+                multicast_groups: groups,
             })?;
             opte_port_names.push(port.0.name().to_string());
             opte_ports.push(port);
@@ -2618,12 +2623,13 @@ impl InstanceRunner {
         &mut self,
         membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
-        // Similar logic to add_external_ip - save state for rollback
+        // Save pre-call state so rollback restores exactly what was
+        // present, mirroring add_external_ip's old_config pattern.
+        let old_groups = self.multicast_groups.clone();
         let out = self.join_multicast_group_inner(membership).await;
         if out.is_err() {
-            // Rollback state on error
-            self.multicast_groups.retain(|m| m != membership);
+            self.multicast_groups = old_groups;
         }
         out
     }
@@ -2632,14 +2638,13 @@ impl InstanceRunner {
         &mut self,
         membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
-        // Similar logic to delete_external_ip - save state for rollback
+        // Save pre-call state so rollback restores exactly what was
+        // present, mirroring delete_external_ip's old_config pattern.
+        let old_groups = self.multicast_groups.clone();
         let out = self.leave_multicast_group_inner(membership).await;
         if out.is_err() {
-            // Rollback state on error - readd the membership if it was removed
-            if !self.multicast_groups.contains(membership) {
-                self.multicast_groups.push(membership.clone());
-            }
+            self.multicast_groups = old_groups;
         }
         out
     }
@@ -2648,48 +2653,54 @@ impl InstanceRunner {
         self.refresh_multicast_groups_inner()
     }
 
-    async fn join_multicast_group_inner(
-        &mut self,
-        membership: &InstanceMulticastMembership,
-    ) -> Result<(), Error> {
-        // Check for duplicate membership (idempotency)
-        if self.multicast_groups.contains(membership) {
-            return Ok(());
-        }
-
-        // Add to local state
-        self.multicast_groups.push(membership.clone());
+    /// Convert `InstanceMulticastMembership` list to OPTE
+    /// `MulticastGroupCfg` list.
+    fn multicast_group_cfgs(
+        &self,
+    ) -> Vec<illumos_utils::opte::MulticastGroupCfg> {
+        self.multicast_groups
+            .iter()
+            .map(|m| illumos_utils::opte::MulticastGroupCfg {
+                group_ip: m.group_ip,
+                sources: m.sources.clone(),
+            })
+            .collect()
+    }
 
-        // Update OPTE configuration
+    /// Sync the current multicast group memberships to OPTE via the
+    /// port manager.
+    ///
+    // TODO: subscriptions target the primary NIC only.
+    // InstanceMulticastMembership carries no NIC identifier, same as
+    // external IPs and attached subnets (though not firewall rules,
+    // which fan out across all VPC ports by VNI). If per-NIC multicast
+    // is needed, the membership type needs a NIC field and both this
+    // function and setup_propolis_zone must be updated.
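If the TODO above is ever acted on, the membership type would need to carry a NIC identifier. A hypothetical shape, purely illustrative and not part of this change (the type name and the `nic_id` field are invented):

    use std::net::IpAddr;
    use uuid::Uuid;

    // Hypothetical per-NIC variant of InstanceMulticastMembership.
    struct PerNicMulticastMembership {
        group_ip: IpAddr,
        sources: Vec<IpAddr>,
        nic_id: Uuid, // which of the instance's NICs to subscribe
    }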
+ fn ensure_multicast_groups(&self) -> Result<(), Error> { let Some(primary_nic) = self.primary_nic() else { return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); }; - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - // Validate multicast configuration with OPTE self.port_manager.multicast_groups_ensure( primary_nic.id, primary_nic.kind, - &multicast_cfg, + &self.multicast_group_cfgs(), )?; - // TODO: Configure underlay multicast group addresses on the zone's vNIC. - // This should add the multicast group addresses to the zone's network - // interface so it can receive underlay multicast traffic (physical - // network layer). Rack-wide dataplane forwarding is handled by the - // RPW reconciler + DPD. - // See also: port_manager.rs multicast_groups_ensure() TODO about - // configuring OPTE port-level multicast group membership. + Ok(()) + } + async fn join_multicast_group_inner( + &mut self, + membership: &InstanceMulticastMembership, + ) -> Result<(), Error> { + // Idempotent -> skip if already subscribed. + if self.multicast_groups.contains(membership) { + return Ok(()); + } + + self.multicast_groups.push(membership.clone()); + self.ensure_multicast_groups()?; Ok(()) } @@ -2697,56 +2708,12 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Remove from local state self.multicast_groups.retain(|m| m != membership); - - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; - - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; - - Ok(()) + self.ensure_multicast_groups() } fn refresh_multicast_groups_inner(&mut self) -> Result<(), Error> { - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; - - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; - - Ok(()) + self.ensure_multicast_groups() } } @@ -3036,6 +3003,7 @@ mod tests { let port_manager = PortManager::new( log.new(o!("component" => "PortManager")), Ipv6Addr::new(0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01), + &[], ); let cleanup_context = CleanupContext::default(); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index f110a379470..012d0a45df0 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -18,7 +18,7 @@ use illumos_utils::opte::PortManager; use illumos_utils::running_zone::ZoneBuilderFactory; use omicron_common::api::external::ByteCount; use omicron_common::api::internal::shared::SledIdentifiers; -use 
omicron_uuid_kinds::PropolisUuid;
+use omicron_uuid_kinds::{InstanceUuid, PropolisUuid};
 use oxnet::IpNet;
 use sled_agent_config_reconciler::AvailableDatasetsReceiver;
 use sled_agent_config_reconciler::CurrentlyManagedZpoolsReceiver;
@@ -43,6 +43,9 @@ pub enum Error {
     #[error("VMM with ID {0} not found")]
     NoSuchVmm(PropolisUuid),
 
+    #[error("No active VMM for instance {0}")]
+    NoActiveVmmForInstance(InstanceUuid),
+
     #[error("OPTE port management error")]
     Opte(#[from] illumos_utils::opte::Error),
 
@@ -303,17 +306,22 @@ impl InstanceManager {
         rx.await?
     }
 
-    pub async fn join_multicast_group(
+    /// Subscribe an instance's active VMM OPTE port to a multicast group.
+    ///
+    /// The active Propolis ID is resolved inside the manager's run loop so
+    /// that the lookup and the OPTE dispatch are serialized with other
+    /// per-instance state changes.
+    pub async fn join_multicast_group_by_instance(
         &self,
-        propolis_id: PropolisUuid,
-        multicast_body: &InstanceMulticastBody,
+        instance_id: InstanceUuid,
+        membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
         let (tx, rx) = oneshot::channel();
         self.inner
             .tx
-            .send(InstanceManagerRequest::JoinMulticastGroup {
-                propolis_id,
-                multicast_body: multicast_body.clone(),
+            .send(InstanceManagerRequest::JoinMulticastGroupByInstance {
+                instance_id,
+                membership: membership.clone(),
                 tx,
             })
             .await
@@ -322,17 +330,22 @@ impl InstanceManager {
         rx.await?
     }
 
-    pub async fn leave_multicast_group(
+    /// Unsubscribe an instance's active VMM OPTE port from a multicast group.
+    ///
+    /// The active Propolis ID is resolved inside the manager's run loop so
+    /// that the lookup and the OPTE dispatch are serialized with other
+    /// per-instance state changes.
+    pub async fn leave_multicast_group_by_instance(
         &self,
-        propolis_id: PropolisUuid,
-        multicast_body: &InstanceMulticastBody,
+        instance_id: InstanceUuid,
+        membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
         let (tx, rx) = oneshot::channel();
         self.inner
             .tx
-            .send(InstanceManagerRequest::LeaveMulticastGroup {
-                propolis_id,
-                multicast_body: multicast_body.clone(),
+            .send(InstanceManagerRequest::LeaveMulticastGroupByInstance {
+                instance_id,
+                membership: membership.clone(),
                 tx,
             })
             .await
@@ -341,6 +354,27 @@ impl InstanceManager {
         rx.await?
     }
 
+    /// Resolve a Propolis ID to its registered instance ID.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(None)` if no instance is registered with that Propolis ID.
+    pub async fn instance_id_for_propolis(
+        &self,
+        propolis_id: PropolisUuid,
+    ) -> Result<Option<InstanceUuid>, Error> {
+        let (tx, rx) = oneshot::channel();
+        self.inner
+            .tx
+            .send(InstanceManagerRequest::LookupInstanceForPropolis {
+                propolis_id,
+                tx,
+            })
+            .await
+            .map_err(|_| Error::FailedSendInstanceManagerClosed)?;
+        rx.await?
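One subtlety in the `rx.await?` tail above: two error layers are involved, since the oneshot itself can fail separately from the operation it reports on. A short annotation of the types in play (this assumes the module's `Error` has a conversion from the oneshot receive error, which the `?` requires):

    // rx.await  : Result<Result<Option<InstanceUuid>, Error>, RecvError>
    // rx.await? : Result<Option<InstanceUuid>, Error>
    //
    // The `?` maps a dropped reply channel into Error via From, and the
    // inner Result becomes the function's return value.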
+    }
+
     /// Returns the last-set size of the reservoir
     pub fn reservoir_size(&self) -> ByteCount {
         self.inner.vmm_reservoir_manager.reservoir_size()
     }
@@ -482,16 +516,20 @@ enum InstanceManagerRequest {
     RefreshExternalIps {
         tx: oneshot::Sender<Result<(), Error>>,
     },
-    JoinMulticastGroup {
-        propolis_id: PropolisUuid,
-        multicast_body: InstanceMulticastBody,
+    JoinMulticastGroupByInstance {
+        instance_id: InstanceUuid,
+        membership: InstanceMulticastMembership,
         tx: oneshot::Sender<Result<(), Error>>,
     },
-    LeaveMulticastGroup {
-        propolis_id: PropolisUuid,
-        multicast_body: InstanceMulticastBody,
+    LeaveMulticastGroupByInstance {
+        instance_id: InstanceUuid,
+        membership: InstanceMulticastMembership,
         tx: oneshot::Sender<Result<(), Error>>,
     },
+    LookupInstanceForPropolis {
+        propolis_id: PropolisUuid,
+        tx: oneshot::Sender<Result<Option<InstanceUuid>, Error>>,
+    },
     GetState {
         propolis_id: PropolisUuid,
         tx: oneshot::Sender<Result<SledVmmState, Error>>,
@@ -630,11 +668,14 @@ impl InstanceManagerRunner {
                 Some(RefreshExternalIps { tx }) => {
                     self.refresh_external_ips(tx)
                 },
-                Some(JoinMulticastGroup { propolis_id, multicast_body, tx }) => {
-                    self.join_multicast_group(tx, propolis_id, &multicast_body)
-                },
-                Some(LeaveMulticastGroup { propolis_id, multicast_body, tx }) => {
-                    self.leave_multicast_group(tx, propolis_id, &multicast_body)
+                Some(JoinMulticastGroupByInstance { instance_id, membership, tx }) => {
+                    self.join_multicast_group_by_instance(tx, instance_id, &membership)
+                }
+                Some(LeaveMulticastGroupByInstance { instance_id, membership, tx }) => {
+                    self.leave_multicast_group_by_instance(tx, instance_id, &membership)
+                }
+                Some(LookupInstanceForPropolis { propolis_id, tx }) => {
+                    self.lookup_instance_for_propolis(tx, propolis_id)
                 }
                 Some(GetState { propolis_id, tx }) => {
                     // TODO(eliza): it could potentially be nice to
@@ -903,48 +944,66 @@ impl InstanceManagerRunner {
         Ok(())
     }
 
-    fn join_multicast_group(
+    /// Resolve the active VMM for `instance_id` and forward a join to its
+    /// instance task. The lookup runs inside this dispatcher loop, so it is
+    /// serialized with `EnsureRegistered`, `EnsureUnregistered`, and other
+    /// state changes for the same instance. If no active VMM is registered,
+    /// the call is a no-op: there is no OPTE port to update.
+    fn join_multicast_group_by_instance(
         &self,
         tx: oneshot::Sender<Result<(), Error>>,
-        propolis_id: PropolisUuid,
-        multicast_body: &InstanceMulticastBody,
+        instance_id: InstanceUuid,
+        membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
-        let Some(instance) = self.get_propolis(propolis_id) else {
-            return Err(Error::NoSuchVmm(propolis_id));
+        let Some(instance) = self.find_instance(instance_id) else {
+            return tx.send(Ok(())).map_err(|_| Error::FailedSendClientClosed);
        };
-
-        match multicast_body {
-            InstanceMulticastBody::Join(membership) => {
-                instance.join_multicast_group(tx, membership)?;
-            }
-            InstanceMulticastBody::Leave(membership) => {
-                instance.leave_multicast_group(tx, membership)?;
-            }
-        }
+        instance.join_multicast_group(tx, membership)?;
         Ok(())
     }
 
-    fn leave_multicast_group(
+    /// Resolve the active VMM for `instance_id` and forward a leave to its
+    /// instance task. See [`Self::join_multicast_group_by_instance`] for the
+    /// serialization and no-active-VMM behavior.
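All of the `InstanceManagerRequest` plumbing above follows the command-with-reply-channel pattern: each request carries a oneshot sender, and the runner handles requests one at a time, which is what serializes these lookups with other per-instance state changes. A self-contained miniature of the pattern (tokio assumed; names illustrative):

    use tokio::sync::{mpsc, oneshot};

    enum Cmd {
        // Each command carries a oneshot sender for its reply.
        Ping { tx: oneshot::Sender<&'static str> },
    }

    async fn dispatcher(mut rx: mpsc::Receiver<Cmd>) {
        // One command at a time: handling is serialized by construction.
        while let Some(cmd) = rx.recv().await {
            match cmd {
                Cmd::Ping { tx } => {
                    let _ = tx.send("pong"); // ignore a caller that went away
                }
            }
        }
    }

The `leave` implementation follows immediately below.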
+    fn leave_multicast_group_by_instance(
         &self,
         tx: oneshot::Sender<Result<(), Error>>,
-        propolis_id: PropolisUuid,
-        multicast_body: &InstanceMulticastBody,
+        instance_id: InstanceUuid,
+        membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
-        let Some(instance) = self.get_propolis(propolis_id) else {
-            return Err(Error::NoSuchVmm(propolis_id));
+        let Some(instance) = self.find_instance(instance_id) else {
+            return tx.send(Ok(())).map_err(|_| Error::FailedSendClientClosed);
         };
-
-        match multicast_body {
-            InstanceMulticastBody::Join(membership) => {
-                instance.join_multicast_group(tx, membership)?;
-            }
-            InstanceMulticastBody::Leave(membership) => {
-                instance.leave_multicast_group(tx, membership)?;
-            }
-        }
+        instance.leave_multicast_group(tx, membership)?;
         Ok(())
     }
 
+    /// Locate the active VMM whose instance ID matches `instance_id`.
+    ///
+    /// The instance manager indexes by Propolis ID, so this is a linear
+    /// scan over the active jobs. The dispatcher serializes calls, so the
+    /// scan runs without any external lock contention.
+    fn find_instance(&self, instance_id: InstanceUuid) -> Option<&Instance> {
+        self.jobs.values().find(|i| i.id() == instance_id)
+    }
+
+    /// Look up the instance ID for a registered Propolis ID.
+    ///
+    /// Runs inside the dispatcher loop, so the lookup is serialized with
+    /// other per-instance state changes.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(None)` if no instance is registered with that Propolis ID.
+    fn lookup_instance_for_propolis(
+        &self,
+        tx: oneshot::Sender<Result<Option<InstanceUuid>, Error>>,
+        propolis_id: PropolisUuid,
+    ) -> Result<(), Error> {
+        let result = self.get_propolis(propolis_id).map(|i| i.id());
+        tx.send(Ok(result)).map_err(|_| Error::FailedSendClientClosed)
+    }
+
     fn get_instance_state(
         &self,
         tx: oneshot::Sender<Result<SledVmmState, Error>>,
diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs
index 2f2d5421204..38c0d0eba06 100644
--- a/sled-agent/src/probe_manager.rs
+++ b/sled-agent/src/probe_manager.rs
@@ -382,6 +382,7 @@ impl ProbeManagerInner {
                 // but probes are supposed to mimic instances as closely as
                 // possible. We should consider if we want to support them here.
attached_subnets: vec![], + multicast_groups: &[], })?; let installed_zone = ZoneBuilderFactory::new() diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 8cc6b6dd63a..aaf21f37d9b 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -29,10 +29,10 @@ use nexus_types::deployment::{ }; use nexus_types::external_api::sled::SledState; use omicron_common::address::{ - CP_SERVICES_RESERVED_ADDRESSES, DENDRITE_PORT, DNS_HTTP_PORT, DNS_PORT, - Ipv6Subnet, MGD_PORT, MGS_PORT, NEXUS_INTERNAL_PORT, NEXUS_LOCKSTEP_PORT, - NTP_PORT, NUM_SOURCE_NAT_PORTS, REPO_DEPOT_PORT, ReservedRackSubnet, - SLED_PREFIX, SLED_RESERVED_ADDRESSES, get_sled_address, + CP_SERVICES_RESERVED_ADDRESSES, DDMD_PORT, DENDRITE_PORT, DNS_HTTP_PORT, + DNS_PORT, Ipv6Subnet, MGD_PORT, MGS_PORT, NEXUS_INTERNAL_PORT, + NEXUS_LOCKSTEP_PORT, NTP_PORT, NUM_SOURCE_NAT_PORTS, REPO_DEPOT_PORT, + ReservedRackSubnet, SLED_PREFIX, SLED_RESERVED_ADDRESSES, get_sled_address, get_switch_zone_address, }; use omicron_common::api::external::{Generation, MacAddr, Vni}; @@ -341,6 +341,7 @@ impl Plan { DENDRITE_PORT, MGS_PORT, MGD_PORT, + DDMD_PORT, ) .unwrap(); } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 82d2fc23a4e..c2993101644 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1179,6 +1179,7 @@ impl ServiceManager { dhcp_config: DhcpCfg::default(), // Services do not use attached subnets, only instances. attached_subnets: vec![], + multicast_groups: &[], }) .map_err(|err| Error::ServicePortCreation { service: zone_kind, diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 84a31190cb8..935566bb242 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -236,18 +236,15 @@ impl SimCollection { while should_step { let (new_state, to_destroy) = { - // The object must be present in `objects` because it only gets - // removed when it comes to rest in the "Destroyed" state, but - // we can only get here if there's an asynchronous state - // transition desired. - // // We do as little as possible with the lock held. In // particular, we want to finish this work before calling out to // notify the nexus. let mut objects = self.objects.lock().await; + + // The object may already have been destroyed and removed by a + // concurrent poke (e.g., sim_step racing with an explicit poke + // from a test). In that case there is nothing left to do. let Some(mut object) = objects.remove(&id) else { - // Instance was already removed (e.g., destroyed by a - // concurrent transition). Nothing left to do. 
break; }; object.transition_finish(); diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 8e65a9a4574..7e6c47eaa31 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -56,12 +56,17 @@ use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::SledVmmState; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, - VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam, - VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody, - VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, + InstancePathParam, VmmIssueDiskSnapshotRequestBody, + VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse, + VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse, + VpcPathParam, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig}; +use sled_agent_types::multicast::ClearMcast2Phys; +use sled_agent_types::multicast::ClearMcastForwarding; +use sled_agent_types::multicast::Mcast2PhysMapping; +use sled_agent_types::multicast::McastForwardingEntry; use sled_agent_types::probes::ProbeSet; use sled_agent_types::rot::{ Attestation, CertificateChain, MeasurementLog, Nonce, RotPathParams, @@ -84,6 +89,7 @@ use sled_agent_types::zone_bundle::{ use sled_hardware_types::BaseboardId; // Fixed identifiers for prior versions only use sled_agent_types_versions::v1; +use sled_agent_types_versions::v7; use sled_agent_types_versions::v20; use sled_agent_types_versions::v25; use sled_agent_types_versions::v26; @@ -188,56 +194,46 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseUpdatedNoContent()) } - async fn vmm_join_multicast_group( + async fn instance_join_multicast_group( rqctx: RequestContext, - path_params: Path, - body: TypedBody, + path_params: Path, + body: TypedBody, ) -> Result { let sa = rqctx.context(); - let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); - - match body_args { - InstanceMulticastBody::Join(membership) => { - sa.instance_join_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Leave(_) => { - // This endpoint is for joining - reject leave operations - return Err(HttpError::for_bad_request( - None, - "Join endpoint cannot process Leave operations".to_string(), - )); - } - } - + let instance_id = path_params.into_inner().instance_id; + let membership = body.into_inner(); + sa.instance_join_multicast_group(instance_id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) } - async fn vmm_leave_multicast_group( + async fn instance_leave_multicast_group( rqctx: RequestContext, - path_params: Path, - body: TypedBody, + path_params: Path, + body: TypedBody, ) -> Result { let sa = rqctx.context(); - let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); + let instance_id = path_params.into_inner().instance_id; + let membership = body.into_inner(); + sa.instance_leave_multicast_group(instance_id, &membership).await?; + Ok(HttpResponseUpdatedNoContent()) + } - match body_args { - InstanceMulticastBody::Leave(membership) => { - sa.instance_leave_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Join(_) => { - // This endpoint is for leaving - reject join 
-        match body_args {
-            InstanceMulticastBody::Leave(membership) => {
-                sa.instance_leave_multicast_group(propolis_id, &membership)
-                    .await?;
-            }
-            InstanceMulticastBody::Join(_) => {
-                // This endpoint is for leaving - reject join operations
-                return Err(HttpError::for_bad_request(
-                    None,
-                    "Leave endpoint cannot process Join operations".to_string(),
-                ));
-            }
-        }
+    // v7 shims exist on the trait for spec compatibility. The sim has no
+    // callers for them.
+    async fn vmm_join_multicast_group_v7(
+        _rqctx: RequestContext<Self::Context>,
+        _path_params: Path<VmmPathParam>,
+        _body: TypedBody<v7::instance::InstanceMulticastBody>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
+    }
 
-        Ok(HttpResponseUpdatedNoContent())
+    async fn vmm_leave_multicast_group_v7(
+        _rqctx: RequestContext<Self::Context>,
+        _path_params: Path<VmmPathParam>,
+        _body: TypedBody<v7::instance::InstanceMulticastBody>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        unimplemented!()
     }
 
     async fn disk_put(
@@ -390,6 +386,66 @@
         Ok(HttpResponseOk(vnics))
     }
 
+    async fn set_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<Mcast2PhysMapping>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        sa.set_mcast_m2p(&body.into_inner())
+            .map_err(|e| HttpError::for_internal_error(e.to_string()))?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn clear_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<ClearMcast2Phys>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        sa.clear_mcast_m2p(&body.into_inner())
+            .map_err(|e| HttpError::for_internal_error(e.to_string()))?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn set_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<McastForwardingEntry>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        sa.set_mcast_fwd(&body.into_inner())
+            .map_err(|e| HttpError::for_internal_error(e.to_string()))?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn clear_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+        body: TypedBody<ClearMcastForwarding>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let sa = rqctx.context();
+        sa.clear_mcast_fwd(&body.into_inner())
+            .map_err(|e| HttpError::for_internal_error(e.to_string()))?;
+        Ok(HttpResponseUpdatedNoContent())
+    }
+
+    async fn list_mcast_m2p(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<Mcast2PhysMapping>>, HttpError> {
+        let sa = rqctx.context();
+        let m2p = sa
+            .list_mcast_m2p()
+            .map_err(|e| HttpError::for_internal_error(e.to_string()))?;
+        Ok(HttpResponseOk(m2p))
+    }
+
+    async fn list_mcast_fwd(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<McastForwardingEntry>>, HttpError> {
+        let sa = rqctx.context();
+        let fwd = sa
+            .list_mcast_fwd()
+            .map_err(|e| HttpError::for_internal_error(e.to_string()))?;
+        Ok(HttpResponseOk(fwd))
+    }
+
     async fn uplink_ensure(
         _rqctx: RequestContext<Self::Context>,
         _body: TypedBody<SwitchPorts>,
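// Editor's round-trip sketch (illustrative, not part of the patch): the sim
// endpoints above are thin wrappers over in-memory maps, so a set/list/clear
// cycle exercises them end to end. `sa` is the simulated `SledAgent` from the
// next file and `Error` its error type; the addresses are hypothetical. Only
// `McastForwardingNextHop` derives `PartialEq`, so the assertion compares
// next hops rather than whole entries.
fn mcast_fwd_round_trip(sa: &SledAgent) -> Result<(), Error> {
    let entry = McastForwardingEntry {
        underlay: "ff04::1:2".parse().unwrap(),
        next_hops: vec![McastForwardingNextHop {
            next_hop: "fd00:1122:3344:101::1".parse().unwrap(),
            replication: McastReplication::Underlay,
            filter: McastSourceFilter::default(),
        }],
    };
    sa.set_mcast_fwd(&entry)?;
    assert_eq!(sa.list_mcast_fwd()?[0].next_hops, entry.next_hops);
    sa.clear_mcast_fwd(&ClearMcastForwarding { underlay: entry.underlay })?;
    assert!(sa.list_mcast_fwd()?.is_empty());
    Ok(())
}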
diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs
index 384a1ed6ea7..5f8f1446b1c 100644
--- a/sled-agent/src/sim/sled_agent.rs
+++ b/sled-agent/src/sim/sled_agent.rs
@@ -38,8 +38,8 @@ use omicron_common::disk::{
     DisksManagementResult, OmicronPhysicalDisksConfig,
 };
 use omicron_uuid_kinds::{
-    DatasetUuid, GenericUuid, PhysicalDiskUuid, PropolisUuid, SledUuid,
-    SupportBundleUuid, ZpoolUuid,
+    DatasetUuid, GenericUuid, InstanceUuid, PhysicalDiskUuid, PropolisUuid,
+    SledUuid, SupportBundleUuid, ZpoolUuid,
 };
 use oxnet::{IpNet, Ipv6Net};
 use propolis_client::instance_spec::FileStorageBackend;
@@ -66,6 +66,10 @@ use sled_agent_types::inventory::{
     OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZonesConfig,
     SingleMeasurementInventory, SledRole, ZpoolHealth,
 };
+use sled_agent_types::multicast::{
+    ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping,
+    McastForwardingEntry, McastForwardingNextHop,
+};
 use sled_agent_types::support_bundle::SupportBundleMetadata;
 use sled_agent_types::system_networking::SystemNetworkingConfig;
 
@@ -96,6 +100,8 @@ pub struct SledAgent {
     pub nexus_client: Arc<NexusClient>,
     pub simulated_upstairs: Arc<SimulatedUpstairs>,
     pub v2p_mappings: Mutex<HashSet<VirtualNetworkInterfaceHost>>,
+    pub m2p_mappings: Mutex<HashSet<(IpAddr, Ipv6Addr)>>,
+    pub mcast_fwd: Mutex<HashMap<Ipv6Addr, Vec<McastForwardingNextHop>>>,
     mock_propolis: futures::lock::Mutex<
         Option<(propolis_mock_server::Server, PropolisClient)>,
     >,
@@ -105,9 +111,9 @@
     /// subnets attached to instances.
     pub attached_subnets: Mutex<HashMap<PropolisUuid, HashSet<IpNet>>>,
 
-    /// multicast group memberships for instances
-    pub multicast_groups:
-        Mutex<HashMap<PropolisUuid, HashSet<InstanceMulticastMembership>>>,
+    /// multicast group memberships, keyed by instance.
+    pub instance_multicast_groups:
+        Mutex<HashMap<InstanceUuid, HashSet<InstanceMulticastMembership>>>,
     pub vpc_routes: Mutex<HashMap<RouterId, RouteSet>>,
     config: Config,
     fake_zones: Mutex<OmicronZonesConfig>,
@@ -188,9 +194,11 @@
             nexus_client,
             simulated_upstairs,
             v2p_mappings: Mutex::new(HashSet::new()),
+            m2p_mappings: Mutex::new(HashSet::new()),
+            mcast_fwd: Mutex::new(HashMap::new()),
             external_ips: Mutex::new(HashMap::new()),
             attached_subnets: Mutex::new(HashMap::new()),
-            multicast_groups: Mutex::new(HashMap::new()),
+            instance_multicast_groups: Mutex::new(HashMap::new()),
             vpc_routes: Mutex::new(HashMap::new()),
             mock_propolis: futures::lock::Mutex::new(None),
             config: config.clone(),
@@ -676,6 +684,58 @@
         Ok(Vec::from_iter(v2p_mappings.clone()))
     }
 
+    pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> {
+        let mut m2p = self.m2p_mappings.lock().unwrap();
+        m2p.insert((req.group, req.underlay));
+        Ok(())
+    }
+
+    pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> {
+        let mut m2p = self.m2p_mappings.lock().unwrap();
+        m2p.remove(&(req.group, req.underlay));
+        Ok(())
+    }
+
+    pub fn set_mcast_fwd(
+        &self,
+        req: &McastForwardingEntry,
+    ) -> Result<(), Error> {
+        let mut fwd = self.mcast_fwd.lock().unwrap();
+        fwd.insert(req.underlay, req.next_hops.clone());
+        Ok(())
+    }
+
+    pub fn clear_mcast_fwd(
+        &self,
+        req: &ClearMcastForwarding,
+    ) -> Result<(), Error> {
+        let mut fwd = self.mcast_fwd.lock().unwrap();
+        fwd.remove(&req.underlay);
+        Ok(())
+    }
+
+    pub fn list_mcast_m2p(&self) -> Result<Vec<Mcast2PhysMapping>, Error> {
+        let m2p = self.m2p_mappings.lock().unwrap();
+        Ok(m2p
+            .iter()
+            .map(|(group, underlay)| Mcast2PhysMapping {
+                group: *group,
+                underlay: *underlay,
+            })
+            .collect())
+    }
+
+    pub fn list_mcast_fwd(&self) -> Result<Vec<McastForwardingEntry>, Error> {
+        let fwd = self.mcast_fwd.lock().unwrap();
+        Ok(fwd
+            .iter()
+            .map(|(underlay, next_hops)| McastForwardingEntry {
+                underlay: *underlay,
+                next_hops: next_hops.clone(),
+            })
+            .collect())
+    }
+
     pub async fn instance_put_external_ip(
         &self,
         propolis_id: PropolisUuid,
@@ -801,39 +861,21 @@
     pub async fn instance_join_multicast_group(
         &self,
-        propolis_id: PropolisUuid,
+        instance_id: InstanceUuid,
         membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
-        if !self.vmms.contains_key(&propolis_id.into_untyped_uuid()).await {
-            return Err(Error::internal_error(
-                "can't join multicast group for VMM that's not registered",
-            ));
-        }
-
-        let mut groups = self.multicast_groups.lock().unwrap();
-        let my_groups = groups.entry(propolis_id).or_default();
-
-        my_groups.insert(membership.clone());
-
+        let mut groups = self.instance_multicast_groups.lock().unwrap();
+        groups.entry(instance_id).or_default().insert(membership.clone());
         Ok(())
     }
 
     pub async fn instance_leave_multicast_group(
         &self,
-        propolis_id: PropolisUuid,
+        instance_id: InstanceUuid,
         membership: &InstanceMulticastMembership,
     ) -> Result<(), Error> {
-        if !self.vmms.contains_key(&propolis_id.into_untyped_uuid()).await {
-            return Err(Error::internal_error(
-                "can't leave multicast group for VMM that's not registered",
-            ));
-        }
-
-        let mut groups = self.multicast_groups.lock().unwrap();
-        let my_groups = groups.entry(propolis_id).or_default();
-
-        my_groups.remove(membership);
-
+        let mut groups =
self.instance_multicast_groups.lock().unwrap(); + groups.entry(instance_id).or_default().remove(membership); Ok(()) } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index e0be3e87daa..17ab7857c1b 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -56,7 +56,7 @@ use omicron_common::api::internal::shared::{ use omicron_common::zpool_name::ZpoolName; use omicron_ddm_admin_client::Client as DdmAdminClient; use omicron_uuid_kinds::{ - GenericUuid, MupdateOverrideUuid, PropolisUuid, SledUuid, + GenericUuid, InstanceUuid, MupdateOverrideUuid, PropolisUuid, SledUuid, }; use oximeter_instruments::http::LatencyTracker; use oxnet::IpNet; @@ -75,11 +75,15 @@ use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::instance::ResolvedVpcFirewallRule; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, SledVmmState, VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig, SledRole}; +use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use sled_agent_types::probes::ProbeCreate; use sled_agent_types::resolvable_files::{ PreparedOmicronZone, RemoveMupdateOverrideResult, ResolverStatus, @@ -411,7 +415,6 @@ struct SledAgentInner { // A handle to the trust quorum. trust_quorum: trust_quorum::NodeTaskHandle, - // A handle to the hardware monitor. hardware_monitor: HardwareMonitorHandle, @@ -580,6 +583,7 @@ impl SledAgent { let port_manager = PortManager::new( parent_log.new(o!("component" => "PortManager")), *sled_address.ip(), + &underlay_nics, ); // The VMM reservoir is configured with respect to what's left after @@ -1034,30 +1038,56 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + /// Subscribe an instance's active VMM OPTE port to a multicast group. + /// + /// The active Propolis ID is resolved inside the instance manager so + /// that resolution and dispatch are serialized with other per-instance + /// state changes. A no-active-VMM result is treated as a successful + /// no-op since the OPTE port no longer exists. pub async fn instance_join_multicast_group( &self, - propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + instance_id: InstanceUuid, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .join_multicast_group(propolis_id, multicast_body) + .join_multicast_group_by_instance(instance_id, membership) .await .map_err(|e| Error::Instance(e)) } + /// Unsubscribe an instance's active VMM OPTE port from a multicast group. + /// + /// See [`Self::instance_join_multicast_group`] for the resolution and + /// no-active-VMM semantics. pub async fn instance_leave_multicast_group( &self, - propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + instance_id: InstanceUuid, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .leave_multicast_group(propolis_id, multicast_body) + .leave_multicast_group_by_instance(instance_id, membership) .await .map_err(|e| Error::Instance(e)) } + /// Resolve a Propolis ID to its registered instance ID. + /// + /// # Returns + /// + /// `Ok(None)` if no instance is registered with that Propolis ID. 
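+    ///
+    /// Editor's sketch of the intended translation for VMM-keyed callers
+    /// (hypothetical; the real v7 shim lives in the HTTP layer):
+    ///
+    /// ```ignore
+    /// let Some(instance_id) =
+    ///     sled_agent.instance_id_for_propolis(propolis_id).await?
+    /// else {
+    ///     return Ok(()); // VMM gone; the membership change is a no-op
+    /// };
+    /// sled_agent.instance_join_multicast_group(instance_id, &membership).await?;
+    /// ```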
+    pub async fn instance_id_for_propolis(
+        &self,
+        propolis_id: PropolisUuid,
+    ) -> Result<Option<InstanceUuid>, Error> {
+        self.inner
+            .instances
+            .instance_id_for_propolis(propolis_id)
+            .await
+            .map_err(Error::Instance)
+    }
+
     /// Returns the state of the instance with the provided ID.
     pub async fn instance_get_state(
         &self,
@@ -1138,6 +1168,52 @@
             .map_err(Error::from)
     }
 
+    /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE.
+    pub async fn set_mcast_m2p(
+        &self,
+        req: &Mcast2PhysMapping,
+    ) -> Result<(), Error> {
+        self.inner.port_manager.set_mcast_m2p(req).map_err(Error::from)
+    }
+
+    /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE.
+    pub async fn clear_mcast_m2p(
+        &self,
+        req: &ClearMcast2Phys,
+    ) -> Result<(), Error> {
+        self.inner.port_manager.clear_mcast_m2p(req).map_err(Error::from)
+    }
+
+    /// Set multicast forwarding next hops for an underlay group address.
+    pub async fn set_mcast_fwd(
+        &self,
+        req: &McastForwardingEntry,
+    ) -> Result<(), Error> {
+        self.inner.port_manager.set_mcast_fwd(req).map_err(Error::from)
+    }
+
+    /// Remove multicast forwarding entries for an underlay group address.
+    pub async fn clear_mcast_fwd(
+        &self,
+        req: &ClearMcastForwarding,
+    ) -> Result<(), Error> {
+        self.inner.port_manager.clear_mcast_fwd(req).map_err(Error::from)
+    }
+
+    /// List all multicast M2P mappings from OPTE.
+    pub async fn list_mcast_m2p(
+        &self,
+    ) -> Result<Vec<Mcast2PhysMapping>, Error> {
+        self.inner.port_manager.list_mcast_m2p().map_err(Error::from)
+    }
+
+    /// List all multicast forwarding entries from OPTE.
+    pub async fn list_mcast_fwd(
+        &self,
+    ) -> Result<Vec<McastForwardingEntry>, Error> {
+        self.inner.port_manager.list_mcast_fwd().map_err(Error::from)
+    }
+
     pub async fn ensure_scrimlet_host_ports(
         &self,
         uplinks: Vec<HostPortConfig>,
diff --git a/sled-agent/types/src/lib.rs b/sled-agent/types/src/lib.rs
index 2d87bbc1761..a1885b4aa66 100644
--- a/sled-agent/types/src/lib.rs
+++ b/sled-agent/types/src/lib.rs
@@ -16,6 +16,7 @@ pub mod early_networking;
 pub mod firewall_rules;
 pub mod instance;
 pub mod inventory;
+pub mod multicast;
 pub mod probes;
 pub mod rack_init;
 pub mod resolvable_files;
diff --git a/sled-agent/types/src/multicast.rs b/sled-agent/types/src/multicast.rs
new file mode 100644
index 00000000000..27e95a0d94c
--- /dev/null
+++ b/sled-agent/types/src/multicast.rs
@@ -0,0 +1,7 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Multicast networking types for the Sled Agent API.
+ +pub use sled_agent_types_versions::latest::multicast::*; diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index a89d8fedf2b..d988af59722 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -89,6 +89,18 @@ pub mod firewall_rules { pub use crate::v31::firewall_rules::VpcFirewallRulesEnsureBody; } +pub mod multicast { + pub use crate::v36::multicast::ClearMcast2Phys; + pub use crate::v36::multicast::ClearMcastForwarding; + pub use crate::v36::multicast::Mcast2PhysMapping; + pub use crate::v36::multicast::McastFilterMode; + pub use crate::v36::multicast::McastForwardingEntry; + pub use crate::v36::multicast::McastForwardingNextHop; + pub use crate::v36::multicast::McastReplication; + pub use crate::v36::multicast::McastSourceFilter; + pub use crate::v36::multicast::MulticastGroupCfg; +} + pub mod instance { pub use crate::v1::instance::InstanceExternalIpBody; pub use crate::v1::instance::InstanceMetadata; @@ -121,6 +133,8 @@ pub mod instance { pub use crate::v32::instance::ExternalIpv6Config; pub use crate::v32::instance::InstanceEnsureBody; pub use crate::v32::instance::InstanceSledLocalConfig; + + pub use crate::v36::instance::InstancePathParam; } pub mod inventory { diff --git a/sled-agent/types/versions/src/lib.rs b/sled-agent/types/versions/src/lib.rs index 69a6e70fcd9..5a2e2fe7cb3 100644 --- a/sled-agent/types/versions/src/lib.rs +++ b/sled-agent/types/versions/src/lib.rs @@ -79,6 +79,8 @@ pub mod v32; pub mod v33; #[path = "modify_svcs_types/mod.rs"] pub mod v34; +#[path = "mcast_m2p_forwarding/mod.rs"] +pub mod v36; #[path = "add_nexus_lockstep_port_to_inventory/mod.rs"] pub mod v4; #[path = "add_probe_put_endpoint/mod.rs"] diff --git a/sled-agent/types/versions/src/mcast_m2p_forwarding/instance.rs b/sled-agent/types/versions/src/mcast_m2p_forwarding/instance.rs new file mode 100644 index 00000000000..15d0b48417b --- /dev/null +++ b/sled-agent/types/versions/src/mcast_m2p_forwarding/instance.rs @@ -0,0 +1,13 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use omicron_uuid_kinds::InstanceUuid; +use schemars::JsonSchema; +use serde::Deserialize; + +/// Path parameters for an instance-scoped request. +#[derive(Deserialize, JsonSchema)] +pub struct InstancePathParam { + pub instance_id: InstanceUuid, +} diff --git a/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs b/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs new file mode 100644 index 00000000000..6914c2d856b --- /dev/null +++ b/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs @@ -0,0 +1,11 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Version `MCAST_M2P_FORWARDING` of the Sled Agent API. +//! +//! Adds multicast-to-physical mapping and forwarding types, and moves the +//! multicast subscription endpoints from VMM-keyed to instance-keyed. 
+
+pub mod instance;
+pub mod multicast;
diff --git a/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs b/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs
new file mode 100644
index 00000000000..5c2247c1159
--- /dev/null
+++ b/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs
@@ -0,0 +1,132 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+// Copyright 2026 Oxide Computer Company
+
+//! Multicast networking types for the sled-agent API.
+//!
+//! These types support overlay-to-underlay multicast mapping and
+//! multicast forwarding configuration via OPTE. The underlay address
+//! space is ff04::/64, a subset of admin-local scope per
+//! [RFC 7346](https://www.rfc-editor.org/rfc/rfc7346).
+
+use std::net::IpAddr;
+use std::net::Ipv6Addr;
+
+use schemars::JsonSchema;
+use serde::Deserialize;
+use serde::Serialize;
+
+/// Mapping from an overlay multicast group to an underlay multicast
+/// address.
+///
+/// The underlay address must be within the underlay multicast subnet
+/// (ff04::/64). This invariant is enforced by the mapping logic in
+/// Nexus, not validated at this layer.
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
+pub struct Mcast2PhysMapping {
+    /// Overlay multicast group address.
+    pub group: IpAddr,
+    /// Underlay IPv6 multicast address (ff04::/64).
+    pub underlay: Ipv6Addr,
+}
+
+/// Clear a mapping from an overlay multicast group to an underlay
+/// multicast address.
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
+pub struct ClearMcast2Phys {
+    /// Overlay multicast group address.
+    pub group: IpAddr,
+    /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].
+    pub underlay: Ipv6Addr,
+}
+
+/// Forwarding entry for an underlay multicast address, specifying
+/// which next hops should receive replicated packets.
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
+pub struct McastForwardingEntry {
+    /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].
+    pub underlay: Ipv6Addr,
+    /// Next hops with replication and source filter configuration.
+    pub next_hops: Vec<McastForwardingNextHop>,
+}
+
+/// Clear all forwarding entries for an underlay multicast address.
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
+pub struct ClearMcastForwarding {
+    /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].
+    pub underlay: Ipv6Addr,
+}
+
+/// A forwarding next hop with replication mode and aggregated
+/// source filter.
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
+pub struct McastForwardingNextHop {
+    /// Unicast IPv6 address of the destination sled.
+    pub next_hop: Ipv6Addr,
+    /// Replication mode for this next hop.
+    pub replication: McastReplication,
+    /// Aggregated source filter for this destination.
+    pub filter: McastSourceFilter,
+}
+
+/// Replication mode for multicast forwarding.
+#[derive(
+    Clone, Copy, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq,
+)]
+#[serde(rename_all = "snake_case")]
+pub enum McastReplication {
+    /// Replicate to front panel ports (egress to external networks).
+    External,
+    /// Replicate to sled underlay ports.
+    Underlay,
+    /// Replicate to both external and underlay ports.
+    Both,
+}
+
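+// Editor's illustration (hypothetical addresses, not part of the patch): an
+// overlay group carried on an admin-local (ff04::/64) underlay group, as the
+// Nexus mapping would allocate it, with the matching clear request:
+//
+//     let mapping = Mcast2PhysMapping {
+//         group: "239.1.2.3".parse().unwrap(),
+//         underlay: "ff04::1:2:3".parse().unwrap(),
+//     };
+//     let clear = ClearMcast2Phys {
+//         group: mapping.group,
+//         underlay: mapping.underlay,
+//     };
+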
+/// Source filter for multicast forwarding.
+#[derive(
+    Clone, Debug, Default, Deserialize, Serialize, JsonSchema, PartialEq,
+)]
+pub struct McastSourceFilter {
+    /// Filter mode.
+    pub mode: McastFilterMode,
+    /// Source addresses to include or exclude.
+    pub sources: Vec<IpAddr>,
+}
+
+/// Filter mode for multicast source filtering.
+#[derive(
+    Clone,
+    Copy,
+    Debug,
+    Default,
+    Deserialize,
+    Serialize,
+    JsonSchema,
+    PartialEq,
+    Eq,
+)]
+#[serde(rename_all = "snake_case")]
+pub enum McastFilterMode {
+    /// Accept only packets from listed sources (SSM).
+    Include,
+    /// Accept packets from all sources except those listed.
+    /// With an empty sources list this is any-source multicast (ASM).
+    #[default]
+    Exclude,
+}
+
+/// Declarative multicast group subscription for an OPTE port.
+///
+/// Represents a single group membership with optional source filtering.
+/// Empty `sources` means any-source multicast (ASM) and non-empty means
+/// source-specific multicast (SSM).
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)]
+pub struct MulticastGroupCfg {
+    /// The multicast group IP address (IPv4 or IPv6).
+    pub group_ip: IpAddr,
+    /// Source addresses for source-filtered multicast.
+    pub sources: Vec<IpAddr>,
+}
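// Editor's illustration (hypothetical values, not part of the patch): ASM
// vs. SSM subscriptions as an OPTE port would express them with
// `MulticastGroupCfg`.
fn example_subscriptions() -> Vec<MulticastGroupCfg> {
    vec![
        // Any-source multicast: accept traffic for the group from all senders.
        MulticastGroupCfg {
            group_ip: "239.10.0.1".parse().unwrap(),
            sources: vec![],
        },
        // Source-specific multicast: accept only the listed sender.
        MulticastGroupCfg {
            group_ip: "232.1.1.1".parse().unwrap(),
            sources: vec!["10.0.0.5".parse().unwrap()],
        },
    ]
}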
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index 7a43386e375..9f36f737dde 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -112,9 +112,6 @@ fm.rendezvous_period_secs = 300
 probe_distributor.period_secs = 60
 multicast_reconciler.period_secs = 60
 trust_quorum.period_secs = 60
-# TTL for sled-to-backplane-port mapping cache
-# Default: 3600 seconds (1 hour) - detects new sleds and inventory changes
-# multicast_reconciler.sled_cache_ttl_secs = 3600
 # TTL for backplane topology cache (static platform configuration)
 # Default: 86400 seconds (24 hours) - refreshed on-demand when validation fails
 # multicast_reconciler.backplane_cache_ttl_secs = 86400
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index 11863e1c681..01f32525e24 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -112,9 +112,6 @@ fm.rendezvous_period_secs = 300
 probe_distributor.period_secs = 60
 trust_quorum.period_secs = 60
 multicast_reconciler.period_secs = 60
-# TTL for sled-to-backplane-port mapping cache
-# Default: 3600 seconds (1 hour) - detects new sleds and inventory changes
-# multicast_reconciler.sled_cache_ttl_secs = 3600
 # TTL for backplane topology cache (static platform configuration)
 # Default: 86400 seconds (24 hours) - refreshed on-demand when validation fails
 # multicast_reconciler.backplane_cache_ttl_secs = 86400
diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml
index ad4b8d303c7..a53a72af2b2 100644
--- a/test-utils/Cargo.toml
+++ b/test-utils/Cargo.toml
@@ -26,6 +26,8 @@ pem.workspace = true
 regex.workspace = true
 ring.workspace = true
 rustls.workspace = true
+schemars.workspace = true
+serde.workspace = true
 slog.workspace = true
 subprocess.workspace = true
 tempfile.workspace = true
diff --git a/test-utils/src/dev/maghemite.rs b/test-utils/src/dev/maghemite.rs
index 4c2d85df3ee..e3880c3a622 100644
--- a/test-utils/src/dev/maghemite.rs
+++ b/test-utils/src/dev/maghemite.rs
@@ -163,6 +163,167 @@ async fn find_mgd_port_in_log(logfile: String) -> Result<u16> {
     }
 }
 
+/// Simulated DDM admin API instance for integration tests.
+///
+/// `ddmd` only runs on real switch zones (requires `libnet`,
+/// `opte-ioctl`, and physical network interfaces), so it is not
+/// available in the test toolchain like `mgd` is. This provides a
+/// dropshot server implementing the DDM admin API endpoints that
+/// omicron consumes, backed by in-memory state.
+///
+/// Peers can be provided at construction time and updated at runtime
+/// via `set_peers`.
+pub struct DdmInstance {
+    pub port: u16,
+    server: Option<dropshot::HttpServer<DdmSimContext>>,
+}
+
+/// Server-side peer info type for the DDM sim.
+///
+/// Mirrors the wire format of the progenitor-generated
+/// `ddm_admin_client::types::PeerInfo` but derives `JsonSchema` so
+/// dropshot can serve it directly.
+#[derive(
+    Clone, Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema,
+)]
+pub struct SimPeerInfo {
+    pub addr: std::net::Ipv6Addr,
+    pub host: String,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub if_name: Option<String>,
+    pub kind: i64,
+    pub status: SimPeerStatus,
+}
+
+#[derive(
+    Clone,
+    Debug,
+    serde::Serialize,
+    serde::Deserialize,
+    schemars::JsonSchema,
+    PartialEq,
+)]
+pub enum SimPeerStatus {
+    NoContact,
+    Active,
+    Expired,
+}
+
+pub type PeerMap = std::collections::HashMap<String, SimPeerInfo>;
+
+pub struct DdmSimContext {
+    peers: std::sync::Mutex<PeerMap>,
+}
+
+/// Configuration for starting a `DdmInstance`.
+pub struct DdmInstanceConfig {
+    /// Initial peers to serve from `GET /peers`.
+    pub initial_peers: PeerMap,
+    /// Port to bind on (0 for auto-assign).
+    pub port: u16,
+    /// Logger for the dropshot server.
+    pub log: slog::Logger,
+}
+
+impl Default for DdmInstanceConfig {
+    fn default() -> Self {
+        Self {
+            initial_peers: PeerMap::new(),
+            port: 0,
+            log: slog::Logger::root(slog::Discard, slog::o!()),
+        }
+    }
+}
+
+impl DdmInstance {
+    /// Start a DDM sim server with default configuration (no peers,
+    /// auto-assigned port).
+    pub async fn start() -> Result<Self> {
+        Self::start_with_config(DdmInstanceConfig::default()).await
+    }
+
+    /// Start a DDM sim server with the provided configuration.
+    pub async fn start_with_config(
+        config: DdmInstanceConfig,
+    ) -> Result<Self> {
+        let context = DdmSimContext {
+            peers: std::sync::Mutex::new(config.initial_peers),
+        };
+
+        let dropshot_config = dropshot::ConfigDropshot {
+            bind_address: std::net::SocketAddr::V6(
+                std::net::SocketAddrV6::new(
+                    std::net::Ipv6Addr::LOCALHOST,
+                    config.port,
+                    0,
+                    0,
+                ),
+            ),
+            ..Default::default()
+        };
+
+        let mut api = dropshot::ApiDescription::new();
+        api.register(ddm_get_peers).unwrap();
+
+        let server = dropshot::HttpServerStarter::new(
+            &dropshot_config,
+            api,
+            context,
+            &config.log,
+        )
+        .map_err(|e| anyhow::anyhow!("failed to start DDM sim server: {e}"))?
+        .start();
+
+        let port = server.local_addr().port();
+        Ok(Self { port, server: Some(server) })
+    }
+
+    /// Replace the peers returned by `GET /peers`.
+    pub fn set_peers(&self, peers: PeerMap) {
+        let server = self.server.as_ref().expect("DDM sim server not running");
+        *server.app_private().peers.lock().unwrap() = peers;
+    }
+
+    pub async fn cleanup(&mut self) {
+        if let Some(server) = self.server.take() {
+            server.close().await.expect("failed to close DDM sim server");
+        }
+    }
+}
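+
+// Editor's usage sketch (hypothetical test, not part of the patch): start
+// the sim with one peer, point a DDM admin client at
+// `http://[::1]:{port}/peers`, then swap the peer set at runtime.
+//
+//     let mut peers = PeerMap::new();
+//     peers.insert(
+//         "sled0".to_string(),
+//         sim_peer_info(
+//             "fd00:1122:3344:101::1".parse().unwrap(),
+//             "sled0",
+//             "tfportrear0_0",
+//             0,
+//             SimPeerStatus::Active,
+//         ),
+//     );
+//     let mut ddm = DdmInstance::start_with_config(DdmInstanceConfig {
+//         initial_peers: peers,
+//         ..Default::default()
+//     })
+//     .await?;
+//     ddm.set_peers(PeerMap::new()); // now serves an empty peer map
+//     ddm.cleanup().await;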
+
+/// Build a `SimPeerInfo` for a sled connected through the specified
+/// switch port interface.
+///
+/// `host` identifies the peer in DDM (typically the sled hostname).
+/// `if_name` is the switch port interface (e.g., `"tfportrear0_0"`).
+/// `kind` is the DDM router kind (0 = server, 1 = transit).
+pub fn sim_peer_info(
+    addr: std::net::Ipv6Addr,
+    host: &str,
+    if_name: &str,
+    kind: i64,
+    status: SimPeerStatus,
+) -> SimPeerInfo {
+    SimPeerInfo {
+        addr,
+        host: host.to_string(),
+        if_name: Some(if_name.to_string()),
+        kind,
+        status,
+    }
+}
+
+#[dropshot::endpoint {
+    method = GET,
+    path = "/peers",
+}]
+async fn ddm_get_peers(
+    rqctx: dropshot::RequestContext<DdmSimContext>,
+) -> Result<dropshot::HttpResponseOk<PeerMap>, dropshot::HttpError> {
+    let peers = rqctx.context().peers.lock().unwrap().clone();
+    Ok(dropshot::HttpResponseOk(peers))
+}
+
 #[cfg(test)]
 mod tests {
     use super::find_mgd_port_in_log;
diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums
index 58203e10a2b..9839d02f01f 100644
--- a/tools/dendrite_stub_checksums
+++ b/tools/dendrite_stub_checksums
@@ -1,3 +1,3 @@
-CIDL_SHA256_ILLUMOS="d899f9a761bb04bc9b9c88995883196dd691b758de547f7b1836344db5bd5080"
-CIDL_SHA256_LINUX_DPD="642b4c99fd9ac3ed7ee8785b2716135c2fe8347812d22bd9f53bbdd3ef9d5e0d"
+CIDL_SHA256_ILLUMOS="bf93c4d2c6139dca1bf0abab39be25b20b434d998212d08fd6b2df7b015af268"
+CIDL_SHA256_LINUX_DPD="090cb4aa86b674e8cc69ccc5f8d7e7cd146641d6d18bdecff89cc330ab956ab4"
 CIDL_SHA256_LINUX_SWADM="26c52328b5db50a77f903579521c7e6a89d36e851ba9ecf536aa6db749a7c0e4"
diff --git a/tools/dendrite_version b/tools/dendrite_version
index db21e2d6820..884f50ff594 100644
--- a/tools/dendrite_version
+++ b/tools/dendrite_version
@@ -1 +1 @@
-COMMIT="1ddaa5d6b101fbaa2c29eca847111cbef1a272ad"
+COMMIT="e10e4f5a993fe950ab1b478abb5dcbfa7aa92091"
diff --git a/tools/install_opte.sh b/tools/install_opte.sh
index d56523764d9..1f649ec473d 100755
--- a/tools/install_opte.sh
+++ b/tools/install_opte.sh
@@ -51,6 +51,14 @@ fi
 # Grab the version of the opte package to install
 OPTE_VERSION="$(cat "$OMICRON_TOP/tools/opte_version")"
 
+# Check for an OPTE override. When set, the desired OPTE version isn't
+# published to the helios pkg repo yet, so we download and install directly
+# from the override p5p built by OPTE CI on buildomat.
+source "$OMICRON_TOP/tools/opte_version_override"
+if [[ "x$OPTE_COMMIT" != "x" ]]; then
+    echo "OPTE override active: installing from p5p for commit $OPTE_COMMIT"
+fi
+
 OMICRON_FROZEN_PKG_COMMENT="OMICRON-PINNED-PACKAGE"
 
 # Once we install, we mark the package as frozen at that particular version.
@@ -71,16 +79,36 @@ if PKG_FROZEN=$(pkg freeze | grep driver/network/opte); then
     pfexec pkg unfreeze driver/network/opte
 fi
 
-# Actually install the xde kernel module and opteadm tool
-RC=0
-pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$?
-if [[ "$RC" -eq 0 ]]; then
-    echo "xde driver installed successfully"
-elif [[ "$RC" -eq 4 ]]; then
-    echo "Correct xde driver already installed"
+if [[ "x$OPTE_COMMIT" != "x" ]]; then
+    # Install from the override p5p archive built by OPTE CI.
+    P5P_URL="https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/$OPTE_COMMIT/opte.p5p"
+    P5P_PATH="/tmp/opte-override.p5p"
+    echo "Downloading override p5p from $P5P_URL"
+    curl -fL -o "$P5P_PATH" "$P5P_URL"
+
+    RC=0
+    pfexec pkg install -g "$P5P_PATH" "driver/network/opte@$OPTE_VERSION" || RC=$?
+    if [[ "$RC" -eq 0 ]]; then
+        echo "xde driver installed from override p5p"
+    elif [[ "$RC" -eq 4 ]]; then
+        echo "Correct xde driver already installed"
+    else
+        echo "Installing xde driver from override p5p failed"
+        exit "$RC"
+    fi
+    rm -f "$P5P_PATH"
 else
-    echo "Installing xde driver failed"
-    exit "$RC"
+    # Install the published version from the helios pkg repo.
+ RC=0 + pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$? + if [[ "$RC" -eq 0 ]]; then + echo "xde driver installed successfully" + elif [[ "$RC" -eq 4 ]]; then + echo "Correct xde driver already installed" + else + echo "Installing xde driver failed" + exit "$RC" + fi fi RC=0 @@ -97,13 +125,3 @@ if [[ "$RC" -ne 0 ]]; then echo "The \`opteadm\` administration tool is not on your path." echo "You may add \"/opt/oxide/opte/bin\" to your path to access it." fi - -source $OMICRON_TOP/tools/opte_version_override - -if [[ "x$OPTE_COMMIT" != "x" ]]; then - set +x - curl -fOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true -fi diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index e1553c6f6af..7580a9c8471 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1 +1 @@ -COMMIT="4d1f20f793da102b29b914569725ebc9fdf746dd" +COMMIT="c3c3032f8bdc91d6faf2b36e05b8375a0980765c" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index e1553c6f6af..7580a9c8471 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1 +1 @@ -COMMIT="4d1f20f793da102b29b914569725ebc9fdf746dd" +COMMIT="c3c3032f8bdc91d6faf2b36e05b8375a0980765c" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 36429923417..9ff2d430404 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="dd07d2ea491842cce28fd4eabc0f957f7672a75a8e4d92c31630d4332cb40ebd" -MGD_LINUX_SHA256="64aadc95e3e9bbd8aa70fa15aada3667932e252d1b4dd253ecc76f2d8552d6cd" \ No newline at end of file +CIDL_SHA256="f65bf058322013feb2b5771e24046b0c6953d4e9324f8f48374caf7565845851" +MGD_LINUX_SHA256="98037bfe6718840beccab1e3f9d063406e2aabfe4a5b548d7301a2a9637042a8" \ No newline at end of file diff --git a/tools/opte_version_override b/tools/opte_version_override index 8d57f7ae9f4..66995188d0d 100644 --- a/tools/opte_version_override +++ b/tools/opte_version_override @@ -1,5 +1,18 @@ #!/usr/bin/env bash -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script -OPTE_COMMIT="" +# Override for using an unpublished OPTE version. When OPTE_COMMIT is set, +# the override p5p package is downloaded from buildomat and used instead of +# the version published in the helios pkg repo. The p5p is built by the +# opte-p5p buildomat job and published at: +# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/{commit}/opte.p5p +# +# Consumers: +# - install_opte.sh installs directly from the override p5p +# - releng image builds use extra_packages with the p5p as a pkg source +# - deploy.sh installs from the override p5p on the running system +# - ci_check_opte_ver.sh skips version consistency checks +# +# To activate: set OPTE_COMMIT to the git commit hash of the OPTE build. +# +# To deactivate (once the new version is published): set OPTE_COMMIT to "". +OPTE_COMMIT="c570ac2126dbbebbd8e98e73b580c5be6b7e460e"