From 042f398c27e01ae1170fc20e9a4cab0e02414066 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Tue, 5 May 2026 07:29:07 +0000 Subject: [PATCH] [internal-dns] register and publish ddmd in the switch zone DDMD has always run in the switch zone alongside Dendrite, MGS, and MGD, but it was never registered in internal DNS, leaving no path for a cross-host consumer to discover it. This adds `ServiceName::Ddm`, plumbs `ddm_port` through the host-zone switch (RSS plan + reconfigurator DNS execution), threads an `Overridables::ddm_ports` map for the test suite, and lands a `DdmInstance` dropshot sim in test utils so that the test harness registers a real DDM port in DNS the same way it does for the other switch-zone services. We also drop the duplicate DDMD_PORT const in `ddm-admin-client` in favor of the canonical `omicron_common::address::DDMD_PORT`. Same-host callers continue to use `Client::localhost()`. This was extracted from the multicast PR (zl/multicast-mgd-ddm), which uses ddmd cross-host as the first DNS-resolved consumer, as Nexus is the consumer. --- clients/ddm-admin-client/src/lib.rs | 4 +- internal-dns/types/src/config.rs | 71 +++++++++++++++- internal-dns/types/src/names.rs | 5 +- nexus/reconfigurator/execution/src/dns.rs | 7 +- .../execution/src/test_utils.rs | 2 + nexus/reconfigurator/planning/src/example.rs | 3 +- nexus/test-utils/src/nexus_test.rs | 4 + nexus/test-utils/src/starter.rs | 30 +++++++ .../tests/integration_tests/initialization.rs | 7 ++ nexus/types/src/deployment/execution/dns.rs | 1 + .../src/deployment/execution/overridables.rs | 13 +++ sled-agent/rack-setup/src/plan/service.rs | 9 ++- test-utils/src/dev/maghemite.rs | 80 +++++++++++++++++++ 13 files changed, 222 insertions(+), 14 deletions(-) diff --git a/clients/ddm-admin-client/src/lib.rs b/clients/ddm-admin-client/src/lib.rs index 7a8b56d499d..466a8883918 100644 --- a/clients/ddm-admin-client/src/lib.rs +++ b/clients/ddm-admin-client/src/lib.rs @@ -13,6 +13,7 @@ pub use ddm_admin_client::types; use ddm_admin_client::Client as InnerClient; use either::Either; +use omicron_common::address::DDMD_PORT; use oxnet::Ipv6Net; use sled_hardware_types::underlay::BOOTSTRAP_MASK; use sled_hardware_types::underlay::BOOTSTRAP_PREFIX; @@ -26,9 +27,6 @@ use thiserror::Error; use crate::types::EnableStatsRequest; -// TODO-cleanup Is it okay to hardcode this port number here? -const DDMD_PORT: u16 = 8000; - #[derive(Debug, Error, SlogInlineError)] pub enum DdmError { #[error("Failed to construct an HTTP client:")] diff --git a/internal-dns/types/src/config.rs b/internal-dns/types/src/config.rs index d5bef144343..f6b04753a77 100644 --- a/internal-dns/types/src/config.rs +++ b/internal-dns/types/src/config.rs @@ -399,6 +399,7 @@ impl DnsConfigBuilder { dendrite_port: u16, mgs_port: u16, mgd_port: u16, + ddm_port: u16, ) -> anyhow::Result<()> { let zone = self.host_dendrite(sled_id, switch_zone_ip)?; self.service_backend_zone(ServiceName::Dendrite, &zone, dendrite_port)?; @@ -407,7 +408,8 @@ impl DnsConfigBuilder { &zone, mgs_port, )?; - self.service_backend_zone(ServiceName::Mgd, &zone, mgd_port) + self.service_backend_zone(ServiceName::Mgd, &zone, mgd_port)?; + self.service_backend_zone(ServiceName::Ddm, &zone, ddm_port) } /// Higher-level shorthand for adding a Nexus zone with both its internal @@ -731,7 +733,7 @@ impl DnsConfigBuilder { #[cfg(test)] mod test { - use super::{DnsConfigBuilder, Host, ServiceName}; + use super::{DnsConfigBuilder, DnsRecord, Host, ServiceName}; use crate::{config::Zone, names::DNS_ZONE}; use omicron_common::api::external::Generation; use omicron_uuid_kinds::{OmicronZoneUuid, SledUuid}; @@ -779,6 +781,8 @@ mod test { "_oximeter-reader._tcp", ); assert_eq!(ServiceName::Dendrite.dns_name(), "_dendrite._tcp",); + assert_eq!(ServiceName::Mgd.dns_name(), "_mgd._tcp",); + assert_eq!(ServiceName::Ddm.dns_name(), "_ddm._tcp",); assert_eq!( ServiceName::CruciblePantry.dns_name(), "_crucible-pantry._tcp", @@ -796,6 +800,69 @@ mod test { ); } + #[test] + fn host_zone_switch_publishes_all_services() { + let sled_uuid: SledUuid = + "001de000-51ed-4000-8000-000000000001".parse().unwrap(); + let switch_zone_ip = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1); + + // Use distinct port numbers so an arg-order swap in `host_zone_switch` + // surfaces as a port mismatch on the affected service. + let dendrite_port = 11; + let mgs_port = 13; + let mgd_port = 17; + let ddm_port = 19; + + let mut builder = DnsConfigBuilder::new(); + builder + .host_zone_switch( + sled_uuid, + switch_zone_ip, + dendrite_port, + mgs_port, + mgd_port, + ddm_port, + ) + .unwrap(); + + let config = builder.build_full_config_for_initial_generation(); + + let mut by_name: BTreeMap<&str, &[DnsRecord]> = BTreeMap::new(); + for zone in &config.zones { + for (name, records) in &zone.records { + by_name.insert(name.as_str(), records.as_slice()); + } + } + + for (expected_name, expected_port) in [ + ("_dendrite._tcp", dendrite_port), + ("_mgs._tcp", mgs_port), + ("_mgd._tcp", mgd_port), + ("_ddm._tcp", ddm_port), + ] { + let records = by_name.get(expected_name).unwrap_or_else(|| { + panic!( + "expected {expected_name} in published switch-zone \ + services; got {by_name:?}" + ) + }); + let srv_port = records + .iter() + .find_map(|r| match r { + DnsRecord::Srv(s) => Some(s.port), + _ => None, + }) + .unwrap_or_else(|| { + panic!("no SRV record for {expected_name}: {records:?}") + }); + + assert_eq!( + srv_port, expected_port, + "wrong SRV port for {expected_name}" + ); + } + } + #[test] fn display_hosts() { let sled_uuid = SledUuid::nil(); diff --git a/internal-dns/types/src/names.rs b/internal-dns/types/src/names.rs index 73b2439e48e..105d0222f3c 100644 --- a/internal-dns/types/src/names.rs +++ b/internal-dns/types/src/names.rs @@ -75,6 +75,7 @@ pub enum ServiceName { BoundaryNtp, InternalNtp, Mgd, + Ddm, } impl ServiceName { @@ -116,6 +117,7 @@ impl ServiceName { ServiceName::BoundaryNtp => "boundary-ntp", ServiceName::InternalNtp => "internal-ntp", ServiceName::Mgd => "mgd", + ServiceName::Ddm => "ddm", } } @@ -144,7 +146,8 @@ impl ServiceName { | ServiceName::CruciblePantry | ServiceName::BoundaryNtp | ServiceName::InternalNtp - | ServiceName::Mgd => { + | ServiceName::Mgd + | ServiceName::Ddm => { format!("_{}._tcp", self.service_kind()) } ServiceName::SledAgent(id) => { diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 685c7c85e6f..0a85c4dd114 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -988,9 +988,8 @@ mod test { // the previous pass (i.e., that corresponds to an Omicron zone). // // There are some ServiceNames missing here because they are not part of - // our representative config (e.g., ClickhouseKeeper) or they don't - // currently have DNS record at all (e.g., SledAgent, Maghemite, Mgd, - // Tfport). + // our representative config (e.g., ClickhouseKeeper) or because they + // do not currently have a DNS record at all (e.g., SledAgent). let mut srv_kinds_expected = BTreeSet::from([ ServiceName::Clickhouse, ServiceName::ClickhouseNative, @@ -1001,6 +1000,8 @@ mod test { ServiceName::NexusLockstep, ServiceName::Oximeter, ServiceName::Dendrite, + ServiceName::Mgd, + ServiceName::Ddm, ServiceName::CruciblePantry, ServiceName::BoundaryNtp, ServiceName::InternalNtp, diff --git a/nexus/reconfigurator/execution/src/test_utils.rs b/nexus/reconfigurator/execution/src/test_utils.rs index cd46adacd0b..fdb17289225 100644 --- a/nexus/reconfigurator/execution/src/test_utils.rs +++ b/nexus/reconfigurator/execution/src/test_utils.rs @@ -113,10 +113,12 @@ pub fn overridables_for_test( let dendrite_port = cptestctx.dendrite.read().unwrap().get(&switch_slot).unwrap().port; let mgd_port = cptestctx.mgd.get(&switch_slot).unwrap().port; + let ddm_port = cptestctx.ddm.get(&switch_slot).unwrap().port; overrides.override_switch_zone_ip(sled_id, ip); overrides.override_dendrite_port(sled_id, dendrite_port); overrides.override_mgs_port(sled_id, mgs_port); overrides.override_mgd_port(sled_id, mgd_port); + overrides.override_ddm_port(sled_id, ddm_port); } overrides } diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index a1f865e2934..7dbbf3640dc 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -1854,7 +1854,8 @@ mod tests { | ServiceName::RepoDepot | ServiceName::ManagementGatewayService | ServiceName::Dendrite - | ServiceName::Mgd => { + | ServiceName::Mgd + | ServiceName::Ddm => { out.insert(service, Ok(())); } // InternalNtp is too large to fit in a single DNS packet and diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 693aea88732..329f6f37d29 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -117,6 +117,7 @@ pub struct ControlPlaneTestContext { /// Ports of stopped dendrite instances (for use by start_dendrite) pub stopped_dendrite_ports: RwLock>, pub mgd: HashMap, + pub ddm: HashMap, pub external_dns_zone_name: String, pub external_dns: TransientDnsServer, pub internal_dns: TransientDnsServer, @@ -320,6 +321,9 @@ impl ControlPlaneTestContext { for (_, mut mgd) in self.mgd { mgd.cleanup().await.unwrap(); } + for (_, mut ddm) in self.ddm { + ddm.cleanup().await; + } self.logctx.cleanup_successful(); } } diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index aa9c5cbd268..8a646afea12 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -146,6 +146,7 @@ pub struct ControlPlaneStarter<'a, N: NexusServer> { pub gateway: BTreeMap, pub dendrite: RwLock>, pub mgd: HashMap, + pub ddm: HashMap, // NOTE: Only exists after starting Nexus, until external Nexus is // initialized. @@ -203,6 +204,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { gateway: BTreeMap::new(), dendrite: RwLock::new(HashMap::new()), mgd: HashMap::new(), + ddm: HashMap::new(), nexus_internal: None, nexus_internal_addr: None, external_dns_zone_name: None, @@ -461,6 +463,17 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { self.config.pkg.mgd.insert(switch_slot, config); } + pub async fn start_ddm(&mut self, switch_slot: SwitchSlot) { + let log = &self.logctx.log; + debug!(log, "Starting DDM sim"; "switch_slot" => ?switch_slot); + + let ddm = dev::maghemite::DdmInstance::start().await.unwrap(); + let port = ddm.port; + self.ddm.insert(switch_slot, ddm); + + debug!(log, "DDM sim started"; "port" => port); + } + pub async fn record_switch_dns( &mut self, sled_id: SledUuid, @@ -482,6 +495,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { self.dendrite.read().unwrap().get(&switch_slot).unwrap().port, self.gateway.get(&switch_slot).unwrap().port, self.mgd.get(&switch_slot).unwrap().port, + self.ddm.get(&switch_slot).unwrap().port, ) .unwrap() } @@ -1250,6 +1264,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { dendrite: RwLock::new(self.dendrite.into_inner().unwrap()), stopped_dendrite_ports: RwLock::new(HashMap::new()), mgd: self.mgd, + ddm: self.ddm, external_dns_zone_name: self.external_dns_zone_name.unwrap(), external_dns: self.external_dns.unwrap(), internal_dns: self.internal_dns.unwrap(), @@ -1291,6 +1306,9 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { for (_, mut mgd) in self.mgd { mgd.cleanup().await.unwrap(); } + for (_, mut ddm) in self.ddm { + ddm.cleanup().await; + } self.logctx.cleanup_successful(); } @@ -1631,6 +1649,12 @@ pub(crate) async fn setup_with_config_impl( builder.start_mgd(SwitchSlot::Switch0).boxed() }), ), + ( + "start_ddm_switch0", + Box::new(|builder| { + builder.start_ddm(SwitchSlot::Switch0).boxed() + }), + ), ( "record_switch_dns", Box::new(|builder| { @@ -1675,6 +1699,12 @@ pub(crate) async fn setup_with_config_impl( builder.start_mgd(SwitchSlot::Switch1).boxed() }), ), + ( + "start_ddm_switch1", + Box::new(|builder| { + builder.start_ddm(SwitchSlot::Switch1).boxed() + }), + ), ( "record_switch_dns", Box::new(|builder| { diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs index 350757cf1de..714880feb37 100644 --- a/nexus/tests/integration_tests/initialization.rs +++ b/nexus/tests/integration_tests/initialization.rs @@ -158,6 +158,11 @@ async fn test_nexus_boots_before_dendrite() { starter.start_mgd(SwitchSlot::Switch1).await; info!(log, "Started mgd"); + info!(log, "Starting ddm"); + starter.start_ddm(SwitchSlot::Switch0).await; + starter.start_ddm(SwitchSlot::Switch1).await; + info!(log, "Started ddm"); + info!(log, "Populating internal DNS records"); starter .record_switch_dns( @@ -197,6 +202,8 @@ async fn nexus_schema_test_setup( starter.start_dendrite(SwitchSlot::Switch1).await; starter.start_mgd(SwitchSlot::Switch0).await; starter.start_mgd(SwitchSlot::Switch1).await; + starter.start_ddm(SwitchSlot::Switch0).await; + starter.start_ddm(SwitchSlot::Switch1).await; starter.populate_internal_dns().await; } diff --git a/nexus/types/src/deployment/execution/dns.rs b/nexus/types/src/deployment/execution/dns.rs index 009377fd8d9..3730576eda2 100644 --- a/nexus/types/src/deployment/execution/dns.rs +++ b/nexus/types/src/deployment/execution/dns.rs @@ -158,6 +158,7 @@ pub fn blueprint_internal_dns_config( overrides.dendrite_port(scrimlet.id()), overrides.mgs_port(scrimlet.id()), overrides.mgd_port(scrimlet.id()), + overrides.ddm_port(scrimlet.id()), )?; } diff --git a/nexus/types/src/deployment/execution/overridables.rs b/nexus/types/src/deployment/execution/overridables.rs index 881a7c49bdd..7dc3ae0bf4d 100644 --- a/nexus/types/src/deployment/execution/overridables.rs +++ b/nexus/types/src/deployment/execution/overridables.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use omicron_common::address::DDMD_PORT; use omicron_common::address::DENDRITE_PORT; use omicron_common::address::Ipv6Subnet; use omicron_common::address::MGD_PORT; @@ -29,6 +30,8 @@ pub struct Overridables { pub mgs_ports: BTreeMap, /// map: sled id -> TCP port on which that sled's MGD is listening pub mgd_ports: BTreeMap, + /// map: sled id -> TCP port on which that sled's DDM is listening + pub ddm_ports: BTreeMap, /// map: sled id -> IP address of the sled's switch zone pub switch_zone_ips: BTreeMap, } @@ -67,6 +70,16 @@ impl Overridables { self.mgd_ports.get(&sled_id).copied().unwrap_or(MGD_PORT) } + /// Specify the TCP port on which this sled's DDM is listening + pub fn override_ddm_port(&mut self, sled_id: SledUuid, port: u16) { + self.ddm_ports.insert(sled_id, port); + } + + /// Returns the TCP port on which this sled's DDM is listening + pub fn ddm_port(&self, sled_id: SledUuid) -> u16 { + self.ddm_ports.get(&sled_id).copied().unwrap_or(DDMD_PORT) + } + /// Specify the IP address of this switch zone pub fn override_switch_zone_ip( &mut self, diff --git a/sled-agent/rack-setup/src/plan/service.rs b/sled-agent/rack-setup/src/plan/service.rs index 2ef6d79489a..59ffc7eb64e 100644 --- a/sled-agent/rack-setup/src/plan/service.rs +++ b/sled-agent/rack-setup/src/plan/service.rs @@ -29,10 +29,10 @@ use nexus_types::deployment::{ }; use nexus_types::external_api::sled::SledState; use omicron_common::address::{ - CP_SERVICES_RESERVED_ADDRESSES, DENDRITE_PORT, DNS_HTTP_PORT, DNS_PORT, - Ipv6Subnet, MGD_PORT, MGS_PORT, NEXUS_INTERNAL_PORT, NEXUS_LOCKSTEP_PORT, - NTP_PORT, NUM_SOURCE_NAT_PORTS, REPO_DEPOT_PORT, ReservedRackSubnet, - SLED_PREFIX, SLED_RESERVED_ADDRESSES, get_sled_address, + CP_SERVICES_RESERVED_ADDRESSES, DDMD_PORT, DENDRITE_PORT, DNS_HTTP_PORT, + DNS_PORT, Ipv6Subnet, MGD_PORT, MGS_PORT, NEXUS_INTERNAL_PORT, + NEXUS_LOCKSTEP_PORT, NTP_PORT, NUM_SOURCE_NAT_PORTS, REPO_DEPOT_PORT, + ReservedRackSubnet, SLED_PREFIX, SLED_RESERVED_ADDRESSES, get_sled_address, get_switch_zone_address, }; use omicron_common::api::external::{Generation, MacAddr, Vni}; @@ -341,6 +341,7 @@ impl ServicePlan { DENDRITE_PORT, MGS_PORT, MGD_PORT, + DDMD_PORT, ) .unwrap(); } diff --git a/test-utils/src/dev/maghemite.rs b/test-utils/src/dev/maghemite.rs index 4c2d85df3ee..d3cf6524f1a 100644 --- a/test-utils/src/dev/maghemite.rs +++ b/test-utils/src/dev/maghemite.rs @@ -4,11 +4,13 @@ //! Tools for managing Maghemite during development +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::path::{Path, PathBuf}; use std::process::Stdio; use std::time::Duration; use anyhow::Context; +use slog::{Discard, Logger, o}; use tempfile::TempDir; use tokio::{ fs::File, @@ -163,13 +165,91 @@ async fn find_mgd_port_in_log(logfile: String) -> Result { } } +/// In-process stand-in for the `ddmd` (Delay Driven Multipath daemon) +/// admin API. +/// +/// `ddmd` runs in sled global zones and switch zones in real deployments, +/// and depends on illumos networking facilities not available in a generic +/// dev test toolchain the way `mgd` is. This binds a dropshot server on an +/// auto-assigned port so the test suite has a real socket to publish in +/// internal DNS as `ServiceName::Ddm`. +/// +/// This currently has no registered routes. Any integration needing +/// concrete endpoints (e.g., peer lists) must extend the `ApiDescription`. +pub struct DdmInstance { + pub port: u16, + server: Option>, +} + +impl DdmInstance { + /// Start a DDM sim server bound to a random localhost port. + pub async fn start() -> Result { + let dropshot_config = dropshot::ConfigDropshot { + bind_address: SocketAddr::V6(SocketAddrV6::new( + Ipv6Addr::LOCALHOST, + 0, + 0, + 0, + )), + ..Default::default() + }; + + let api: dropshot::ApiDescription<()> = dropshot::ApiDescription::new(); + let log = Logger::root(Discard, o!()); + + let server = dropshot::ServerBuilder::new(api, (), log) + .config(dropshot_config) + .start() + .context("failed to start DDM sim server")?; + + let port = server.local_addr().port(); + Ok(Self { port, server: Some(server) }) + } + + pub async fn cleanup(&mut self) { + if let Some(server) = self.server.take() { + server.close().await.expect("failed to close DDM sim server"); + } + } +} + +impl Drop for DdmInstance { + fn drop(&mut self) { + if self.server.is_some() { + eprintln!( + "WARN: dropped DdmInstance without cleaning it up first \ + (the dropshot server's tokio task may still be running)" + ); + } + } +} + #[cfg(test)] mod tests { + use super::DdmInstance; use super::find_mgd_port_in_log; use std::io::Write; use std::process::Stdio; use tempfile::NamedTempFile; + /// Smoke-test `DdmInstance`. We bind and serve a 404 for an unregistered + /// route, then shut down cleanly. + #[tokio::test] + async fn test_ddm_sim_binds_and_serves_404() { + let mut sim = DdmInstance::start().await.expect("DDM sim starts"); + assert!(sim.port > 0, "DDM sim should auto-assign a port"); + + let url = format!("http://[::1]:{}/peers", sim.port); + let resp = reqwest::get(&url).await.expect("server reachable"); + assert_eq!( + resp.status(), + reqwest::StatusCode::NOT_FOUND, + "no routes registered yet: expected 404" + ); + + sim.cleanup().await; + } + const EXPECTED_PORT: u16 = 4676; #[tokio::test]