diff --git a/clients/nexus-lockstep-client/src/lib.rs b/clients/nexus-lockstep-client/src/lib.rs index 1014b6afc16..a7f09251b3d 100644 --- a/clients/nexus-lockstep-client/src/lib.rs +++ b/clients/nexus-lockstep-client/src/lib.rs @@ -50,7 +50,6 @@ progenitor::generate_api!( DnsConfigParams = nexus_types::internal_api::params::DnsConfigParams, DnsConfigZone = nexus_types::internal_api::params::DnsConfigZone, DnsRecord = nexus_types::internal_api::params::DnsRecord, - ExternalPortDiscovery = nexus_types::internal_api::params::ExternalPortDiscovery, Generation = omicron_common::api::external::Generation, ImportExportPolicy = sled_agent_types::early_networking::ImportExportPolicy, MacAddr = omicron_common::api::external::MacAddr, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 21171c33498..8edba8f393b 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -81,6 +81,8 @@ use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; use nexus_types::internal_api::background::SupportBundleEreportStatus; +use nexus_types::internal_api::background::SwitchPortPopulatorStatus; +use nexus_types::internal_api::background::SwitchPortPopulatorStatusKind; use nexus_types::internal_api::background::TrustQuorumManagerStatus; use nexus_types::internal_api::background::TufArtifactReplicationCounters; use nexus_types::internal_api::background::TufArtifactReplicationRequest; @@ -1369,6 +1371,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "trust_quorum_manager" => { print_task_trust_quorum_manager(details); } + "populate_switch_ports" => { + print_task_populate_switch_ports(details); + } _ => { println!( "warning: unknown background task: {:?} \ @@ -3940,6 +3945,40 @@ fn print_task_trust_quorum_manager(details: &serde_json::Value) { } } +fn print_task_populate_switch_ports(details: &serde_json::Value) { + fn print_one( + name: &str, + result: Result<SwitchPortPopulatorStatusKind, String>, + ) { + match result { + Ok(SwitchPortPopulatorStatusKind::Populated { num_ports }) => { + println!("{name}: populated {num_ports} ports"); + } + Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated) => { + println!("{name} skipped: previously populated ports"); + } + Err(err) => println!("{name} failed: {err}"), + } + } + + let status = match serde_json::from_value::<SwitchPortPopulatorStatus>( + details.clone(), + ) { + Ok(status) => status, + Err(error) => { + eprintln!( + "warning: failed to interpret task details: {:?}: {:#?}", + error, details + ); + return; + } + }; + + let SwitchPortPopulatorStatus { switch0, switch1 } = status; + print_one("switch0", switch0); + print_one("switch1", switch1); +} + const ERRICON: &str = "/!\\"; fn warn_if_nonzero(n: usize) -> &'static str { diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index f6457634107..cadc24bf151 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -176,6 +176,11 @@ task: "physical_disk_adoption" ensure new physical disks are automatically marked in-service +task: "populate_switch_ports" + one-time population of the `switch_port` table containing all QSFP ports + managed by dendrite + + task: "probe_distributor" distributes networking probe zones to sleds @@ -439,6 +444,11 @@ task: "physical_disk_adoption" ensure new physical disks are automatically marked in-service +task: "populate_switch_ports" + 
one-time population of the `switch_port` table containing all QSFP ports + managed by dendrite + + task: "probe_distributor" distributes networking probe zones to sleds @@ -689,6 +699,11 @@ task: "physical_disk_adoption" ensure new physical disks are automatically marked in-service +task: "populate_switch_ports" + one-time population of the `switch_port` table containing all QSFP ports + managed by dendrite + + task: "probe_distributor" distributes networking probe zones to sleds diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 8662889facf..40bae0720e1 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -411,6 +411,11 @@ task: "physical_disk_adoption" ensure new physical disks are automatically marked in-service +task: "populate_switch_ports" + one-time population of the `switch_port` table containing all QSFP ports + managed by dendrite + + task: "probe_distributor" distributes networking probe zones to sleds @@ -854,6 +859,13 @@ task: "physical_disk_adoption" started at (s ago) and ran for ms last completion reported error: task disabled +task: "populate_switch_ports" + configured period: every s + last completed activation: , triggered by + started at (s ago) and ran for ms +switch0 failed: failed to look up dendrite clients: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } +switch1 failed: failed to look up dendrite clients: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + task: "probe_distributor" configured period: every m last completed activation: , triggered by @@ -1535,6 +1547,13 @@ task: "physical_disk_adoption" started at (s ago) and ran for ms last completion reported error: task disabled +task: "populate_switch_ports" + configured period: every s + last completed activation: , triggered by + started at (s ago) and ran for ms +switch0 failed: failed to look up dendrite clients: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } +switch1 failed: failed to look up dendrite clients: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + task: "probe_distributor" configured period: every m last completed activation: , triggered by diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 9f035510bc9..ea8b06f55f8 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -442,6 +442,8 @@ pub struct BackgroundTaskConfig { pub audit_log_timeout_incomplete: AuditLogTimeoutIncompleteConfig, /// configuration for audit log cleanup (retention) task pub audit_log_cleanup: AuditLogCleanupConfig, + /// configuration for populate switch ports task + pub populate_switch_ports: PopulateSwitchPortsConfig, } #[serde_as] @@ -488,6 +490,15 @@ pub struct AuditLogCleanupConfig { pub max_deleted_per_activation: u32, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct PopulateSwitchPortsConfig { + /// period (in seconds) for periodic activations of the background task that + /// attempts to populate the `switch_port` table. 
+ #[serde_as(as = "DurationSeconds<u64>")] + pub period_secs: Duration, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct DnsTasksConfig { @@ -1339,6 +1350,7 @@ mod test { audit_log_cleanup.period_secs = 600 audit_log_cleanup.retention_days = 90 audit_log_cleanup.max_deleted_per_activation = 10000 + populate_switch_ports.period_secs = 31 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1620,6 +1632,9 @@ mod test { retention_days: NonZeroU32::new(90).unwrap(), max_deleted_per_activation: 10_000, }, + populate_switch_ports: PopulateSwitchPortsConfig { + period_secs: Duration::from_secs(31), + }, }, multicast: MulticastConfig { enabled: false }, default_region_allocation_strategy: @@ -1735,6 +1750,7 @@ mod test { audit_log_cleanup.period_secs = 600 audit_log_cleanup.retention_days = 90 audit_log_cleanup.max_deleted_per_activation = 10000 + populate_switch_ports.period_secs = 31 [default_region_allocation_strategy] type = "random" diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 0a24e5eb3e2..6e072b2ec73 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -118,6 +118,7 @@ slog.workspace = true slog-async.workspace = true slog-dtrace.workspace = true slog-error-chain.workspace = true +strum.workspace = true swrite.workspace = true display-error-chain.workspace = true slog-term.workspace = true diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index 62b9f0c66df..165d5a3fc50 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -62,6 +62,7 @@ pub struct BackgroundTasks { pub task_trust_quorum_manager: Activator, pub task_attached_subnet_manager: Activator, pub task_session_cleanup: Activator, + pub task_populate_switch_ports: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. These background tasks are implementation details as far as diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 6196f141184..c47518f283c 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -204,6 +204,7 @@ audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 audit_log_cleanup.period_secs = 600 audit_log_cleanup.retention_days = 90 audit_log_cleanup.max_deleted_per_activation = 10000 +populate_switch_ports.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 150d6943170..6ed809b0833 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -188,6 +188,7 @@ audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 audit_log_cleanup.period_secs = 600 audit_log_cleanup.retention_days = 90 audit_log_cleanup.max_deleted_per_activation = 10000 +populate_switch_ports.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. 
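The `DurationSeconds<u64>` adapter above is what lets `populate_switch_ports.period_secs = 30` in these TOML files land in the config struct as a `std::time::Duration`. A minimal, self-contained sketch of that round trip, assuming only the `serde`, `serde_with`, and `toml` crates (the struct below is a stand-in for illustration, not the one from `nexus-config`):

use serde::Deserialize;
use serde_with::{DurationSeconds, serde_as};
use std::time::Duration;

// Stand-in for the `PopulateSwitchPortsConfig` struct added in this change.
#[serde_as]
#[derive(Debug, Deserialize)]
struct PopulateSwitchPortsConfig {
    // `DurationSeconds<u64>` deserializes a bare integer count of seconds.
    #[serde_as(as = "DurationSeconds<u64>")]
    period_secs: Duration,
}

fn main() {
    let cfg: PopulateSwitchPortsConfig =
        toml::from_str("period_secs = 30").expect("valid config");
    assert_eq!(cfg.period_secs, Duration::from_secs(30));
}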
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index bdf9ee01638..cab3275dc77 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -145,6 +145,7 @@ use super::tasks::v2p_mappings::V2PManager; use super::tasks::vpc_routes; use super::tasks::webhook_deliverator; use crate::Nexus; +use crate::app::background::tasks::populate_switch_ports; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::quiesce::NexusQuiesceHandle; use crate::app::saga::StartSaga; @@ -277,6 +278,7 @@ impl BackgroundTasksInitializer { task_trust_quorum_manager: Activator::new(), task_attached_subnet_manager: Activator::new(), task_session_cleanup: Activator::new(), + task_populate_switch_ports: Activator::new(), // Handles to activate background tasks that do not get used by Nexus // at-large. These background tasks are implementation details as far as @@ -372,6 +374,7 @@ impl BackgroundTasksInitializer { task_session_cleanup, task_audit_log_timeout_incomplete, task_audit_log_cleanup, + task_populate_switch_ports, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. @@ -1224,7 +1227,7 @@ impl BackgroundTasksInitializer { description: "distributes attached subnets to sleds and switch", period: config.attached_subnet_manager.period_secs, task_impl: Box::new(attached_subnets::Manager::new( - resolver, + resolver.clone(), datastore.clone(), )), opctx: opctx.child(BTreeMap::new()), @@ -1272,7 +1275,7 @@ impl BackgroundTasksInitializer { than the retention period", period: config.audit_log_cleanup.period_secs, task_impl: Box::new(audit_log_cleanup::AuditLogCleanup::new( - datastore, + datastore.clone(), config.audit_log_cleanup.retention_days, config.audit_log_cleanup.max_deleted_per_activation, )), @@ -1281,6 +1284,23 @@ impl BackgroundTasksInitializer { activator: task_audit_log_cleanup, }); + driver.register(TaskDefinition { + name: "populate_switch_ports", + description: "one-time population of the `switch_port` table \ + containing all QSFP ports managed by dendrite", + period: config.populate_switch_ports.period_secs, + task_impl: Box::new( + populate_switch_ports::SwitchPortPopulator::new( + rack_id, + datastore.clone(), + resolver.clone(), + ), + ), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_populate_switch_ports, + }); + driver } } diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index fdcb45ef8d0..f61f2ffac0d 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -37,6 +37,7 @@ pub mod nat_cleanup; pub mod networking; pub mod phantom_disks; pub mod physical_disk_adoption; +pub mod populate_switch_ports; pub mod probe_distributor; pub mod read_only_region_replacement_start; pub mod reconfigurator_config; diff --git a/nexus/src/app/background/tasks/populate_switch_ports.rs b/nexus/src/app/background/tasks/populate_switch_ports.rs new file mode 100644 index 00000000000..8b8cdb94967 --- /dev/null +++ b/nexus/src/app/background/tasks/populate_switch_ports.rs @@ -0,0 +1,406 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for populating the `switch_port` table. +//! +//! 
Unlike most Nexus background tasks, this task is a critical part of rack +//! setup, and does nothing after rack setup. This is a bit strange, so for +//! context we'll provide both history and "current shortcomings we eventually +//! need to address". +//! +//! There is a database table named `switch_port`. It's expected to contain the +//! names of the front-facing ports that can be configured by operators for each +//! switch. Today, this list is _completely static_ for a given rack, although +//! it differs between environments (e.g., real sidecars always have `qsfp0` +//! through `qsfp31`; some test environments, like the `helios/deploy` CI job, +//! only have `qsfp0`). A client can determine the list by querying dendrite +//! (`dpd`, via the `port_list()` endpoint). +//! +//! Inserting networking configuration into the database requires this table to be +//! populated. Various other tables take port names as part of their +//! configuration, and those inserts require that the port name actually exist on +//! the switch for the configuration to be valid. +//! +//! The initial set of networking configuration provided at RSS time is inserted +//! by Nexus inside the handoff endpoint called by RSS (running in sled-agent). +//! This means for RSS to succeed, the `switch_port` table must be populated +//! with at least the names of the ports used in that initial networking +//! configuration. +//! +//! Prior to the existence of this background task, the responsibility for +//! populating `switch_port` was split between RSS and Nexus: +//! +//! 1. RSS would build a `HashMap<SwitchSlot, Ipv6Addr>` specifying the IP +//! addresses of one or both switch zones. +//! 2. Nexus, inside the handoff endpoint, would query `dpd` for each of the +//! entries in that hash map to populate `switch_port` prior to inserting the +//! rest of the networking configuration. +//! +//! Historically, this caused some problems. RSS didn't know whether it needed +//! to find the IPs of both switch zones or whether it was sufficient to find +//! just one. Originally it would try to find both for some amount of time, then +//! give up and proceed as long as it had found one; this led to RSS handoff +//! failures if the initial networking configuration contained ports on the +//! switch it didn't find within the timeout (omicron#9678). This was fixed by +//! forcing RSS to wait until it discovered the IP of any switch that had a +//! configured port in the initial network config. +//! +//! That still left a gap: it was possible to provide an initial network config +//! that only used a port on switch0 (or switch1), RSS would only specify an IP +//! address for that switch, and therefore Nexus would only fill in the +//! `switch_port` tables from that one switch. Any future attempt to configure a +//! port on the other switch would never succeed, because nothing in Nexus ever +//! came back to fill in "the other switch" in the `switch_port` table. +//! +//! This background task fixes that problem. When Nexus starts, it will +//! periodically activate this task, and it will attempt to find both switch +//! zones, ask `dpd` for the list of ports, and populate `switch_port`. It will +//! go inert once it has successfully contacted `dpd` in both switch zones, but +//! will keep retrying until it has done so. This means we can perform an +//! RSS handoff with only one switch present and configured, then later add a +//! second switch, and this task will fill in that switch's ports in +//! `switch_port`. +//! +//! 
There are still some things here that aren't ideal: +//! +//! 1. This task _must_ complete successfully for any switches that have ports +//! configured in the RSS network config in order for the RSS handoff to +//! complete. (This is consistent with the prior behavior of RSS blocking +//! until it found the switch zone IPs and then Nexus having to contact `dpd` +//! in those switch zone(s), but it would be nicer if we could proceed +//! through rack setup even in a degraded state.) +//! 2. This task assumes the set of switch port names never changes throughout +//! the lifetime of the rack. This is consistent with our usage of the +//! `switch_port` table; e.g., we don't have `DataStore` methods to delete +//! rows from it. But eventually we'll either have a non-sidecar switch that +//! may have different names, or we may want to break out additional ports on +//! sidecars, and we'll need to be able to handle the names changing at +//! runtime. This task is not set up to do that and will need some rework if +//! and when that world arrives. + +use crate::app::background::BackgroundTask; +use crate::app::dpd_clients; +use anyhow::Context; +use anyhow::anyhow; +use dpd_client::Client as DpdClient; +use dpd_client::types::PortId as DpdPortId; +use futures::future::BoxFuture; +use internal_dns_resolver::Resolver; +use nexus_auth::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::SwitchPortPopulatorStatus; +use nexus_types::internal_api::background::SwitchPortPopulatorStatusKind; +use omicron_common::api::external; +use omicron_common::api::external::Name; +use sled_agent_types::early_networking::SwitchSlot; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::sync::Arc; +use uuid::Uuid; + +pub struct SwitchPortPopulator { + rack_id: Uuid, + datastore: Arc<DataStore>, + resolver: Resolver, + did_populate_switches: BTreeSet<SwitchSlot>, +} + +impl SwitchPortPopulator { + pub fn new( + rack_id: Uuid, + datastore: Arc<DataStore>, + resolver: Resolver, + ) -> Self { + Self { + rack_id, + datastore, + resolver, + did_populate_switches: BTreeSet::new(), + } + } + + async fn activate_impl( + &mut self, + opctx: &OpContext, + ) -> SwitchPortPopulatorStatus { + // Short circuit - if we've already successfully contacted both switch + // zones and populated the switch ports for them, we have nothing to do + // ever again. + if self.did_populate_switches.contains(&SwitchSlot::Switch0) + && self.did_populate_switches.contains(&SwitchSlot::Switch1) + { + return SwitchPortPopulatorStatus { + switch0: Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated), + switch1: Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated), + }; + } + + // Look up dendrite clients, keyed by which switch they're managing. + let clients_by_switch = match dpd_clients(&self.resolver, &opctx.log) + .await + { + Ok(clients) => clients, + Err(err) => { + // Ensure we're forced to update this logging (i.e., to + // `InlineErrorChain`) if `dpd_clients()` starts returning a + // different error type. + let err: &str = &err; + warn!( + opctx.log, "failed to look up dendrite clients"; + "error" => err, + ); + let err = format!("failed to look up dendrite clients: {err}"); + return SwitchPortPopulatorStatus { + switch0: Err(err.clone()), + switch1: Err(err), + }; + } + }; + + // Populate each switch, unless we've already done so successfully. 
+ let switch0 = self + .populate_one_switch_if_needed( + SwitchSlot::Switch0, + &clients_by_switch, + opctx, + ) + .await; + let switch1 = self + .populate_one_switch_if_needed( + SwitchSlot::Switch1, + &clients_by_switch, + opctx, + ) + .await; + + SwitchPortPopulatorStatus { switch0, switch1 } + } + + async fn populate_one_switch_if_needed( + &mut self, + which_switch: SwitchSlot, + clients_by_switch: &HashMap<SwitchSlot, DpdClient>, + opctx: &OpContext, + ) -> Result<SwitchPortPopulatorStatusKind, String> { + if self.did_populate_switches.contains(&which_switch) { + return Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated); + } + + let Some(client) = clients_by_switch.get(&which_switch) else { + warn!( + opctx.log, + "skipping (needed!) switch port population - \ + no dpd client found for this switch slot"; + "switch-slot" => ?which_switch, + ); + return Err("no dpd client found".to_string()); + }; + + match self + .try_populate_one_switch_via_dpd(which_switch, client, opctx) + .await + { + Ok(num_ports) => { + self.did_populate_switches.insert(which_switch); + Ok(SwitchPortPopulatorStatusKind::Populated { num_ports }) + } + Err(err) => { + warn!( + opctx.log, "failed to populate switch ports"; + "switch-slot" => ?which_switch, + InlineErrorChain::new(&*err), + ); + Err(format!( + "failed to populate switch ports: {}", + InlineErrorChain::new(&*err), + )) + } + } + } + + async fn try_populate_one_switch_via_dpd( + &self, + which_switch: SwitchSlot, + client: &DpdClient, + opctx: &OpContext, + ) -> anyhow::Result<usize> { + let all_ports = + client.port_list().await.context("failed to list ports")?; + + info!( + opctx.log, "discovered ports for switch"; + "switch-slot" => ?which_switch, + "all-ports" => ?all_ports, + ); + + let qsfp_ports = all_ports + .iter() + .filter_map(|port| match port { + DpdPortId::Internal(_) | DpdPortId::Rear(_) => None, + DpdPortId::Qsfp(_) => match port.to_string().parse::<Name>() { + Ok(name) => Some(name), + Err(err) => { + // Ensure we're forced to update this logging (i.e., to + // `InlineErrorChain`) if `parse()` starts returning a + // different error type. + let err: &str = &err; + warn!( + opctx.log, "failed to parse port as `Name`"; + "port" => %port, + "error" => err, + ); + None + } + }, + }) + .collect::<BTreeSet<_>>(); + + let mut processed = 0; + for port_name in qsfp_ports { + match self + .datastore + .switch_port_create( + opctx, + self.rack_id, + which_switch, + port_name.clone().into(), + ) + .await + { + Ok(_) | Err(external::Error::ObjectAlreadyExists { .. }) => { + processed += 1; + } + Err(err) => { + return Err(anyhow!(err).context(format!( + "failed to insert port {port_name}" + ))); + } + } + } + + Ok(processed) + } +} + +impl BackgroundTask for SwitchPortPopulator { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + Box::pin(async move { + let status = self.activate_impl(opctx).await; + match serde_json::to_value(status) { + Ok(val) => val, + Err(err) => { + let error = format!( + "could not serialize task status: {}", + InlineErrorChain::new(&err) + ); + serde_json::json!({ "error": error }) + } + } + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use assert_matches::assert_matches; + use nexus_db_queries::context::OpContext; + use nexus_test_utils_macros::nexus_test; + use slog::o; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext<crate::Server>; + + /// Exercises the example from our module's toplevel docs: switch0's + /// dendrite is reachable from the start and switch1's comes online later. 
+ /// + /// The Nexus process has its own internal `SwitchPortPopulator` that + /// already ran against both switches at boot; this test instantiates a + /// fresh populator and drives it directly so we can observe the + /// only-switch0-then-both transition. + #[nexus_test(server = crate::Server, extra_sled_agents = 1)] + async fn test_switch1_comes_up_late(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Stop switch1's dendrite. The internal-DNS SRV record for it + // remains; `dpd_clients()` will still try to resolve and connect, + // but `switch_identifiers()` will fail and the slot will be + // dropped from the resulting map. + cptestctx.stop_dendrite(SwitchSlot::Switch1).await; + + let mut task = SwitchPortPopulator::new( + nexus.rack_id(), + datastore.clone(), + nexus.resolver().clone(), + ); + + // Phase 1: only switch0 reachable. + let v1: SwitchPortPopulatorStatus = + serde_json::from_value(task.activate(&opctx).await) + .expect("status deserializes"); + assert_matches!( + v1.switch0, + Ok(SwitchPortPopulatorStatusKind::Populated { .. }) + ); + assert_matches!( + v1.switch1, + Err(e) if e == "no dpd client found" + ); + + // Phase 2: re-activate with switch1 still down. Switch0 should + // short-circuit (no work), switch1 should still not be populated. + let v2: SwitchPortPopulatorStatus = + serde_json::from_value(task.activate(&opctx).await) + .expect("status deserializes"); + assert_matches!( + v2.switch0, + Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated) + ); + assert_matches!( + v2.switch1, + Err(e) if e == "no dpd client found" + ); + + // Phase 3: switch1 comes online. `restart_dendrite` waits for + // `switch_identifiers()` to succeed before returning, so by the + // time it returns, the next `dpd_clients()` call will pick up + // switch1. + cptestctx.restart_dendrite(SwitchSlot::Switch1).await; + + let v3: SwitchPortPopulatorStatus = + serde_json::from_value(task.activate(&opctx).await) + .expect("status deserializes"); + assert_matches!( + v3.switch0, + Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated) + ); + assert_matches!( + v3.switch1, + Ok(SwitchPortPopulatorStatusKind::Populated { .. }) + ); + + // Phase 4: short-circuit. The task should bail out early without + // touching dendrite or the database. 
+ let v4: SwitchPortPopulatorStatus = + serde_json::from_value(task.activate(&opctx).await) + .expect("status deserializes"); + assert_matches!( + v4.switch0, + Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated) + ); + assert_matches!( + v4.switch1, + Ok(SwitchPortPopulatorStatusKind::PreviouslyPopulated) + ); + } +} diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index ee48bc602ba..5c31e9ab998 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -26,7 +26,6 @@ use nexus_types::external_api::networking; use nexus_types::external_api::policy; use nexus_types::external_api::silo; use nexus_types::external_api::sled as sled_types; -use nexus_types::internal_api::params::ExternalPortDiscovery; use nexus_types::inventory::SpType; use nexus_types::silo::silo_dns_name; use omicron_common::address::{Ipv6Subnet, RACK_PREFIX, get_64_subnet}; @@ -297,69 +296,6 @@ impl super::Nexus { Some(IpNet::from(rack_network_config.rack_subnet).into()); self.datastore().update_rack_subnet(opctx, &rack).await?; - // TODO - https://github.com/oxidecomputer/omicron/pull/3359 - // register all switches found during rack initialization - // identify requested switch from config and associate - // uplink records to that switch - match request.external_port_count { - ExternalPortDiscovery::Auto(switch_mgmt_addrs) => { - use dpd_client::Client as DpdClient; - info!(log, "Using automatic external switchport discovery"); - - for (switch, addr) in switch_mgmt_addrs { - let dpd_client = DpdClient::new( - &format!( - "http://[{}]:{}", - addr, - omicron_common::address::DENDRITE_PORT - ), - dpd_client::ClientState { - tag: "nexus".to_string(), - log: log.new(o!("component" => "DpdClient")), - }, - ); - - let all_ports = - dpd_client.port_list().await.map_err(|e| { - Error::internal_error(&format!("encountered error while discovering ports for {switch:#?}: {e}")) - })?; - - info!( - log, "discovered ports for switch"; - "switch_slot" => ?switch, - "all_ports" => #?all_ports, - ); - - let qsfp_ports: Vec<Name> = all_ports - .iter() - .filter(|port| { - matches!(port, dpd_client::types::PortId::Qsfp(_)) - }) - .map(|port| port.to_string().parse().unwrap()) - .collect(); - - info!( - log, "populating ports for switch"; - "switch_slot" => ?switch, - "qsfp_ports" => #?qsfp_ports, - ); - - self.populate_switch_ports(&opctx, &qsfp_ports, switch) - .await?; - } - } - // TODO: #3602 Eliminate need for static port mappings for switch ports - ExternalPortDiscovery::Static(port_mappings) => { - info!( - log, - "Using static configuration for external switchports" - ); - for (switch, ports) in port_mappings { - self.populate_switch_ports(&opctx, &ports, switch).await?; - } - } - } - // TODO // configure rack networking / boundary services here // Currently calling some of the apis directly, but should we be using diff --git a/nexus/src/app/switch_port.rs b/nexus/src/app/switch_port.rs index 0d0e9338fcb..f619725188f 100644 --- a/nexus/src/app/switch_port.rs +++ b/nexus/src/app/switch_port.rs @@ -16,7 +16,7 @@ use nexus_types::external_api::networking; use nexus_types::external_api::switch::SwitchLinkState; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::{ - self, CreateResult, DataPageParams, DeleteResult, Error, ListResultVec, + CreateResult, DataPageParams, DeleteResult, Error, ListResultVec, LookupResult, Name, NameOrId, UpdateResult, }; use sled_agent_types::early_networking::RouterPeerType; @@ -179,18 +179,6 @@ impl super::Nexus { 
self.db_datastore.switch_port_settings_get(opctx, name_or_id).await } - async fn switch_port_create( - &self, - opctx: &OpContext, - rack_id: Uuid, - switch_slot: SwitchSlot, - port: Name, - ) -> CreateResult<SwitchPort> { - self.db_datastore - .switch_port_create(opctx, rack_id, switch_slot, port.into()) - .await - } - pub(crate) async fn switch_port_list( &self, opctx: &OpContext, @@ -293,27 +281,6 @@ impl super::Nexus { Ok(()) } - pub(crate) async fn populate_switch_ports( - &self, - opctx: &OpContext, - ports: &[Name], - switch: SwitchSlot, - ) -> CreateResult<()> { - for port in ports { - match self - .switch_port_create(opctx, self.rack_id, switch, port.clone()) - .await - { - Ok(_) => {} - // ignore ObjectAlreadyExists but pass through other errors - Err(external::Error::ObjectAlreadyExists { .. }) => {} - Err(e) => return Err(e), - }; - } - - Ok(()) - } - pub(crate) async fn switch_port_status( &self, opctx: &OpContext, diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 0377af5ae1c..929342771a8 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -34,7 +34,6 @@ use nexus_db_queries::db; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::blueprint_zone_type; -use nexus_types::internal_api::params::ExternalPortDiscovery; use nexus_types::internal_api::params::InitialTrustQuorumConfig; use nexus_types::internal_api::params::{ PhysicalDiskPutRequest, ZpoolPutRequest, }; @@ -56,9 +55,9 @@ use sled_hardware_types::BaseboardId; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; -use std::collections::HashMap; use std::net::{Ipv4Addr, SocketAddr, SocketAddrV6}; use std::sync::Arc; +use strum::IntoEnumIterator; use tokio::sync::watch; use uuid::Uuid; @@ -396,6 +395,28 @@ impl nexus_test_interface::NexusServer for Server { .await .unwrap(); + // In the real rack init handoff, the `rack_network_config` will contain + // at least one configured port, and handoff will only complete + // successfully if the `populate_switch_ports` background task has + // completed successfully to populate the corresponding qsfp* values in + // the `switch_port` table. We could block here until that task + // activates successfully, contacting whatever `dpd` instance has been + // stood up for this test, but in the interest of streamlining this + // handoff (which happens for _every_ Nexus test, including those + // completely uninterested in switch port interaction), we'll insert a + // single qsfp0 for each switch to the db directly. + for which_switch in SwitchSlot::iter() { + datastore + .switch_port_create( + &opctx, + config.deployment.rack_id, + which_switch, + nexus_db_model::Name("qsfp0".parse().unwrap()), + ) + .await + .expect("inserted qsfp0"); + } + // Allocation of initial external IP addresses is a little funny. In // a real system, it'd be allocated by RSS and provided with the rack // initialization request (which we're about to simulate). 
RSS also @@ -436,18 +457,6 @@ impl nexus_test_interface::NexusServer for Server { internal_dns_zone_config, external_dns_zone_name: external_dns_zone_name.to_owned(), recovery_silo, - external_port_count: ExternalPortDiscovery::Static( - HashMap::from([ - ( - SwitchSlot::Switch0, - vec!["qsfp0".parse().unwrap()], - ), - ( - SwitchSlot::Switch1, - vec!["qsfp0".parse().unwrap()], - ), - ]), - ), rack_network_config: RackNetworkConfig { rack_subnet: "fd00:1122:3344:0100::/56" .parse() diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index d0e418398fd..01928765c70 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -225,6 +225,7 @@ audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 audit_log_cleanup.period_secs = 600 audit_log_cleanup.retention_days = 90 audit_log_cleanup.max_deleted_per_activation = 10000 +populate_switch_ports.period_secs = 30 [multicast] # Enable multicast functionality for tests (disabled by default in production) diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 5b30442fb12..b3fac1dde9b 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -1147,6 +1147,26 @@ pub struct AuditLogCleanupStatus { pub error: Option<String>, } +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum SwitchPortPopulatorStatusKind { + /// A previous activation of this task had already populated the ports for + /// this switch. + PreviouslyPopulated, + + /// We successfully populated the ports of this switch. + Populated { num_ports: usize }, +} + +/// The status of a `populate_switch_ports` background task activation. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct SwitchPortPopulatorStatus { + /// Result of populating switch 0's ports, if any. + pub switch0: Result<SwitchPortPopulatorStatusKind, String>, + /// Result of populating switch 1's ports, if any. + pub switch1: Result<SwitchPortPopulatorStatusKind, String>, +} + /// The status of a `session_cleanup` background task activation. 
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct SessionCleanupStatus { diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index 3726a8e817e..9313cfd2925 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -22,16 +22,13 @@ use omicron_uuid_kinds::ZpoolUuid; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_agent_types::early_networking::RackNetworkConfig; -use sled_agent_types::early_networking::SwitchSlot; use sled_agent_types::inventory::SledCpuFamily; use sled_agent_types::inventory::SledRole; use sled_agent_types::inventory::SourceNatConfigGeneric; use sled_hardware_types::BaseboardId; use std::collections::BTreeSet; -use std::collections::HashMap; use std::fmt; use std::net::IpAddr; -use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; use uuid::Uuid; @@ -197,8 +194,6 @@ pub struct RackInitializationRequest { pub external_dns_zone_name: String, /// configuration for the initial (recovery) Silo pub recovery_silo: RecoverySiloConfig, - /// The external qsfp ports per sidecar - pub external_port_count: ExternalPortDiscovery, /// Initial rack network configuration pub rack_network_config: RackNetworkConfig, /// IPs or subnets allowed to make requests to user-facing services @@ -212,15 +207,6 @@ pub struct RackInitializationRequest { pub initial_trust_quorum_configuration: Option<InitialTrustQuorumConfig>, } -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum ExternalPortDiscovery { - // Automatically discover ports via Dendrite - Auto(HashMap<SwitchSlot, Ipv6Addr>), - // Static configuration pairing switches with a collection of ports - Static(HashMap<SwitchSlot, Vec<Name>>), -} - pub type DnsConfigParams = internal_dns_types::config::DnsConfigParams; pub type DnsConfigZone = internal_dns_types::config::DnsConfigZone; pub type DnsRecord = internal_dns_types::config::DnsRecord; diff --git a/openapi/nexus-lockstep.json b/openapi/nexus-lockstep.json index 05b3df2b826..0aee2fa5b34 100644 --- a/openapi/nexus-lockstep.json +++ b/openapi/nexus-lockstep.json @@ -4248,44 +4248,6 @@ "type": "string", "format": "uuid" }, - "ExternalPortDiscovery": { - "oneOf": [ - { - "type": "object", - "properties": { - "auto": { - "type": "object", - "additionalProperties": { - "type": "string", - "format": "ipv6" - } - } - }, - "required": [ - "auto" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "static": { - "type": "object", - "additionalProperties": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Name" - } - } - } - }, - "required": [ - "static" - ], - "additionalProperties": false - } - ] - }, "FailedHostOsUpdateReason": { "description": "Describes the reason why a Host OS failed to update", "oneOf": [ @@ -8238,14 +8200,6 @@ "description": "delegated DNS name for external DNS", "type": "string" }, - "external_port_count": { - "description": "The external qsfp ports per sidecar", - "allOf": [ - { - "$ref": "#/components/schemas/ExternalPortDiscovery" - } - ] - }, "initial_trust_quorum_configuration": { "nullable": true, "description": "Data used to write the initial trust quorum configuration to CRDB\n\nThis is optional for two reasons: * For clusters fewer than 3 nodes, we don't support trust quorum. 
* Trust quorum is not fully complete yet, and we only want this to be used in production once it is complete.", @@ -8307,7 +8261,6 @@ "certs", "crucible_datasets", "external_dns_zone_name", - "external_port_count", "internal_dns_zone_config", "internal_services_ip_pool_ranges", "physical_disks", diff --git a/sled-agent/rack-setup/src/service.rs b/sled-agent/rack-setup/src/service.rs index f35bcd6734a..6e004021889 100644 --- a/sled-agent/rack-setup/src/service.rs +++ b/sled-agent/rack-setup/src/service.rs @@ -67,7 +67,7 @@ //! after a clean slate upon failure. //! See for details. -use crate::early_networking::{EarlyNetworkSetup, EarlyNetworkSetupError}; +use crate::early_networking::EarlyNetworkSetupError; use crate::plan::service::PlanError as ServicePlanError; use crate::plan::service::ServicePlan; use crate::plan::sled::SledPlan; @@ -88,7 +88,6 @@ use nexus_lockstep_client::{ nexus_types::deployment::{ Blueprint, BlueprintZoneType, blueprint_zone_type, }; -use nexus_types::internal_api::params::ExternalPortDiscovery; use ntp_admin_client::ClientInfo as _; use ntp_admin_client::{ Client as NtpAdminClient, Error as NtpAdminError, types::TimeSync, }; @@ -822,14 +821,12 @@ impl ServiceInner { Ok(()) } - #[allow(clippy::too_many_arguments)] async fn handoff_to_nexus( &self, blueprint: Blueprint, config: &Config, sled_plan: &SledPlan, service_plan: &ServicePlan, - port_discovery_mode: ExternalPortDiscovery, nexus_lockstep_address: SocketAddrV6, initial_trust_quorum_configuration: Option<InitialTrustQuorumConfig>, ) -> Result<(), SetupServiceError> { @@ -1077,7 +1074,6 @@ impl ServiceInner { external_dns_zone_name: config.external_dns_zone_name.clone(), recovery_silo: config.recovery_silo.clone(), rack_network_config, - external_port_count: port_discovery_mode, allowed_source_ips, initial_trust_quorum_configuration, }; @@ -1447,28 +1443,6 @@ impl ServiceInner { rss_step.update(RssStep::ConfigureDns); self.initialize_internal_dns_records(&service_plan).await?; - // Ask MGS in each switch zone which switch it is. - // - // lookup_uplinked_switch_zone_underlay_addrs() is shared with - // sled-agent, which has the network config in a watch channel that - // changes as Nexus pushes updates in via the bootstore. We don't have - // that, but can stuff the (unchanging) config into a watch channel to - // fit this API. - let (_tx, network_config_rx) = watch::channel(system_networking_config); - let switch_mgmt_addrs = EarlyNetworkSetup::new(&self.log) - .lookup_uplinked_switch_zone_underlay_addrs( - &resolver, - &network_config_rx, - // We're willing to wait forever to find all the switches that have - // configured uplinks; if we attempt to proceed without doing - // so, we'll fail handing off to Nexus later. (Ideally we could - // complete RSS with only one switch up and then configure the - // second later, but that currently doesn't work.) - // - Duration::MAX, - ) - .await; - rss_step.update(RssStep::InitNtp); // Next start up the NTP services. 
let v3generator = v2generator.new_version_with( @@ -1583,7 +1557,6 @@ impl ServiceInner { &config, &sled_plan, &service_plan, - ExternalPortDiscovery::Auto(switch_mgmt_addrs), nexus_lockstep_address, initial_trust_quorum_configuration, ) diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index c98cd662088..a20acd0ee3c 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -31,7 +31,6 @@ use nexus_types::deployment::{ use nexus_types::deployment::{ BlueprintZoneConfig, BlueprintZoneDisposition, BlueprintZoneType, }; -use nexus_types::internal_api::params::ExternalPortDiscovery; use omicron_common::FileKv; use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; use omicron_common::address::{DNS_OPTE_IPV4_SUBNET, Ipv6Subnet}; @@ -60,7 +59,6 @@ use sled_agent_types::inventory::NetworkInterface; use sled_agent_types::inventory::NetworkInterfaceKind; use sled_agent_types::inventory::OmicronZoneDataset; use slog::{Drain, Logger, info}; -use std::collections::HashMap; use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; @@ -650,7 +648,6 @@ pub async fn run_standalone_server( internal_dns_zone_config: dns_config, external_dns_zone_name: DNS_ZONE_EXTERNAL_TESTING.to_owned(), recovery_silo, - external_port_count: ExternalPortDiscovery::Static(HashMap::new()), rack_network_config: RackNetworkConfig { rack_subnet: Ipv6Net::host_net(Ipv6Addr::LOCALHOST), infra_ip_first: IpAddr::V4(Ipv4Addr::LOCALHOST), diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 7a43386e375..2cbbfc36034 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -127,6 +127,7 @@ audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 audit_log_cleanup.period_secs = 600 audit_log_cleanup.retention_days = 90 audit_log_cleanup.max_deleted_per_activation = 10000 +populate_switch_ports.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 11863e1c681..93e255dfbb0 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -127,6 +127,7 @@ audit_log_timeout_incomplete.max_timed_out_per_activation = 1000 audit_log_cleanup.period_secs = 600 audit_log_cleanup.retention_days = 90 audit_log_cleanup.max_deleted_per_activation = 10000 +populate_switch_ports.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds.
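For reference, the `omdb` printer added in this patch deserializes the task's `details` JSON into `SwitchPortPopulatorStatus`. serde serializes `Result` in its externally tagged form, and the `kind`-tagged status enum inlines its variant name as a field, which is what produces the shape below. A minimal sketch using stand-in copies of the types from `nexus/types/src/internal_api/background.rs`, assuming only the `serde` and `serde_json` crates:

use serde::{Deserialize, Serialize};

// Stand-in copies of the status types added in this change.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
#[serde(tag = "kind", rename_all = "snake_case")]
enum SwitchPortPopulatorStatusKind {
    PreviouslyPopulated,
    Populated { num_ports: usize },
}

#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
struct SwitchPortPopulatorStatus {
    switch0: Result<SwitchPortPopulatorStatusKind, String>,
    switch1: Result<SwitchPortPopulatorStatusKind, String>,
}

fn main() {
    let status = SwitchPortPopulatorStatus {
        switch0: Ok(SwitchPortPopulatorStatusKind::Populated { num_ports: 32 }),
        switch1: Err("no dpd client found".to_string()),
    };
    // `Result` serializes as {"Ok": ...} / {"Err": ...}; the enum's variant
    // name lands in the "kind" field per the serde attributes above.
    let json = serde_json::to_value(&status).expect("serializes");
    assert_eq!(
        json,
        serde_json::json!({
            "switch0": { "Ok": { "kind": "populated", "num_ports": 32 } },
            "switch1": { "Err": "no dpd client found" },
        })
    );
}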