diff --git a/dev-tools/reconfigurator-cli/tests/input/cmds-plan-with-no-in-service-sleds.txt b/dev-tools/reconfigurator-cli/tests/input/cmds-plan-with-no-in-service-sleds.txt new file mode 100644 index 00000000000..bdb1a1e5d5f --- /dev/null +++ b/dev-tools/reconfigurator-cli/tests/input/cmds-plan-with-no-in-service-sleds.txt @@ -0,0 +1,15 @@ +# Test for planning without any in-service sleds. + +load-example --seed test-basic --nsleds 1 + +# Expunge the only sled. Now no sleds match SledFilter::InService. +sled-set serial0 policy expunged + +# Generate a fresh inventory so the planner sees a state matching the +# system description. +inventory-generate + +# Plan a new blueprint. We expect this to say that zone adds and updates are +# blocked, with the reason being that there are no in-service sleds to plan +# against. We expect this not to produce a programming error. +blueprint-plan latest latest diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-plan-with-no-in-service-sleds-stderr b/dev-tools/reconfigurator-cli/tests/output/cmds-plan-with-no-in-service-sleds-stderr new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-plan-with-no-in-service-sleds-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-plan-with-no-in-service-sleds-stdout new file mode 100644 index 00000000000..584343688f2 --- /dev/null +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-plan-with-no-in-service-sleds-stdout @@ -0,0 +1,38 @@ +using provided RNG seed: reconfigurator-cli-test +> # Test for planning without any in-service sleds. + +> load-example --seed test-basic --nsleds 1 +loaded example system with: +- collection: 9e187896-7809-46d0-9210-d75be1b3c4d4 +- blueprint: ade5749d-bdf3-4fab-a8ae-00bea01b3a5a + + +> # Expunge the only sled. Now no sleds match SledFilter::InService. +> sled-set serial0 policy expunged +set sled 89d02b1b-478c-401a-8e28-7a26f74fa41b policy to expunged + + +> # Generate a fresh inventory so the planner sees a state matching the +> # system description. +> inventory-generate +generated inventory collection 972ca69a-384c-4a9c-a87d-c2cf21e114e0 from configured sleds + + +> # Plan a new blueprint. We expect this to say that zone adds and updates are +> # blocked, with the reason being that there are no in-service sleds to plan +> # against. We expect this not to produce a programming error. +> blueprint-plan latest latest +INFO skipping noop image source check for all sleds, reason: no target release is currently set +generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a +blueprint source: planner with report: +planning report: +* zone adds waiting on blockers +* zone adds and updates are blocked: + - no in-service sleds to plan against +* adding zones despite being blocked, because: target release generation is 1 +* zone updates waiting on zone add blockers +Measurement updates: +Waiting on zone add/update blockers + + + diff --git a/nexus/reconfigurator/planning/src/measurements.rs b/nexus/reconfigurator/planning/src/measurements.rs index 5f5c15f85af..a4d58f87681 100644 --- a/nexus/reconfigurator/planning/src/measurements.rs +++ b/nexus/reconfigurator/planning/src/measurements.rs @@ -48,7 +48,10 @@ pub(crate) fn plan_measurement_updates( ) -> Result { match (current_artifacts, previous_artifacts) { // Very first blueprint, we should never get here because planner logic - // should prevent us from proceding here until we have a TUF repo + // should prevent us from proceding here until we have a TUF repo. + // Specifically, `Planner::plan_impl` short-circuits to + // `PlanningMeasurementUpdatesStepReport::NoTargetRelease` before + // calling `do_plan_measurements` when both repos are `Initial`. ( TargetReleaseDescription::Initial, TargetReleaseDescription::Initial, diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 8214098bcda..a2d038b4ea8 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -46,6 +46,7 @@ use nexus_types::deployment::DiskFilter; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::SledDetails; use nexus_types::deployment::SledFilter; +use nexus_types::deployment::TargetReleaseDescription; use nexus_types::deployment::TufRepoContentsError; use nexus_types::deployment::ZpoolFilter; use nexus_types::deployment::{ @@ -269,13 +270,31 @@ impl<'a> Planner<'a> { let add_update_blocked_reasons = self.should_plan_add_or_update(&actions_by_sled)?; - // We need to finish mupdate overrides before updating measurements - // This also ensures we do not plan measurement updates until - // all measurements are artifacts - let measurement_updates = if add_update_blocked_reasons.is_empty() { - self.do_plan_measurements()? - } else { + // We need to finish mupdate overrides before updating measurements. + // This also ensures we do not plan measurement updates until all + // measurements are artifacts. + // + // Measurement planning has a stricter precondition than zone adds: it + // also requires a TUF repo to be set. We check that explicitly here so + // that `do_plan_measurements` can treat `(Initial, Initial)` as an + // unreachable invariant violation (see + // `crate::measurements::plan_measurement_updates`). On a freshly + // initialized rack, or any other state where every visible blueprint + // zone is on the install dataset and no TUF repo has been configured, + // there's simply nothing to plan. + let no_target_release = matches!( + self.input.tuf_repo().description(), + TargetReleaseDescription::Initial, + ) && matches!( + self.input.old_repo().description(), + TargetReleaseDescription::Initial, + ); + let measurement_updates = if !add_update_blocked_reasons.is_empty() { PlanningMeasurementUpdatesStepReport::BlockedAddUpdate + } else if no_target_release { + PlanningMeasurementUpdatesStepReport::NoTargetRelease + } else { + self.do_plan_measurements()? }; // Only plan MGS-based updates once we've finished updating our @@ -301,10 +320,18 @@ impl<'a> Planner<'a> { let target_release_generation_is_one = self.input.tuf_repo().target_release_generation == Generation::from_u32(1); - let mut add = if add_update_blocked_reasons.is_empty() - || add_zones_with_mupdate_override - || target_release_generation_is_one - || measurement_updates.all_sleds_updated() + // If there are no in-service sleds, there's nothing to plan adds + // against and downstream allocators (e.g. the rack-subnet lookup in + // `available_internal_dns_subnets`) will fail. Short-circuit here. + // The corresponding blocker reason was already recorded by + // `should_plan_add_or_update` (see condition 5 in that function). + let has_in_service_sleds = + self.input.all_sled_ids(SledFilter::InService).next().is_some(); + let mut add = if has_in_service_sleds + && (add_update_blocked_reasons.is_empty() + || add_zones_with_mupdate_override + || target_release_generation_is_one + || measurement_updates.all_sleds_updated()) { self.do_plan_add(&mgs_updates)? } else { @@ -2120,6 +2147,13 @@ impl<'a> Planner<'a> { // Again, this is driven primarily by the desire to minimize the // number of versions of system software running at any time. // + // 5. If there are no in-service sleds at all, there's nothing to plan + // against. This is a degenerate state (on a production system, where + // is the Nexus running code making these decisions?) but it must + // be made explicit here: several downstream steps, including + // `do_plan_add`'s call to `available_internal_dns_subnets`, + // implicitly assume the loops over in-service sleds are non-empty.. + // // What does "any sleds" mean in this context? We don't need to care // about decommissioned or expunged sleds, so we consider in-service // sleds. @@ -2277,6 +2311,11 @@ impl<'a> Planner<'a> { } } + // Condition 5 above. + if self.input.all_sled_ids(SledFilter::InService).next().is_none() { + reasons.push("no in-service sleds to plan against".to_owned()); + } + Ok(reasons) } diff --git a/nexus/types/src/deployment/planning_report.rs b/nexus/types/src/deployment/planning_report.rs index c615b9c0923..a4171abd376 100644 --- a/nexus/types/src/deployment/planning_report.rs +++ b/nexus/types/src/deployment/planning_report.rs @@ -784,12 +784,23 @@ impl IdOrdItem for BlockedMgsUpdate { #[serde(rename_all = "snake_case", tag = "type")] #[cfg_attr(test, derive(test_strategy::Arbitrary))] pub enum PlanningMeasurementUpdatesStepReport { - Modified { count: usize }, + Modified { + count: usize, + }, BlockedAddUpdate, EmptyMeasurements, - NoSledAgentInInventory { sled_id: SledUuid }, - NoSledConfig { sled_id: SledUuid }, - StillInstallDataset { sled_id: SledUuid }, + /// The system has no target release set, so there are no measurements to + /// plan against. + NoTargetRelease, + NoSledAgentInInventory { + sled_id: SledUuid, + }, + NoSledConfig { + sled_id: SledUuid, + }, + StillInstallDataset { + sled_id: SledUuid, + }, WaitingOnInventory, MatchesInventory, } @@ -827,6 +838,12 @@ impl fmt::Display for PlanningMeasurementUpdatesStepReport { "attempted to plan with a empty measurement set" )?; } + PlanningMeasurementUpdatesStepReport::NoTargetRelease => { + writeln!( + f, + "no target release set; nothing to plan" + )?; + } PlanningMeasurementUpdatesStepReport::NoSledAgentInInventory { sled_id }=> { writeln!( f, diff --git a/openapi/nexus-lockstep.json b/openapi/nexus-lockstep.json index b20bf509b47..b2de1e35993 100644 --- a/openapi/nexus-lockstep.json +++ b/openapi/nexus-lockstep.json @@ -7165,6 +7165,21 @@ "type" ] }, + { + "description": "The system has no target release set, so there are no measurements to plan against.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "no_target_release" + ] + } + }, + "required": [ + "type" + ] + }, { "type": "object", "properties": {