From c8abc18ea553bb90bda633e2268f3ee129a37679 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 May 2026 14:22:09 -0700 Subject: [PATCH 1/4] [fm] Toggle to disable fm_analysis background task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alternative to #10358 (the comment-out approach). Adds an opt-in `fm.analysis_enabled` flag (default false, via #[serde(default)]) to FmTasksConfig, threaded through FmAnalysis::new and short-circuited in activate() with a 'disabled' marker on the status — matching the existing convention used by support_bundle_collector, instance_updater, sp_ereport_ingester, and friends. Customer-shipped smf/nexus/{single-sled,multi-sled}/config-partial.toml inherit the default (off); test/example configs explicitly set true so omdb golden output and dev/CI behavior are unchanged. No #[allow(dead_code)] warts and no golden-file churn. Re-enable in production: flip analysis_enabled = true in the config-partial.toml files (or change the default). Closes #10348 once the analysis subsystem is ready. --- dev-tools/omdb/src/bin/omdb/nexus.rs | 7 +++++ nexus-config/src/nexus_config.rs | 15 +++++++++++ nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/src/app/background/init.rs | 1 + nexus/src/app/background/tasks/fm_analysis.rs | 27 +++++++++++++++++-- nexus/tests/config.test.toml | 1 + nexus/types/src/internal_api/background.rs | 3 +++ 8 files changed, 54 insertions(+), 2 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 2bec4896d33..21171c33498 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -3514,6 +3514,13 @@ fn print_task_fm_analysis(details: &serde_json::Value) { println!(" FAULT MANAGEMENT ANALYSIS SUMMARY"); println!(" ================================="); let (prep_status, analysis_status) = match outcome { + Outcome::Disabled => { + println!( + " fault management analysis explicitly disabled \ + by config!" + ); + return; + } Outcome::WaitingForInventory => { println!( " analysis was not performed, as the inventory has\n \ diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index c877645a239..011cb3f9ef8 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -968,9 +968,22 @@ impl Default for MulticastGroupReconcilerConfig { } } +/// Default for [`FmTasksConfig::analysis_enabled`]. +/// +/// To re-enable fault management analysis fleet-wide once the subsystem is +/// ready (see omicron#10348), change the body of this function to return +/// `true`. This is the single source of truth for both the serde +/// "missing field" default and `FmTasksConfig::default()`. +fn default_fm_analysis_enabled() -> bool { + false +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct FmTasksConfig { + /// whether the fault management analysis background task runs. + #[serde(default = "default_fm_analysis_enabled")] + pub analysis_enabled: bool, /// period (in seconds) for periodic activations of the background task that /// drives fault management analysis. #[serde_as(as = "DurationSeconds")] @@ -993,6 +1006,7 @@ pub struct FmTasksConfig { impl Default for FmTasksConfig { fn default() -> Self { Self { + analysis_enabled: default_fm_analysis_enabled(), // Analysis is generally triggered by changes in the current sitrep, // inventory, or by the ereport ingester(s), so it need not be // periodically activated all that frequently. @@ -1575,6 +1589,7 @@ mod test { disable: false, }, fm: FmTasksConfig { + analysis_enabled: default_fm_analysis_enabled(), analysis_period_secs: Duration::from_secs(52), sitrep_load_period_secs: Duration::from_secs(48), sitrep_gc_period_secs: Duration::from_secs(49), diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 3c1a1a3700a..6196f141184 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -174,6 +174,7 @@ sp_ereport_ingester.period_secs = 30 # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +fm.analysis_enabled = true # How frequently to run analysis from the current sitrep. fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index b4026bfb1de..150d6943170 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -158,6 +158,7 @@ sp_ereport_ingester.period_secs = 30 # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +fm.analysis_enabled = true # How frequently to run analysis from the current sitrep. fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index f7d231c5166..bdf9ee01638 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -1150,6 +1150,7 @@ impl BackgroundTasksInitializer { sitrep_gc: task_fm_sitrep_gc.clone(), }, nexus_id, + config.fm.analysis_enabled, ); driver.register(TaskDefinition { name: "fm_analysis", diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index 78491c4d97d..1f49c805fc1 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -31,6 +31,9 @@ pub struct FmAnalysis { inv_rx: watch::Receiver>>, activators: Activators, nexus_id: OmicronZoneUuid, + /// When false, `activate()` short-circuits and reports the task as + /// disabled. + analysis_enabled: bool, } /// This is just because I don't like it when a constructor takes multiple @@ -48,7 +51,19 @@ impl BackgroundTask for FmAnalysis { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { Box::pin(async { - let status = self.actually_activate(opctx).await; + let status = if self.analysis_enabled { + self.actually_activate(opctx).await + } else { + slog::info!( + opctx.log, + "fault management analysis explicitly disabled by config", + ); + FmAnalysisStatus { + parent_sitrep_id: None, + inv_collection_id: None, + outcome: status::Outcome::Disabled, + } + }; match serde_json::to_value(status) { Ok(val) => val, Err(err) => { @@ -70,8 +85,16 @@ impl FmAnalysis { inv_rx: watch::Receiver>>, activators: Activators, nexus_id: OmicronZoneUuid, + analysis_enabled: bool, ) -> Self { - Self { datastore, sitrep_rx, inv_rx, activators, nexus_id } + Self { + datastore, + sitrep_rx, + inv_rx, + activators, + nexus_id, + analysis_enabled, + } } async fn actually_activate( diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index a32a9b86081..d0e418398fd 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -198,6 +198,7 @@ sp_ereport_ingester.disable = true # How frequently to check for a new fault management sitrep (made by any Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +fm.analysis_enabled = true # How frequently to run analysis from the current sitrep. fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index cbf09135bde..5b30442fb12 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -932,6 +932,9 @@ pub mod fm_analysis { #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] #[allow(clippy::large_enum_variant)] pub enum Outcome { + /// The task is disabled by config. + Disabled, + /// Fault management analysis was not performed, as no inventory /// collection has been loaded. WaitingForInventory, From 615b81e2e8953d912d65563380de4d11d9ba4647 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 May 2026 14:54:44 -0700 Subject: [PATCH 2/4] [fm] fix unit-test call sites for FmAnalysis::new CI flagged five FmAnalysis::new() calls in fm_analysis.rs's tests module that hadn't been updated for the new analysis_enabled parameter. cargo check (without --tests) and cargo nextest -p nexus-config didn't catch them; cargo clippy --all-targets did. Threads a const ANALYSIS_ENABLED = true through each call site so the intent ('these tests exercise the analysis path') is named at the call site rather than a bare `true`. --- nexus/src/app/background/tasks/fm_analysis.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index 1f49c805fc1..aba1135203f 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -372,6 +372,11 @@ mod tests { use omicron_test_utils::dev; use omicron_uuid_kinds::SitrepUuid; + /// These tests exercise the analysis path; the toggle stays on so + /// `actually_activate()` (which is what the tests call directly) is + /// reached by the same code path it would be in production-with-toggle-on. + const ANALYSIS_ENABLED: bool = true; + fn activators() -> Activators { let a = Activators { inventory_loader: Activator::new(), @@ -455,6 +460,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -487,6 +493,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -512,6 +519,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -541,6 +549,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -570,6 +579,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; From b86344526e5a5b3bf5bc176b44335bd704a5440d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 4 May 2026 09:34:53 -0700 Subject: [PATCH 3/4] Trim a little bit --- nexus-config/src/nexus_config.rs | 6 +----- nexus/src/app/background/tasks/fm_analysis.rs | 5 ----- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 011cb3f9ef8..cdb7faa8404 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -969,12 +969,8 @@ impl Default for MulticastGroupReconcilerConfig { } /// Default for [`FmTasksConfig::analysis_enabled`]. -/// -/// To re-enable fault management analysis fleet-wide once the subsystem is -/// ready (see omicron#10348), change the body of this function to return -/// `true`. This is the single source of truth for both the serde -/// "missing field" default and `FmTasksConfig::default()`. fn default_fm_analysis_enabled() -> bool { + // TODO(10349): Flip to true false } diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index aba1135203f..c56662da2e9 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -31,8 +31,6 @@ pub struct FmAnalysis { inv_rx: watch::Receiver>>, activators: Activators, nexus_id: OmicronZoneUuid, - /// When false, `activate()` short-circuits and reports the task as - /// disabled. analysis_enabled: bool, } @@ -372,9 +370,6 @@ mod tests { use omicron_test_utils::dev; use omicron_uuid_kinds::SitrepUuid; - /// These tests exercise the analysis path; the toggle stays on so - /// `actually_activate()` (which is what the tests call directly) is - /// reached by the same code path it would be in production-with-toggle-on. const ANALYSIS_ENABLED: bool = true; fn activators() -> Activators { From 3f9480d17d2d376b06256e040b1aec9c73e2426d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 4 May 2026 10:01:13 -0700 Subject: [PATCH 4/4] # --- nexus-config/src/nexus_config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index cdb7faa8404..9f035510bc9 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -970,7 +970,7 @@ impl Default for MulticastGroupReconcilerConfig { /// Default for [`FmTasksConfig::analysis_enabled`]. fn default_fm_analysis_enabled() -> bool { - // TODO(10349): Flip to true + // TODO(#10349): Flip to true false }