diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 2bec4896d33..21171c33498 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -3514,6 +3514,13 @@ fn print_task_fm_analysis(details: &serde_json::Value) { println!(" FAULT MANAGEMENT ANALYSIS SUMMARY"); println!(" ================================="); let (prep_status, analysis_status) = match outcome { + Outcome::Disabled => { + println!( + " fault management analysis explicitly disabled \ + by config!" + ); + return; + } Outcome::WaitingForInventory => { println!( " analysis was not performed, as the inventory has\n \ diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index c877645a239..9f035510bc9 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -968,9 +968,18 @@ impl Default for MulticastGroupReconcilerConfig { } } +/// Default for [`FmTasksConfig::analysis_enabled`]. +fn default_fm_analysis_enabled() -> bool { + // TODO(#10349): Flip to true + false +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct FmTasksConfig { + /// whether the fault management analysis background task runs. + #[serde(default = "default_fm_analysis_enabled")] + pub analysis_enabled: bool, /// period (in seconds) for periodic activations of the background task that /// drives fault management analysis. #[serde_as(as = "DurationSeconds")] @@ -993,6 +1002,7 @@ pub struct FmTasksConfig { impl Default for FmTasksConfig { fn default() -> Self { Self { + analysis_enabled: default_fm_analysis_enabled(), // Analysis is generally triggered by changes in the current sitrep, // inventory, or by the ereport ingester(s), so it need not be // periodically activated all that frequently. @@ -1575,6 +1585,7 @@ mod test { disable: false, }, fm: FmTasksConfig { + analysis_enabled: default_fm_analysis_enabled(), analysis_period_secs: Duration::from_secs(52), sitrep_load_period_secs: Duration::from_secs(48), sitrep_gc_period_secs: Duration::from_secs(49), diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 3c1a1a3700a..6196f141184 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -174,6 +174,7 @@ sp_ereport_ingester.period_secs = 30 # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +fm.analysis_enabled = true # How frequently to run analysis from the current sitrep. fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index b4026bfb1de..150d6943170 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -158,6 +158,7 @@ sp_ereport_ingester.period_secs = 30 # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +fm.analysis_enabled = true # How frequently to run analysis from the current sitrep. fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index f7d231c5166..bdf9ee01638 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -1150,6 +1150,7 @@ impl BackgroundTasksInitializer { sitrep_gc: task_fm_sitrep_gc.clone(), }, nexus_id, + config.fm.analysis_enabled, ); driver.register(TaskDefinition { name: "fm_analysis", diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index 78491c4d97d..c56662da2e9 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -31,6 +31,7 @@ pub struct FmAnalysis { inv_rx: watch::Receiver>>, activators: Activators, nexus_id: OmicronZoneUuid, + analysis_enabled: bool, } /// This is just because I don't like it when a constructor takes multiple @@ -48,7 +49,19 @@ impl BackgroundTask for FmAnalysis { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { Box::pin(async { - let status = self.actually_activate(opctx).await; + let status = if self.analysis_enabled { + self.actually_activate(opctx).await + } else { + slog::info!( + opctx.log, + "fault management analysis explicitly disabled by config", + ); + FmAnalysisStatus { + parent_sitrep_id: None, + inv_collection_id: None, + outcome: status::Outcome::Disabled, + } + }; match serde_json::to_value(status) { Ok(val) => val, Err(err) => { @@ -70,8 +83,16 @@ impl FmAnalysis { inv_rx: watch::Receiver>>, activators: Activators, nexus_id: OmicronZoneUuid, + analysis_enabled: bool, ) -> Self { - Self { datastore, sitrep_rx, inv_rx, activators, nexus_id } + Self { + datastore, + sitrep_rx, + inv_rx, + activators, + nexus_id, + analysis_enabled, + } } async fn actually_activate( @@ -349,6 +370,8 @@ mod tests { use omicron_test_utils::dev; use omicron_uuid_kinds::SitrepUuid; + const ANALYSIS_ENABLED: bool = true; + fn activators() -> Activators { let a = Activators { inventory_loader: Activator::new(), @@ -432,6 +455,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -464,6 +488,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -489,6 +514,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -518,6 +544,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; @@ -547,6 +574,7 @@ mod tests { inv_rx, activators(), OmicronZoneUuid::new_v4(), + ANALYSIS_ENABLED, ); let result = task.actually_activate(opctx).await; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index a32a9b86081..d0e418398fd 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -198,6 +198,7 @@ sp_ereport_ingester.disable = true # How frequently to check for a new fault management sitrep (made by any Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +fm.analysis_enabled = true # How frequently to run analysis from the current sitrep. fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index cbf09135bde..5b30442fb12 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -932,6 +932,9 @@ pub mod fm_analysis { #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] #[allow(clippy::large_enum_variant)] pub enum Outcome { + /// The task is disabled by config. + Disabled, + /// Fault management analysis was not performed, as no inventory /// collection has been loaded. WaitingForInventory,