diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 87122a6a8b5..9d60b21ed9c 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -1622,7 +1622,7 @@ impl DbArgs { ).await }, DbCommands::Ereport(args) => { - cmd_db_ereport(&datastore, &fetch_opts, &args).await + cmd_db_ereport(omdb, log, &datastore, &fetch_opts, &args).await } DbCommands::UserDataExport(args) => { args.exec(&omdb, &opctx, &datastore).await diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 465d762d0aa..b910f785db7 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -6,6 +6,8 @@ use super::DbFetchOptions; use super::check_limit; +use crate::Omdb; +use crate::helpers::CONNECTION_OPTIONS_HEADING; use crate::helpers::const_max_len; use crate::helpers::datetime_opt_rfc3339_concise; use crate::helpers::datetime_rfc3339_concise; @@ -23,6 +25,7 @@ use clap::Subcommand; use diesel::AggregateExpressionMethods; use diesel::dsl::{count, min}; use diesel::prelude::*; +use internal_dns_types::names::ServiceName; use nexus_db_lookup::DbConnection; use nexus_db_model::ereport as model; use nexus_db_model::ereport::DbEna; @@ -55,6 +58,23 @@ enum Commands { /// List ereport reporters Reporters(ReportersArgs), + + /// Summarize ereports by class, marking which classes a diagnosis engine + /// in Nexus consumes (fetched from Nexus's lockstep API; falls back to + /// `?` if Nexus is unreachable). + Classes(ClassesArgs), +} + +#[derive(Debug, Args, Clone)] +struct ClassesArgs { + /// URL of the Nexus lockstep API. If not provided, looks up an instance + /// in internal DNS. + #[clap( + long, + env = "OMDB_NEXUS_URL", + help_heading = CONNECTION_OPTIONS_HEADING, + )] + nexus_internal_url: Option, } #[derive(Debug, Args, Clone)] @@ -100,6 +120,8 @@ struct ReportersArgs { } pub(super) async fn cmd_db_ereport( + omdb: &Omdb, + log: &slog::Logger, datastore: &DataStore, fetch_opts: &DbFetchOptions, args: &EreportArgs, @@ -115,6 +137,10 @@ pub(super) async fn cmd_db_ereport( Commands::Reporters(ref args) => { cmd_db_ereporters(datastore, args).await } + + Commands::Classes(ref args) => { + cmd_db_ereport_classes(omdb, log, datastore, args).await + } } } @@ -466,3 +492,197 @@ async fn cmd_db_ereporters( Ok(()) } + +async fn cmd_db_ereport_classes( + omdb: &Omdb, + log: &slog::Logger, + datastore: &DataStore, + args: &ClassesArgs, +) -> anyhow::Result<()> { + use std::collections::BTreeMap; + use std::collections::BTreeSet; + + // Try to fetch the known list from Nexus. If anything fails, fall back + // to "?" for every row — DB totals are still useful even without Nexus. + let known_from_nexus = + fetch_known_classes_from_nexus(omdb, log, args).await; + let known: BTreeSet = match &known_from_nexus { + Ok(list) => list.iter().cloned().collect(), + Err(err) => { + eprintln!( + "warning: could not fetch known ereport classes from Nexus: \ + {err:#}" + ); + BTreeSet::new() + } + }; + let nexus_reachable = known_from_nexus.is_ok(); + + let conn = datastore.pool_connection_for_tests().await?; + + // Both queries are backed by partial indexes (`lookup_ereports_by_class` + // and `lookup_unmarked_ereports_by_class`) and do not full-table-scan; + // see explain tests in nexus-db-queries. + let totals: Vec<(Option, i64)> = + DataStore::ereport_class_totals_query() + .load_async(&*conn) + .await + .context("loading per-class totals")?; + let unmarkeds: Vec<(Option, i64)> = + DataStore::ereport_unmarked_class_totals_query() + .load_async(&*conn) + .await + .context("loading per-class unmarked counts")?; + + // Merge by class. Key: Option so NULL gets its own bucket. + #[derive(Default)] + struct ClassCounts { + total: i64, + unmarked: i64, + } + let mut by_class: BTreeMap, ClassCounts> = BTreeMap::new(); + for (class, total) in totals { + by_class.entry(class).or_default().total = total; + } + for (class, unmarked) in unmarkeds { + by_class.entry(class).or_default().unmarked = unmarked; + } + + // Whether the deployed Nexus has a diagnosis engine that consumes a + // given ereport class. + #[derive(PartialEq, Eq)] + enum KnownToNexus { + /// Class has rows in the DB AND is in the list returned by Nexus. + Yes, + /// Class has rows in the DB but is NOT in the list returned by Nexus. + No, + /// Class is NULL — strict-match policy means the loader never + /// surfaces these to FM analysis. + NullClass, + /// Could not reach Nexus — known/unknown is undetermined. + Unknown, + } + impl std::fmt::Display for KnownToNexus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + Self::Yes => "yes", + Self::No => "no", + Self::NullClass => "-", + Self::Unknown => "?", + }) + } + } + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct ClassRow<'a> { + known: KnownToNexus, + total: i64, + unmarked: i64, + /// Variable-length, so it goes last: wrapping on a narrow terminal + /// won't disrupt the fixed-width numeric columns. + class: &'a str, + } + + let mut rows: Vec> = by_class + .iter() + .map(|(class, ClassCounts { total, unmarked })| { + let (known_marker, class_str): (KnownToNexus, &str) = match class { + None => (KnownToNexus::NullClass, "(NULL)"), + Some(c) if !nexus_reachable => { + (KnownToNexus::Unknown, c.as_str()) + } + Some(c) => { + let k = if known.contains(c.as_str()) { + KnownToNexus::Yes + } else { + KnownToNexus::No + }; + (k, c.as_str()) + } + }; + ClassRow { + known: known_marker, + total: *total, + unmarked: *unmarked, + class: class_str, + } + }) + .collect(); + + // Sort: unknown-but-present first (highest unmarked), then known, then + // undetermined, then NULL. + rows.sort_by(|a, b| { + let priority = |row: &ClassRow<'_>| match row.known { + KnownToNexus::No => 0, + KnownToNexus::Yes => 1, + KnownToNexus::Unknown => 2, + KnownToNexus::NullClass => 3, + }; + priority(a) + .cmp(&priority(b)) + .then_with(|| b.unmarked.cmp(&a.unmarked)) + .then_with(|| a.class.cmp(b.class)) + }); + + if nexus_reachable { + println!( + "note: KNOWN reflects which classes the currently-deployed Nexus \ + knows how\nto consume.\n" + ); + } else { + println!( + "note: could not reach Nexus to determine known ereport classes.\n" + ); + } + + let mut table = tabled::Table::new(&rows); + table + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)); + println!("{table}"); + + // Footer: classes Nexus knows about but with no rows in the database. + if nexus_reachable { + let seen_known: BTreeSet<&str> = rows + .iter() + .filter(|r| r.known == KnownToNexus::Yes) + .map(|r| r.class) + .collect(); + let absent: Vec<&String> = + known.iter().filter(|c| !seen_known.contains(c.as_str())).collect(); + if !absent.is_empty() { + println!( + "\nClasses Nexus knows about but with no rows in the database:" + ); + for c in absent { + println!(" {c}"); + } + } + } + + Ok(()) +} + +async fn fetch_known_classes_from_nexus( + omdb: &Omdb, + log: &slog::Logger, + args: &ClassesArgs, +) -> anyhow::Result> { + let nexus_url = match &args.nexus_internal_url { + Some(url) => url.clone(), + None => { + let addr = omdb + .dns_lookup_one(log.clone(), ServiceName::NexusLockstep) + .await + .context("resolving Nexus lockstep service via internal DNS")?; + format!("http://{addr}") + } + }; + let client = nexus_lockstep_client::Client::new(&nexus_url, log.clone()); + let resp = client + .fm_known_ereport_classes_list() + .await + .context("calling Nexus fm_known_ereport_classes_list")?; + Ok(resp.into_inner()) +} diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 2bec4896d33..7ae9c1b2001 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -3495,22 +3495,36 @@ fn print_task_fm_analysis(details: &serde_json::Value) { AnalysisOutcome, AnalysisStatus, Outcome, PreparationStatus, }; - let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } = - match serde_json::from_value::(details.clone()) { - Err(error) => { - eprintln!( - "warning: failed to interpret task details: {:?}: {:?}", - error, details - ); - return; - } - Ok(status) => status, - }; + let FmAnalysisStatus { + parent_sitrep_id, + inv_collection_id, + known_classes, + outcome, + } = match serde_json::from_value::(details.clone()) { + Err(error) => { + eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ); + return; + } + Ok(status) => status, + }; pub const PARENT_SITREP_ID: &str = "parent sitrep ID:"; pub const INV_ID: &str = "current inventory collection ID:"; - pub const WIDTH: usize = const_max_len(&[PARENT_SITREP_ID, INV_ID]) + 1; + pub const KNOWN_CLASSES: &str = "ereport classes consumed:"; + pub const WIDTH: usize = + const_max_len(&[PARENT_SITREP_ID, INV_ID, KNOWN_CLASSES]) + 1; println!(" {PARENT_SITREP_ID: (s ago) and ran for ms parent sitrep ID: None current inventory collection ID: Some(..................... (collection)) + ereport classes consumed: (none) FAULT MANAGEMENT ANALYSIS SUMMARY ================================= /!\ analysis failed: FM analysis is not yet implemented @@ -1378,6 +1379,7 @@ task: "fm_analysis" started at (s ago) and ran for ms parent sitrep ID: None current inventory collection ID: Some(..................... (collection)) + ereport classes consumed: (none) FAULT MANAGEMENT ANALYSIS SUMMARY ================================= /!\ analysis failed: FM analysis is not yet implemented diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 75f60aa8185..10f37aaa99e 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -533,6 +533,8 @@ Commands: list List ereports info Show an ereport reporters List ereport reporters + classes Summarize ereports by class, marking which classes a diagnosis engine in Nexus consumes + (fetched from Nexus's lockstep API; falls back to `?` if Nexus is unreachable) help Print this message or the help of the given subcommand(s) Options: diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index e3104d5a2a0..c98e0178f67 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(254, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(255, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(255, "lookup-unmarked-ereports-by-class"), KnownVersion::new( 254, "drop-uninitialized-svc-enabled-not-online-state", diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 54aa6f6418d..166eaf47467 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -320,8 +320,9 @@ impl DataStore { } /// Lists ereports which have not been marked as **definitely seen** - /// (included in a committed sitrep) in the database, paginated by the - /// reporter restart ID and ENA. + /// (included in a committed sitrep) in the database, restricted to + /// ereports whose `class` is one of `classes`, paginated by the reporter + /// restart ID and ENA. /// /// Note that this filters based only on whether they have been marked in /// the database. Because marking seen ereports occurs asynchronously from @@ -329,22 +330,76 @@ impl DataStore { /// query may have already been seen. These ereports must be filtered out at /// a higher level based on the contents of the current sitrep when /// determining which ereports are *actually* new. + /// + /// Ereports with `class IS NULL` are intentionally never returned: the + /// SQL filter is `class = ANY($1::text[])`, which never matches NULL. + /// Callers (e.g. fm_analysis preparation) deliberately key off + /// `nexus_fm::diagnosis::known_ereport_classes` so that the loader + /// only surfaces ereports that FM analysis can consume; see that + /// function's documentation for the policy and rationale. pub async fn ereports_list_unmarked( &self, opctx: &OpContext, + classes: &[&str], pagparams: &DataPageParams<'_, (Uuid, DbEna)>, ) -> ListResultVec { // TODO(eliza): ereports should probably have their own resource type someday... opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - Self::ereports_list_unmarked_query(pagparams) + // An empty class set means no diagnosis engine consumes any ereport + // class. There's no value in paging through CRDB to find out + // there's nothing to load. + if classes.is_empty() { + return Ok(Vec::new()); + } + Self::ereports_list_unmarked_query(classes, pagparams) .load_async(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Diesel query that counts ereports per class across the entire (non + /// soft-deleted) ereport table. + /// + /// Backed by the `lookup_ereports_by_class` partial index (predicate + /// `time_deleted IS NULL`). The query is a *full index scan* over that + /// partial index — bounded by the predicate — but never a full table + /// scan of the primary key. The `explain_ereport_class_totals_query` + /// test asserts this. + pub fn ereport_class_totals_query() + -> impl RunnableQuery<(Option, i64)> + use<> { + use diesel::dsl::count_star; + dsl::ereport + .group_by(dsl::class) + .filter(dsl::time_deleted.is_null()) + .select((dsl::class, count_star())) + } + + /// Diesel query that counts ereports per class, restricted to ereports + /// that have not yet been marked as seen in any committed sitrep. + /// + /// Backed by the `lookup_unmarked_ereports_by_class` partial index + /// (predicate `marked_seen_in IS NULL AND time_deleted IS NULL`). + /// Bounded by the partial-index predicate; never a full table scan. + /// The `explain_ereport_unmarked_class_totals_query` test asserts this. + pub fn ereport_unmarked_class_totals_query() + -> impl RunnableQuery<(Option, i64)> + use<> { + use diesel::dsl::count_star; + dsl::ereport + .group_by(dsl::class) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::marked_seen_in.is_null()) + .select((dsl::class, count_star())) + } + fn ereports_list_unmarked_query( + classes: &[&str], pagparams: &DataPageParams<'_, (Uuid, DbEna)>, ) -> impl RunnableQuery + use<> { + // NULL-class ereports are intentionally excluded: `class = ANY(...)` + // never matches NULL. See `nexus_fm::diagnosis::known_ereport_classes` + // for the policy. + let classes: Vec = + classes.iter().map(|c| (*c).to_string()).collect(); paginated_multicolumn( dsl::ereport, (dsl::restart_id, dsl::ena), @@ -352,6 +407,7 @@ impl DataStore { ) .filter(dsl::marked_seen_in.is_null()) .filter(dsl::time_deleted.is_null()) + .filter(dsl::class.eq_any(classes)) .select(Ereport::as_select()) } @@ -434,6 +490,7 @@ mod tests { use super::*; use crate::db::explain::ExplainableAsync; use crate::db::pub_test_utils::TestDatabase; + use crate::db::pub_test_utils::explain::assert_uses_partial_index_only; use crate::db::raw_query_builder::expectorate_query_contents; use dropshot::PaginationOrder; use omicron_test_utils::dev; @@ -687,7 +744,12 @@ mod tests { direction: PaginationOrder::Ascending, limit: NonZeroU32::new(100).unwrap(), }; - let query = DataStore::ereports_list_unmarked_query(&pagparams); + let classes: &[&str] = &[ + "ereport.positronic-brain.example", + "ereport.spinning-blades.example", + ]; + let query = + DataStore::ereports_list_unmarked_query(classes, &pagparams); expectorate_query_contents( &query, "tests/output/ereports_list_unmarked.sql", @@ -708,17 +770,65 @@ mod tests { direction: PaginationOrder::Ascending, limit: NonZeroU32::new(100).unwrap(), }; - let query = DataStore::ereports_list_unmarked_query(&pagparams); + let classes: &[&str] = &[ + "ereport.positronic-brain.example", + "ereport.spinning-blades.example", + ]; + let query = + DataStore::ereports_list_unmarked_query(classes, &pagparams); let explanation = query .explain_async(&conn) .await .expect("Failed to explain query - is it valid SQL?"); - eprintln!("{explanation}"); - assert!( - !explanation.contains("FULL SCAN"), - "Found an unexpected FULL SCAN: {}", - explanation + assert_uses_partial_index_only( + &explanation, + "lookup_unmarked_ereports_by_class", + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn explain_ereport_class_totals_query() { + let logctx = dev::test_setup_log("explain_ereport_class_totals_query"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = DataStore::ereport_class_totals_query(); + let explanation = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + assert_uses_partial_index_only( + &explanation, + "lookup_ereports_by_class", + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn explain_ereport_unmarked_class_totals_query() { + let logctx = + dev::test_setup_log("explain_ereport_unmarked_class_totals_query"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = DataStore::ereport_unmarked_class_totals_query(); + let explanation = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + assert_uses_partial_index_only( + &explanation, + "lookup_unmarked_ereports_by_class", ); db.terminate().await; @@ -837,4 +947,95 @@ mod tests { db.terminate().await; logctx.cleanup_successful(); } + + /// `ereports_list_unmarked` filters by class. Verifies: + /// - Only ereports with `class IN (filter)` are returned. + /// - Ereports with NULL class are never returned, even when no class + /// filter is specifically targeting them. + /// - An empty `classes` slice returns empty without a DB roundtrip. + #[tokio::test] + async fn test_ereports_list_unmarked_class_filter() { + let logctx = + dev::test_setup_log("test_ereports_list_unmarked_class_filter"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Insert three ereports under one reporter: classes "alpha", "beta", + // and NULL. + let restart_id = EreporterRestartUuid::new_v4(); + let collector_id = OmicronZoneUuid::new_v4(); + let make = |ena: u64, class: Option<&str>| fm::EreportData { + id: fm::EreportId { restart_id, ena: ereport_types::Ena(ena) }, + time_collected: Utc::now(), + collector_id, + part_number: Some("CPN".to_string()), + serial_number: Some("SN".to_string()), + class: class.map(str::to_string), + report: serde_json::json!({}), + }; + datastore + .ereports_insert( + &opctx, + fm::Reporter::Sp { + sp_type: nexus_types::inventory::SpType::Sled, + slot: 0, + }, + vec![ + make(1, Some("alpha")), + make(2, Some("beta")), + make(3, None), + ], + ) + .await + .expect("insert should succeed"); + + let pagparams = DataPageParams { + marker: None, + direction: PaginationOrder::Ascending, + limit: NonZeroU32::new(100).unwrap(), + }; + + // Filter for "alpha" only — should return only ENA 1. + let alpha_only = datastore + .ereports_list_unmarked(opctx, &["alpha"], &pagparams) + .await + .expect("alpha-only query should succeed"); + let enas: Vec = alpha_only.iter().map(|e| e.ena.0.0).collect(); + assert_eq!(enas, vec![1], "alpha-only should match only ENA 1"); + + // Filter for "alpha" + "beta" — should return ENAs 1 and 2 but + // NEVER the NULL-class ereport (ENA 3). + let alpha_beta = datastore + .ereports_list_unmarked(opctx, &["alpha", "beta"], &pagparams) + .await + .expect("alpha+beta query should succeed"); + let mut enas: Vec = alpha_beta.iter().map(|e| e.ena.0.0).collect(); + enas.sort(); + assert_eq!( + enas, + vec![1, 2], + "alpha+beta should match ENAs 1 and 2; NULL-class ENA 3 must \ + be excluded by the class filter" + ); + + // Filter for a class that doesn't exist — should return empty. + let nope = datastore + .ereports_list_unmarked(opctx, &["nonexistent.class"], &pagparams) + .await + .expect("nonexistent-class query should succeed"); + assert!(nope.is_empty(), "nonexistent class should match nothing"); + + // Empty classes — should return empty without a DB roundtrip. + let empty = datastore + .ereports_list_unmarked(opctx, &[], &pagparams) + .await + .expect("empty-classes query should succeed"); + assert!( + empty.is_empty(), + "empty classes list must short-circuit to no results" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/pub_test_utils/explain.rs b/nexus/db-queries/src/db/pub_test_utils/explain.rs new file mode 100644 index 00000000000..ae2da895f3e --- /dev/null +++ b/nexus/db-queries/src/db/pub_test_utils/explain.rs @@ -0,0 +1,41 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Test assertions for inspecting CockroachDB `EXPLAIN` output. + +/// Asserts that an EXPLAIN plan: +/// 1. Names the expected partial index in its `table:` line. +/// 2. Does not contain any `FULL SCAN` over a non-partial index. A +/// `FULL SCAN` over a partial index is acceptable (the partial +/// predicate already bounds the rows touched), but a `FULL SCAN` +/// over the primary key or a non-partial secondary index means +/// we walked the whole table. +#[track_caller] +pub fn assert_uses_partial_index_only(explanation: &str, expected_index: &str) { + eprintln!("{explanation}"); + + let mut last_table_was_partial = false; + let mut found_expected = false; + let mut bad_full_scan = false; + for line in explanation.lines() { + let line = line.trim(); + if line.starts_with("table:") { + last_table_was_partial = line.contains("(partial index)"); + if line.contains(&format!("@{expected_index}")) { + found_expected = true; + } + } else if line.contains("FULL SCAN") && !last_table_was_partial { + bad_full_scan = true; + } + } + + assert!( + !bad_full_scan, + "Found a FULL SCAN over a non-partial index in plan:\n{explanation}" + ); + assert!( + found_expected, + "Expected plan to use index `{expected_index}`, but plan was:\n{explanation}" + ); +} diff --git a/nexus/db-queries/src/db/pub_test_utils/mod.rs b/nexus/db-queries/src/db/pub_test_utils/mod.rs index be7ef037c8f..6c55b52ed62 100644 --- a/nexus/db-queries/src/db/pub_test_utils/mod.rs +++ b/nexus/db-queries/src/db/pub_test_utils/mod.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use uuid::Uuid; pub mod crdb; +pub mod explain; pub mod helpers; pub mod multicast; diff --git a/nexus/db-queries/tests/output/ereports_list_unmarked.sql b/nexus/db-queries/tests/output/ereports_list_unmarked.sql index a8b23b78d7a..d6b54fef136 100644 --- a/nexus/db-queries/tests/output/ereports_list_unmarked.sql +++ b/nexus/db-queries/tests/output/ereports_list_unmarked.sql @@ -16,8 +16,8 @@ SELECT FROM ereport WHERE - (ereport.marked_seen_in IS NULL) AND (ereport.time_deleted IS NULL) + ((ereport.marked_seen_in IS NULL) AND (ereport.time_deleted IS NULL)) AND ereport.class = ANY ($1) ORDER BY ereport.restart_id ASC, ereport.ena ASC LIMIT - $1 + $2 diff --git a/nexus/fm/src/diagnosis.rs b/nexus/fm/src/diagnosis.rs index a1bb65b31e2..96ea491f1b1 100644 --- a/nexus/fm/src/diagnosis.rs +++ b/nexus/fm/src/diagnosis.rs @@ -11,3 +11,27 @@ pub fn analyze( ) -> anyhow::Result<()> { anyhow::bail!("FM analysis is not yet implemented") } + +/// Ereport classes that the diagnosis engine currently understands. +/// Preparation only surfaces ereports whose class is in this set — there is +/// no value in loading ereports FM analysis cannot consume. +/// +/// Empty until [`analyze`] gains real handling. Grow this alongside FM +/// analysis as new classes gain support. +/// +/// **NULL-class ereports are intentionally excluded by the loader's SQL +/// filter** (`class = ANY(...)` never matches NULL). If FM analysis ever +/// needs to handle the "couldn't extract a class" or "reporter doesn't know +/// its identity" cases, that's an explicit decision (e.g. a sentinel +/// loader path), not a default of this list. +/// +/// # Scaling +/// +/// The loader filters ereports via `WHERE class = ANY($1::text[])` against +/// the existing `lookup_ereports_by_class` index. This is comfortable up to +/// a few hundred entries; past ~1000 entries, prefer either prefix matching +/// (`class LIKE 'ereport.cpu.amd.%'`) or a `known_ereport_class` lookup +/// table joined into the query. Revisit this if the list grows that large. +pub fn known_ereport_classes() -> &'static [&'static str] { + &[] +} diff --git a/nexus/lockstep-api/src/lib.rs b/nexus/lockstep-api/src/lib.rs index abb9a0d8255..80138d6fee4 100644 --- a/nexus/lockstep-api/src/lib.rs +++ b/nexus/lockstep-api/src/lib.rs @@ -587,6 +587,17 @@ pub trait NexusLockstepApi { rqctx: RequestContext, path_params: Path, ) -> Result, HttpError>; + + // Fault management + + /// List ereport classes that this Nexus's diagnosis engines consume. + #[endpoint { + method = GET, + path = "/fm/known-ereport-classes", + }] + async fn fm_known_ereport_classes_list( + rqctx: RequestContext, + ) -> Result>, HttpError>; } /// Path parameters for Rack requests. diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index 78491c4d97d..8885eaf55c1 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -78,6 +78,14 @@ impl FmAnalysis { &mut self, opctx: &OpContext, ) -> FmAnalysisStatus { + // Snapshot the static known-classes set once, up front, so it's + // reported in the activation status regardless of which outcome + // variant fires. + let known_classes: Vec = fm::diagnosis::known_ereport_classes() + .iter() + .map(|s| (*s).to_string()) + .collect(); + let parent_sitrep = self.sitrep_rx.borrow_and_update().clone(); let parent_sitrep_id = parent_sitrep.as_ref().map(|s| s.1.id()); let Some(inv) = self.inv_rx.borrow_and_update().clone() else { @@ -88,6 +96,7 @@ impl FmAnalysis { return FmAnalysisStatus { parent_sitrep_id, inv_collection_id: None, + known_classes, outcome: status::Outcome::WaitingForInventory, }; }; @@ -123,6 +132,7 @@ impl FmAnalysis { return FmAnalysisStatus { parent_sitrep_id, inv_collection_id: Some(inv_collection_id), + known_classes, outcome: status::Outcome::PreparationError( error.to_string(), ), @@ -147,6 +157,7 @@ impl FmAnalysis { return FmAnalysisStatus { parent_sitrep_id, inv_collection_id: Some(inv_collection_id), + known_classes, outcome: status::Outcome::WaitingForNewerInventory { parent_inv_id, next_inv_min_time_started, @@ -162,6 +173,7 @@ impl FmAnalysis { FmAnalysisStatus { parent_sitrep_id, inv_collection_id: Some(inv_collection_id), + known_classes, outcome: status::Outcome::RanAnalysis { prep_status, analysis_status: outcome, @@ -195,6 +207,8 @@ impl FmAnalysis { builder: &mut fm::analysis_input::Builder, errors: &mut Vec, ) -> anyhow::Result<()> { + // Only surface ereports a diagnosis engine will consume. + let classes = fm::diagnosis::known_ereport_classes(); let mut paginator = Paginator::new( nexus_db_queries::db::datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, @@ -203,7 +217,7 @@ impl FmAnalysis { let prev_total = builder.num_ereports(); let batch = self .datastore - .ereports_list_unmarked(opctx, &p.current_pagparams()) + .ereports_list_unmarked(opctx, classes, &p.current_pagparams()) .await?; paginator = p.found_batch(&batch, &|e| { (e.restart_id.into_untyped_uuid(), e.ena) diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index 87cf70eb823..ef76c1db310 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -472,7 +472,6 @@ mod tests { use nexus_types::fm::ereport::EreportData; use nexus_types::fm::ereport::Reporter; use nexus_types::support_bundle::BundleDataSelection; - use omicron_common::api::external::DataPageParams; use omicron_test_utils::dev; use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseEreportUuid; @@ -483,7 +482,6 @@ mod tests { use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SitrepUuid; use omicron_uuid_kinds::SupportBundleUuid; - use std::num::NonZeroU32; /// Activators needed by `FmRendezvous`, pre-wired for testing. struct TestActivators { @@ -732,33 +730,23 @@ mod tests { } /// Paginate through all unseen ereports, collecting every row. + /// + /// This bypasses the production `ereports_list_unmarked` (which filters + /// by `known_ereport_classes`) because the test wants to verify that + /// rendezvous marked *every* ereport in a sitrep, regardless of class. async fn list_all_unseen_ereports( datastore: &DataStore, - opctx: &OpContext, ) -> Vec { - let batch_size = NonZeroU32::new(100).unwrap(); - let mut all = Vec::new(); - let mut marker = None; - loop { - let page = datastore - .ereports_list_unmarked( - opctx, - &DataPageParams { - marker: marker.as_ref(), - direction: dropshot::PaginationOrder::Ascending, - limit: batch_size, - }, - ) - .await - .expect("failed to list unseen ereports"); - if page.is_empty() { - break; - } - let last = page.last().unwrap(); - marker = Some((last.restart_id.into_untyped_uuid(), last.ena)); - all.extend(page); - } - all + use nexus_db_schema::schema::ereport::dsl; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + dsl::ereport + .filter(dsl::marked_seen_in.is_null()) + .filter(dsl::time_deleted.is_null()) + .order_by((dsl::restart_id, dsl::ena)) + .select(db::model::Ereport::as_select()) + .load_async(&*conn) + .await + .expect("failed to list unseen ereports") } /// Asserts that each of the provided ereport IDs has been marked as @@ -770,7 +758,7 @@ mod tests { ids: &[ereport_types::EreportId], sitrep_id: SitrepUuid, ) { - let unseen = list_all_unseen_ereports(datastore, opctx).await; + let unseen = list_all_unseen_ereports(datastore).await; for &id in ids { let ereport = datastore.ereport_fetch(opctx, id).await.unwrap_or_else(|e| { @@ -800,7 +788,7 @@ mod tests { opctx: &OpContext, ids: &[ereport_types::EreportId], ) { - let unseen = list_all_unseen_ereports(datastore, opctx).await; + let unseen = list_all_unseen_ereports(datastore).await; for &id in ids { let ereport = datastore.ereport_fetch(opctx, id).await.unwrap_or_else(|e| { diff --git a/nexus/src/lockstep_api/http_entrypoints.rs b/nexus/src/lockstep_api/http_entrypoints.rs index 0d03eb7bd90..6e62aa36a3d 100644 --- a/nexus/src/lockstep_api/http_entrypoints.rs +++ b/nexus/src/lockstep_api/http_entrypoints.rs @@ -1145,4 +1145,21 @@ impl NexusLockstepApi for NexusLockstepApiImpl { .instrument_dropshot_handler(&rqctx, handler) .await } + + async fn fm_known_ereport_classes_list( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let classes = nexus_fm::diagnosis::known_ereport_classes() + .iter() + .map(|s| (*s).to_string()) + .collect(); + Ok(HttpResponseOk(classes)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index cbf09135bde..c386492dcdf 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -916,6 +916,13 @@ pub struct SitrepGcStatus { pub struct FmAnalysisStatus { pub parent_sitrep_id: Option, pub inv_collection_id: Option, + /// Ereport classes that *this* Nexus's diagnosis engine consumes + /// (per `nexus_fm::diagnosis::known_ereport_classes`). Recorded here so + /// an operator interpreting the activation outcome can see what the + /// loader was configured to surface — e.g. whether `RanAnalysis` + /// produced no new ereports because the set is empty vs. because + /// nothing matched. + pub known_classes: Vec, pub outcome: fm_analysis::Outcome, } diff --git a/openapi/nexus-lockstep.json b/openapi/nexus-lockstep.json index b20bf509b47..c5e1680686b 100644 --- a/openapi/nexus-lockstep.json +++ b/openapi/nexus-lockstep.json @@ -982,6 +982,34 @@ } } }, + "/fm/known-ereport-classes": { + "get": { + "summary": "List ereport classes that this Nexus's diagnosis engines consume.", + "operationId": "fm_known_ereport_classes_list", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_String", + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/instances/{instance_id}/migrate": { "post": { "operationId": "instance_migrate", diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index f1ec866f290..096e8d301e0 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7379,6 +7379,17 @@ ON omicron.public.ereport ( WHERE time_deleted IS NULL; +-- Targeted partial index supporting fm_analysis preparation: filter by class +-- (FM analysis's `known_ereport_classes`) restricted to ereports that are +-- still unprocessed, ordered by the (restart_id, ena) pagination key. +CREATE INDEX IF NOT EXISTS lookup_unmarked_ereports_by_class +ON omicron.public.ereport ( + class, restart_id, ena +) +WHERE + marked_seen_in IS NULL + AND time_deleted IS NULL; + CREATE INDEX IF NOT EXISTS lookup_unseen_ereports ON omicron.public.ereport ( restart_id, ena @@ -8474,7 +8485,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '254.0.0', NULL) + (TRUE, NOW(), NOW(), '255.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/lookup-unmarked-ereports-by-class/up.sql b/schema/crdb/lookup-unmarked-ereports-by-class/up.sql new file mode 100644 index 00000000000..f58995fbc71 --- /dev/null +++ b/schema/crdb/lookup-unmarked-ereports-by-class/up.sql @@ -0,0 +1,7 @@ +CREATE INDEX IF NOT EXISTS lookup_unmarked_ereports_by_class +ON omicron.public.ereport ( + class, restart_id, ena +) +WHERE + marked_seen_in IS NULL + AND time_deleted IS NULL; diff --git a/schema/crdb/lookup-unmarked-ereports-by-class/up.verify.sql b/schema/crdb/lookup-unmarked-ereports-by-class/up.verify.sql new file mode 100644 index 00000000000..0cebdac86f1 --- /dev/null +++ b/schema/crdb/lookup-unmarked-ereports-by-class/up.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'ereport' AND index_name = 'lookup_unmarked_ereports_by_class')),'true','Schema change verification failed: index lookup_unmarked_ereports_by_class on table ereport does not exist') AS BOOL);