Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1622,7 +1622,7 @@ impl DbArgs {
).await
},
DbCommands::Ereport(args) => {
cmd_db_ereport(&datastore, &fetch_opts, &args).await
cmd_db_ereport(omdb, log, &datastore, &fetch_opts, &args).await
}
DbCommands::UserDataExport(args) => {
args.exec(&omdb, &opctx, &datastore).await
Expand Down
220 changes: 220 additions & 0 deletions dev-tools/omdb/src/bin/omdb/db/ereport.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

use super::DbFetchOptions;
use super::check_limit;
use crate::Omdb;
use crate::helpers::CONNECTION_OPTIONS_HEADING;
use crate::helpers::const_max_len;
use crate::helpers::datetime_opt_rfc3339_concise;
use crate::helpers::datetime_rfc3339_concise;
Expand All @@ -23,6 +25,7 @@ use clap::Subcommand;
use diesel::AggregateExpressionMethods;
use diesel::dsl::{count, min};
use diesel::prelude::*;
use internal_dns_types::names::ServiceName;
use nexus_db_lookup::DbConnection;
use nexus_db_model::ereport as model;
use nexus_db_model::ereport::DbEna;
Expand Down Expand Up @@ -55,6 +58,23 @@ enum Commands {

/// List ereport reporters
Reporters(ReportersArgs),

/// Summarize ereports by class, marking which classes a diagnosis engine
/// in Nexus consumes (fetched from Nexus's lockstep API; falls back to
/// `?` if Nexus is unreachable).
Classes(ClassesArgs),
}

// Command-line arguments for the `Classes` variant of `Commands` (the
// per-class ereport summary implemented by `cmd_db_ereport_classes`).
//
// NOTE(review): the `///` comment on the field below is presumably rendered
// by clap as the option's help text, so treat edits to it as user-visible.
#[derive(Debug, Args, Clone)]
struct ClassesArgs {
// Optional explicit Nexus lockstep URL. Declared with
// `env = "OMDB_NEXUS_URL"`, so it can presumably also be supplied through
// that environment variable. When this is `None`,
// `fetch_known_classes_from_nexus` resolves `ServiceName::NexusLockstep`
// through internal DNS and builds an `http://` URL from the address.
/// URL of the Nexus lockstep API. If not provided, looks up an instance
/// in internal DNS.
#[clap(
long,
env = "OMDB_NEXUS_URL",
help_heading = CONNECTION_OPTIONS_HEADING,
)]
nexus_internal_url: Option<String>,
}

#[derive(Debug, Args, Clone)]
Expand Down Expand Up @@ -100,6 +120,8 @@ struct ReportersArgs {
}

pub(super) async fn cmd_db_ereport(
omdb: &Omdb,
log: &slog::Logger,
datastore: &DataStore,
fetch_opts: &DbFetchOptions,
args: &EreportArgs,
Expand All @@ -115,6 +137,10 @@ pub(super) async fn cmd_db_ereport(
Commands::Reporters(ref args) => {
cmd_db_ereporters(datastore, args).await
}

Commands::Classes(ref args) => {
cmd_db_ereport_classes(omdb, log, datastore, args).await
}
}
}

Expand Down Expand Up @@ -466,3 +492,197 @@ async fn cmd_db_ereporters(

Ok(())
}

async fn cmd_db_ereport_classes(
omdb: &Omdb,
log: &slog::Logger,
datastore: &DataStore,
args: &ClassesArgs,
) -> anyhow::Result<()> {
use std::collections::BTreeMap;
use std::collections::BTreeSet;

// Try to fetch the known list from Nexus. If anything fails, fall back
// to "?" for every row — DB totals are still useful even without Nexus.
let known_from_nexus =
fetch_known_classes_from_nexus(omdb, log, args).await;
let known: BTreeSet<String> = match &known_from_nexus {
Ok(list) => list.iter().cloned().collect(),
Err(err) => {
eprintln!(
"warning: could not fetch known ereport classes from Nexus: \
{err:#}"
);
BTreeSet::new()
}
};
let nexus_reachable = known_from_nexus.is_ok();

let conn = datastore.pool_connection_for_tests().await?;

// Both queries are backed by partial indexes (`lookup_ereports_by_class`
// and `lookup_unmarked_ereports_by_class`) and do not full-table-scan;
// see explain tests in nexus-db-queries.
let totals: Vec<(Option<String>, i64)> =
DataStore::ereport_class_totals_query()
.load_async(&*conn)
.await
.context("loading per-class totals")?;
let unmarkeds: Vec<(Option<String>, i64)> =
DataStore::ereport_unmarked_class_totals_query()
.load_async(&*conn)
.await
.context("loading per-class unmarked counts")?;

// Merge by class. Key: Option<String> so NULL gets its own bucket.
#[derive(Default)]
struct ClassCounts {
total: i64,
unmarked: i64,
}
let mut by_class: BTreeMap<Option<String>, ClassCounts> = BTreeMap::new();
for (class, total) in totals {
by_class.entry(class).or_default().total = total;
}
for (class, unmarked) in unmarkeds {
by_class.entry(class).or_default().unmarked = unmarked;
}

// Whether the deployed Nexus has a diagnosis engine that consumes a
// given ereport class.
#[derive(PartialEq, Eq)]
enum KnownToNexus {
/// Class has rows in the DB AND is in the list returned by Nexus.
Yes,
/// Class has rows in the DB but is NOT in the list returned by Nexus.
No,
/// Class is NULL — strict-match policy means the loader never
/// surfaces these to FM analysis.
NullClass,
/// Could not reach Nexus — known/unknown is undetermined.
Unknown,
}
impl std::fmt::Display for KnownToNexus {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(match self {
Self::Yes => "yes",
Self::No => "no",
Self::NullClass => "-",
Self::Unknown => "?",
})
}
}

#[derive(Tabled)]
#[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
struct ClassRow<'a> {
known: KnownToNexus,
total: i64,
unmarked: i64,
/// Variable-length, so it goes last: wrapping on a narrow terminal
/// won't disrupt the fixed-width numeric columns.
class: &'a str,
}

let mut rows: Vec<ClassRow<'_>> = by_class
.iter()
.map(|(class, ClassCounts { total, unmarked })| {
let (known_marker, class_str): (KnownToNexus, &str) = match class {
None => (KnownToNexus::NullClass, "(NULL)"),
Some(c) if !nexus_reachable => {
(KnownToNexus::Unknown, c.as_str())
}
Some(c) => {
let k = if known.contains(c.as_str()) {
KnownToNexus::Yes
} else {
KnownToNexus::No
};
(k, c.as_str())
}
};
ClassRow {
known: known_marker,
total: *total,
unmarked: *unmarked,
class: class_str,
}
})
.collect();

// Sort: unknown-but-present first (highest unmarked), then known, then
// undetermined, then NULL.
rows.sort_by(|a, b| {
let priority = |row: &ClassRow<'_>| match row.known {
KnownToNexus::No => 0,
KnownToNexus::Yes => 1,
KnownToNexus::Unknown => 2,
KnownToNexus::NullClass => 3,
};
priority(a)
.cmp(&priority(b))
.then_with(|| b.unmarked.cmp(&a.unmarked))
.then_with(|| a.class.cmp(b.class))
});

if nexus_reachable {
println!(
"note: KNOWN reflects which classes the currently-deployed Nexus \
knows how\nto consume.\n"
);
} else {
println!(
"note: could not reach Nexus to determine known ereport classes.\n"
);
}

let mut table = tabled::Table::new(&rows);
table
.with(tabled::settings::Style::empty())
.with(tabled::settings::Padding::new(0, 1, 0, 0));
println!("{table}");

// Footer: classes Nexus knows about but with no rows in the database.
if nexus_reachable {
let seen_known: BTreeSet<&str> = rows
.iter()
.filter(|r| r.known == KnownToNexus::Yes)
.map(|r| r.class)
.collect();
let absent: Vec<&String> =
known.iter().filter(|c| !seen_known.contains(c.as_str())).collect();
if !absent.is_empty() {
println!(
"\nClasses Nexus knows about but with no rows in the database:"
);
for c in absent {
println!(" {c}");
}
}
}

Ok(())
}

/// Requests the list of known ereport classes from Nexus's lockstep API.
///
/// The URL in `args.nexus_internal_url` is used when it is set. Otherwise
/// `ServiceName::NexusLockstep` is looked up via `omdb.dns_lookup_one` and
/// the resulting address is turned into an `http://` URL.
///
/// # Errors
///
/// Returns an error if the DNS lookup fails or if the
/// `fm_known_ereport_classes_list` call fails.
async fn fetch_known_classes_from_nexus(
    omdb: &Omdb,
    log: &slog::Logger,
    args: &ClassesArgs,
) -> anyhow::Result<Vec<String>> {
    let nexus_url = if let Some(url) = args.nexus_internal_url.as_ref() {
        url.clone()
    } else {
        // No explicit URL was given: resolve a Nexus lockstep instance.
        let addr = omdb
            .dns_lookup_one(log.clone(), ServiceName::NexusLockstep)
            .await
            .context("resolving Nexus lockstep service via internal DNS")?;
        format!("http://{addr}")
    };

    let client = nexus_lockstep_client::Client::new(&nexus_url, log.clone());
    let classes = client
        .fm_known_ereport_classes_list()
        .await
        .context("calling Nexus fm_known_ereport_classes_list")?
        .into_inner();
    Ok(classes)
}
38 changes: 26 additions & 12 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3495,22 +3495,36 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
AnalysisOutcome, AnalysisStatus, Outcome, PreparationStatus,
};

let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } =
match serde_json::from_value::<FmAnalysisStatus>(details.clone()) {
Err(error) => {
eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
);
return;
}
Ok(status) => status,
};
let FmAnalysisStatus {
parent_sitrep_id,
inv_collection_id,
known_classes,
outcome,
} = match serde_json::from_value::<FmAnalysisStatus>(details.clone()) {
Err(error) => {
eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
);
return;
}
Ok(status) => status,
};
pub const PARENT_SITREP_ID: &str = "parent sitrep ID:";
pub const INV_ID: &str = "current inventory collection ID:";
pub const WIDTH: usize = const_max_len(&[PARENT_SITREP_ID, INV_ID]) + 1;
pub const KNOWN_CLASSES: &str = "ereport classes consumed:";
pub const WIDTH: usize =
const_max_len(&[PARENT_SITREP_ID, INV_ID, KNOWN_CLASSES]) + 1;
println!(" {PARENT_SITREP_ID:<WIDTH$}{parent_sitrep_id:?}");
println!(" {INV_ID:<WIDTH$}{inv_collection_id:?}");
if known_classes.is_empty() {
println!(" {KNOWN_CLASSES:<WIDTH$}(none)");
} else {
println!(" {KNOWN_CLASSES:<WIDTH$}({} total)", known_classes.len());
for class in &known_classes {
println!(" - {class}");
}
}
println!(" FAULT MANAGEMENT ANALYSIS SUMMARY");
println!(" =================================");
let (prep_status, analysis_status) = match outcome {
Expand Down
2 changes: 2 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,7 @@ task: "fm_analysis"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
ereport classes consumed: (none)
FAULT MANAGEMENT ANALYSIS SUMMARY
=================================
/!\ analysis failed: FM analysis is not yet implemented
Expand Down Expand Up @@ -1378,6 +1379,7 @@ task: "fm_analysis"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
ereport classes consumed: (none)
FAULT MANAGEMENT ANALYSIS SUMMARY
=================================
/!\ analysis failed: FM analysis is not yet implemented
Expand Down
2 changes: 2 additions & 0 deletions dev-tools/omdb/tests/usage_errors.out
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,8 @@ Commands:
list List ereports
info Show an ereport
reporters List ereport reporters
classes Summarize ereports by class, marking which classes a diagnosis engine in Nexus consumes
(fetched from Nexus's lockstep API; falls back to `?` if Nexus is unreachable)
help Print this message or the help of the given subcommand(s)

Options:
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(254, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(255, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(255, "lookup-unmarked-ereports-by-class"),
KnownVersion::new(
254,
"drop-uninitialized-svc-enabled-not-online-state",
Expand Down
Loading
Loading