diff --git a/Cargo.lock b/Cargo.lock index 7a133d5af03..47ac15b35b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8875,6 +8875,7 @@ dependencies = [ "nexus-db-schema", "nexus-inventory", "nexus-lockstep-client", + "nexus-networking", "nexus-reconfigurator-preparation", "nexus-saga-recovery", "nexus-test-utils", @@ -8907,6 +8908,7 @@ dependencies = [ "steno", "strum 0.27.2", "subprocess", + "support-bundle-collection", "support-bundle-viewer", "supports-color 3.0.2", "tabled 0.15.0", @@ -8919,6 +8921,7 @@ dependencies = [ "url", "uuid", "vergen-gitcl", + "zip 4.6.1", ] [[package]] diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 27de5c27917..3ecacb60934 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -19,6 +19,7 @@ base64.workspace = true bootstrap-agent-lockstep-client.workspace = true bytes.workspace = true camino.workspace = true +camino-tempfile.workspace = true chrono.workspace = true clap.workspace = true clickhouse-admin-single-client.workspace = true @@ -54,6 +55,7 @@ nexus-db-queries.workspace = true nexus-db-schema.workspace = true nexus-inventory.workspace = true nexus-lockstep-client.workspace = true +nexus-networking.workspace = true nexus-reconfigurator-preparation.workspace = true nexus-saga-recovery.workspace = true nexus-types.workspace = true @@ -83,6 +85,7 @@ slog.workspace = true slog-error-chain.workspace = true steno.workspace = true strum.workspace = true +support-bundle-collection.workspace = true support-bundle-viewer.workspace = true supports-color.workspace = true tabled.workspace = true @@ -104,6 +107,7 @@ nexus-test-utils-macros.workspace = true omicron-nexus.workspace = true omicron-test-utils.workspace = true subprocess.workspace = true +zip.workspace = true # Disable doc builds by default for our binaries to work around issue # rust-lang/cargo#8373. These docs would not be very useful anyway. 
diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 8d3524e96a7..ca8df0783ee 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -58,6 +58,7 @@ mod oxql; mod reconfigurator; mod sled_agent; mod support_bundle; +mod support_bundle_collect; fn main() -> Result<(), anyhow::Error> { sigpipe::reset(); @@ -83,6 +84,7 @@ async fn main_impl() -> Result<(), anyhow::Error> { reconfig.run_cmd(&args, &log).await } OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await, + OmdbCommands::SupportBundle(sb) => sb.run_cmd(&args, &log).await, OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await, OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await, OmdbCommands::ClickhouseAdmin(ch) => ch.run_cmd(&args, &log).await, @@ -297,6 +299,8 @@ enum OmdbCommands { Reconfigurator(reconfigurator::ReconfiguratorArgs), /// Debug a specific Sled SledAgent(sled_agent::SledAgentArgs), + /// Collect or inspect a support bundle + SupportBundle(support_bundle_collect::SupportBundleArgs), } fn parse_dropshot_log_level( diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs new file mode 100644 index 00000000000..0a7a0af2d9a --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs @@ -0,0 +1,221 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! `omdb support-bundle collect` — collect a support bundle locally, +//! without going through Nexus. +//! +//! Unlike the Nexus background task, this path: +//! +//! - Does not register a row in the `support_bundle` table. +//! - Does not transfer the resulting bundle to a sled-agent for durable +//! storage. The zip is written to a local file path. +//! - Does not require Nexus to be up. 
It only needs CRDB, internal +//! DNS, MGS, and the rack's sled-agents reachable on the underlay. +//! +//! This is intended for incident response, where the operator may need +//! to collect a bundle precisely because Nexus is unhealthy. + +use crate::Omdb; +use crate::db::DbUrlOptions; +use anyhow::Context; +use camino::Utf8PathBuf; +use camino_tempfile::tempdir_in; +use clap::Args; +use clap::Subcommand; +use clap::ValueEnum; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::fm::ereport::EreportFilters; +use nexus_types::support_bundle::BundleDataSelection; +use omicron_uuid_kinds::SupportBundleUuid; +use std::io::Seek; +use std::io::SeekFrom; +use std::sync::Arc; +use support_bundle_collection::BundleCollection; +use support_bundle_collection::BundleInfo; +use support_bundle_collection::zip::bundle_to_zipfile; + +/// Categories of data the bundle collector knows how to gather. +/// +/// Mirrors `nexus_types::support_bundle::BundleDataCategory`, but is +/// declared here so it can derive `clap::ValueEnum` without making +/// `nexus-types` depend on clap. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, ValueEnum)] +enum BundleCategory { + Reconfigurator, + HostInfo, + SledCubbyInfo, + SpDumps, + Ereports, +} + +/// Arguments to the "omdb support-bundle" subcommand +#[derive(Debug, Args)] +pub struct SupportBundleArgs { + #[command(subcommand)] + command: SupportBundleCommands, +} + +#[derive(Debug, Subcommand)] +enum SupportBundleCommands { + /// Collect a support bundle without involving Nexus. + /// + /// Connects directly to CockroachDB, internal DNS, MGS, and the + /// rack's sled-agents — none of which depend on Nexus being up. + /// The bundle is written to a local zip file. No row is created + /// in the `support_bundle` table. 
+ Collect(CollectArgs), +} + +#[derive(Debug, Args)] +struct CollectArgs { + #[command(flatten)] + db_url_opts: DbUrlOptions, + + /// Path where the resulting bundle zip will be written. + #[clap(long, short = 'o')] + output: Utf8PathBuf, + + /// Reason recorded inside the bundle's metadata. + #[clap(long, default_value = "collected via omdb")] + reason: String, + + /// Directory to use for staging the bundle contents before zipping. + #[clap(long, default_value = "/var/tmp")] + tempdir: Utf8PathBuf, + + /// Categories of data to collect. May be supplied multiple times. + /// Defaults to all categories. + #[clap(long, value_enum)] + include: Vec<BundleCategory>, +} + +impl CollectArgs { + fn data_selection(&self) -> BundleDataSelection { + let categories: &[BundleCategory] = if self.include.is_empty() { + BundleCategory::value_variants() + } else { + self.include.as_slice() + }; + + let mut sel = BundleDataSelection::new(); + for category in categories { + sel = match category { + BundleCategory::Reconfigurator => sel.with_reconfigurator(), + BundleCategory::HostInfo => sel.with_all_sleds(), + BundleCategory::SledCubbyInfo => sel.with_sled_cubby_info(), + BundleCategory::SpDumps => sel.with_sp_dumps(), + BundleCategory::Ereports => sel.with_ereports( + EreportFilters::new() + .with_start_time( + omicron_common::now_db_precision() + - chrono::Days::new(7), + ) + .expect("no end time set, cannot fail"), + ), + }; + } + sel + } +} + +impl SupportBundleArgs { + pub async fn run_cmd( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> anyhow::Result<()> { + match &self.command { + SupportBundleCommands::Collect(args) => args.run(omdb, log).await, + } + } +} + +impl CollectArgs { + async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> { + self.db_url_opts + .with_datastore(omdb, log, async |opctx, datastore| { + self.collect(omdb, log, opctx, datastore).await + }) + .await + } + + async fn collect( + &self, + omdb: &Omdb, + log: &slog::Logger, + opctx: OpContext, + 
datastore: Arc<DataStore>, + ) -> anyhow::Result<()> { + let resolver = omdb.dns_resolver(log.clone()).await?; + + let bundle = BundleInfo { + id: SupportBundleUuid::new_v4(), + reason_for_creation: self.reason.clone(), + }; + let bundle_log = log.new(slog::o!("bundle" => bundle.id.to_string())); + eprintln!("Collecting support bundle {}", bundle.id); + + let collection = Arc::new(BundleCollection::new( + datastore, + resolver, + bundle_log, + opctx, + self.data_selection(), + bundle, + )); + + // Wire Ctrl-C to cancel the in-flight collection. + let cancel_handle = tokio::spawn({ + let token = collection.cancellation_token().clone(); + async move { + let _ = tokio::signal::ctrl_c().await; + eprintln!("\nCtrl-C received — cancelling bundle collection."); + token.cancel(); + } + }); + + let dir = tempdir_in(&self.tempdir).with_context(|| { + format!("creating temp dir under {}", self.tempdir) + })?; + let collect_result = collection.collect_bundle_locally(&dir).await; + cancel_handle.abort(); + let _ = cancel_handle.await; + let report = collect_result?; + + let zip_tempdir = self.tempdir.clone(); + let output = self.output.clone(); + tokio::task::spawn_blocking(move || -> anyhow::Result<()> { + let mut tempfile = bundle_to_zipfile(&dir, &zip_tempdir)?; + tempfile.seek(SeekFrom::Start(0))?; + let mut out = std::fs::File::create(&output) + .with_context(|| format!("creating {output}"))?; + std::io::copy(&mut tempfile, &mut out)?; + Ok(()) + }) + .await + .context("zip task panicked")??; + + eprintln!("Wrote bundle to {}", self.output); + eprintln!("{} steps executed:", report.steps.len()); + for step in &report.steps { + let dur = step.end - step.start; + eprintln!( + " {:>9}ms {:?} {}", + dur.num_milliseconds(), + step.status, + step.name, + ); + } + if let Some(ereports) = &report.ereports { + eprintln!( + "ereports: {} found, {} collected, {} errors", + ereports.n_found, + ereports.n_collected, + ereports.errors.len(), + ); + } + Ok(()) + } +} diff --git 
a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index b787dceefd9..0dd1b55843f 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -436,6 +436,51 @@ async fn test_omdb_success_cases() { ); assert!(!parsed.collections.is_empty()); + // Exercise `omdb support-bundle collect` end-to-end. We don't add this + // to the `successes.out` snapshot because the output includes a + // randomly-generated bundle UUID, timing-dependent step durations, + // and per-sled step names that would all need redaction. Instead we + // run the command and verify the resulting zip is well-formed and + // contains the expected metadata files. + let bundle_path = tmpdir.path().join("bundle.zip"); + let bundle_args: &[&str] = &[ + "support-bundle", + "collect", + "--output", + bundle_path.as_str(), + "--tempdir", + tmpdir.path().as_str(), + "--reason", + "integration test", + ]; + let mut bundle_output = String::new(); + let p = postgres_url.clone(); + let dns = cptestctx.internal_dns.dns_server.local_address().to_string(); + do_run_no_redactions( + &mut bundle_output, + move |exec| exec.env("OMDB_DB_URL", &p).env("OMDB_DNS_SERVER", &dns), + &cmd_path, + bundle_args, + ) + .await; + let zip_file = std::fs::File::open(&bundle_path).unwrap_or_else(|err| { + panic!( + "bundle zip not produced at {bundle_path}: {}\n\ + omdb output was:\n{bundle_output}", + InlineErrorChain::new(&err), + ) + }); + let mut archive = + zip::ZipArchive::new(zip_file).expect("bundle is a valid zip archive"); + for required in + ["bundle_id.txt", "meta/reason_for_creation.txt", "meta/trace.json"] + { + assert!( + archive.by_name(required).is_ok(), + "bundle zip is missing expected entry {required}", + ); + } + let ox_invocation = &["oximeter", "list-producers"]; let mut ox_output = String::new(); let ox = ox_url.clone(); diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 
b940a1f991f..db57bc2686c 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -19,6 +19,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle help Print this message or the help of the given subcommand(s) Options: @@ -54,6 +55,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle help Print this message or the help of the given subcommand(s) Options: