diff --git a/Cargo.lock b/Cargo.lock index 73663321..ecf0474c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -610,10 +610,12 @@ version = "0.1.0" dependencies = [ "dirs", "futures", + "nix", "reqwest", "serde", "serde_json", "tokio", + "wait-timeout", ] [[package]] diff --git a/crates/doctor/Cargo.toml b/crates/doctor/Cargo.toml index a4b2600f..537f8595 100644 --- a/crates/doctor/Cargo.toml +++ b/crates/doctor/Cargo.toml @@ -11,3 +11,7 @@ tokio = { version = "1.0", features = ["rt", "macros"] } futures = "0.3" reqwest = { version = "0.13", features = ["blocking", "json"] } dirs = "6" +wait-timeout = "0.2" + +[target.'cfg(unix)'.dependencies] +nix = { version = "0.31", features = ["signal"] } diff --git a/crates/doctor/src/agents.rs b/crates/doctor/src/agents.rs index d7d85ce9..cac3adc6 100644 --- a/crates/doctor/src/agents.rs +++ b/crates/doctor/src/agents.rs @@ -1,10 +1,13 @@ //! AI agent checks and fix command lookup. use std::path::{Path, PathBuf}; -use std::process::Command; use crate::checks::CLONEFILE_FIX_COMMAND; +use crate::command::{ + run_command_with_timeout, CommandError, CommandTimeout, DEFAULT_PROBE_TIMEOUT, +}; use crate::resolve::format_command_output; +use crate::timeout_check::{command_timeout_check, TimeoutCheck}; use crate::types::{ AgentVersionInfo, AuthStatus, CheckStatus, DoctorCheck, FixType, InstallSource, ResolvedBinary, }; @@ -303,7 +306,9 @@ pub fn check_single_ai_agent( if let Some(ref path_str) = resolved_path { if info.id == "ai-agent-goose" { - match Command::new(path_str).arg("acp").arg("--help").output() { + let mut command = std::process::Command::new(path_str); + command.arg("acp").arg("--help"); + match run_command_with_timeout(command, "goose acp --help", DEFAULT_PROBE_TIMEOUT) { Ok(output) if output.status.success() => { let raw = format!( "{header}\n{}\n{}", @@ -363,6 +368,20 @@ pub fn check_single_ai_agent( bridge: None, } } + Err(CommandError::Timeout { command, timeout }) => command_timeout_check( + TimeoutCheck::new( + info.id, + info.label, + CheckStatus::Fail, + &header, + command, + timeout, + ) + .path(resolved_path) + .install_source(bridge_install_source.clone()) + .main(version_readout(bridge_install_source.clone())) + .raw_suffix(Some(&search)), + ), Err(e) => DoctorCheck { id: info.id.to_string(), label: info.label.to_string(), @@ -431,6 +450,10 @@ pub fn check_single_ai_agent( Some(AuthStatus::Unknown), Some(format!("$ {cmd}\n(spawn failure: {e})")), ), + ExecOutcome::Timeout { command, timeout } => { + let timeout = CommandTimeout::new(info.label, command, timeout); + (Some(AuthStatus::Unknown), Some(timeout.raw_output())) + } // Exit 127 means the shell couldn't find the command — // PATH-shadowed or uninstalled. Not the same as "signed // out"; report Unknown so the UI doesn't offer a useless diff --git a/crates/doctor/src/checks.rs b/crates/doctor/src/checks.rs index 897f942a..01157450 100644 --- a/crates/doctor/src/checks.rs +++ b/crates/doctor/src/checks.rs @@ -2,7 +2,9 @@ use std::process::Command; +use crate::command::{run_command_with_timeout, CommandError, DEFAULT_PROBE_TIMEOUT}; use crate::resolve::format_command_output; +use crate::timeout_check::{command_timeout_check, TimeoutCheck}; use crate::types::{CheckStatus, DoctorCheck, FixType, ResolvedBinary}; /// Fix command for enabling copy-on-write git clones. @@ -42,7 +44,9 @@ pub fn check_git(resolved: &ResolvedBinary) -> DoctorCheck { }; let path_str = git_path.to_string_lossy().to_string(); - match Command::new(git_path).arg("--version").output() { + let mut command = Command::new(git_path); + command.arg("--version"); + match run_command_with_timeout(command, "git --version", DEFAULT_PROBE_TIMEOUT) { Ok(output) if output.status.success() => { let version = String::from_utf8_lossy(&output.stdout).trim().to_string(); let raw = format!( @@ -98,6 +102,12 @@ pub fn check_git(resolved: &ResolvedBinary) -> DoctorCheck { bridge: None, } } + Err(CommandError::Timeout { command, timeout }) => command_timeout_check( + TimeoutCheck::new(id, label, CheckStatus::Fail, header, command, timeout) + .path(Some(path_str)) + .install_source(resolved.install_source.clone()) + .raw_suffix(Some(search)), + ), Err(e) => DoctorCheck { id, label, @@ -155,7 +165,9 @@ pub fn check_gh(resolved: &ResolvedBinary) -> DoctorCheck { }; let path_str = gh_path.to_string_lossy().to_string(); - match Command::new(gh_path).arg("--version").output() { + let mut command = Command::new(gh_path); + command.arg("--version"); + match run_command_with_timeout(command, "gh --version", DEFAULT_PROBE_TIMEOUT) { Ok(output) if output.status.success() => { let version = String::from_utf8_lossy(&output.stdout); let first_line = version.lines().next().unwrap_or("gh").trim().to_string(); @@ -212,6 +224,12 @@ pub fn check_gh(resolved: &ResolvedBinary) -> DoctorCheck { bridge: None, } } + Err(CommandError::Timeout { command, timeout }) => command_timeout_check( + TimeoutCheck::new(id, label, CheckStatus::Fail, header, command, timeout) + .path(Some(path_str)) + .install_source(resolved.install_source.clone()) + .raw_suffix(Some(search)), + ), Err(e) => DoctorCheck { id, label, @@ -267,7 +285,9 @@ pub fn check_gh_auth(gh: &ResolvedBinary) -> DoctorCheck { } }; - match Command::new(gh_path).args(["auth", "status"]).output() { + let mut command = Command::new(gh_path); + command.args(["auth", "status"]); + match run_command_with_timeout(command, "gh auth status", DEFAULT_PROBE_TIMEOUT) { Ok(output) => { let raw = format!( "{header}\n{}", @@ -324,6 +344,9 @@ pub fn check_gh_auth(gh: &ResolvedBinary) -> DoctorCheck { } } } + Err(CommandError::Timeout { command, timeout }) => command_timeout_check( + TimeoutCheck::new(id, label, CheckStatus::Fail, header, command, timeout), + ), Err(e) => DoctorCheck { id, label, @@ -383,7 +406,9 @@ pub fn check_git_lfs(git: &ResolvedBinary, git_lfs: &ResolvedBinary) -> DoctorCh } }; - match Command::new(git_path).args(["lfs", "version"]).output() { + let mut command = Command::new(git_path); + command.args(["lfs", "version"]); + match run_command_with_timeout(command, "git lfs version", DEFAULT_PROBE_TIMEOUT) { Ok(output) if output.status.success() => { let version = String::from_utf8_lossy(&output.stdout).trim().to_string(); let path = git_lfs @@ -443,6 +468,17 @@ pub fn check_git_lfs(git: &ResolvedBinary, git_lfs: &ResolvedBinary) -> DoctorCh bridge: None, } } + Err(CommandError::Timeout { command, timeout }) => command_timeout_check( + TimeoutCheck::new(id, label, CheckStatus::Warn, header, command, timeout) + .path( + git_lfs + .path + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + ) + .install_source(git_lfs.install_source.clone()) + .raw_suffix(Some(search)), + ), Err(e) => DoctorCheck { id, label, @@ -499,10 +535,13 @@ pub fn check_clonefile(git: &ResolvedBinary) -> DoctorCheck { } }; - match Command::new(git_path) - .args(["config", "--global", "core.clonefile"]) - .output() - { + let mut command = Command::new(git_path); + command.args(["config", "--global", "core.clonefile"]); + match run_command_with_timeout( + command, + "git config --global core.clonefile", + DEFAULT_PROBE_TIMEOUT, + ) { Ok(output) if output.status.success() => { let raw = format!( "{header}\n{}", @@ -583,6 +622,10 @@ pub fn check_clonefile(git: &ResolvedBinary) -> DoctorCheck { bridge: None, } } + Err(CommandError::Timeout { command, timeout }) => command_timeout_check( + TimeoutCheck::new(id, label, CheckStatus::Warn, header, command, timeout) + .install_source(git.install_source.clone()), + ), Err(e) => DoctorCheck { id, label, diff --git a/crates/doctor/src/command.rs b/crates/doctor/src/command.rs new file mode 100644 index 00000000..e7a9e5fe --- /dev/null +++ b/crates/doctor/src/command.rs @@ -0,0 +1,253 @@ +//! Bounded subprocess helpers for doctor probes. +//! +//! The doctor crate runs many small local probes. None of them should be able +//! to pin a blocking worker forever, so probe call sites route through this +//! module instead of `Command::output()`. On timeout or wait failure, the +//! runner kills and reaps the direct child; on Unix it first targets the +//! child's process group. Descendants that deliberately detach from that group +//! can survive best-effort cleanup and keep pipe reader threads alive until +//! their inherited stdout/stderr handles close, but the caller still returns. + +use std::fmt; +use std::io::Read; +use std::process::{Child, Command, Output, Stdio}; +use std::thread::{self, JoinHandle}; +use std::time::Duration; + +use wait_timeout::ChildExt; + +pub(crate) const DEFAULT_PROBE_TIMEOUT: Duration = Duration::from_secs(10); +pub(crate) const FRESHNESS_PROBE_TIMEOUT: Duration = Duration::from_secs(15); + +#[derive(Debug)] +pub(crate) enum CommandError { + Spawn { + command: String, + source: std::io::Error, + }, + Wait { + command: String, + source: std::io::Error, + }, + Timeout { + command: String, + timeout: Duration, + }, +} + +impl fmt::Display for CommandError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CommandError::Spawn { command, source } => { + write!(f, "failed to spawn `{command}`: {source}") + } + CommandError::Wait { command, source } => { + write!(f, "failed to wait for `{command}`: {source}") + } + CommandError::Timeout { command, timeout } => { + write!( + f, + "`{command}` timed out after {}", + format_duration(*timeout) + ) + } + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct CommandTimeout { + pub label: String, + pub command: String, + pub timeout: Duration, +} + +impl CommandTimeout { + pub(crate) fn new( + label: impl Into, + command: impl Into, + timeout: Duration, + ) -> Self { + Self { + label: label.into(), + command: command.into(), + timeout, + } + } + + pub(crate) fn message(&self) -> String { + format!( + "{} timed out after {} running {}", + self.label, + format_duration(self.timeout), + self.command + ) + } + + pub(crate) fn raw_output(&self) -> String { + format!( + "$ {}\ntimed out after {}", + self.command, + format_duration(self.timeout) + ) + } +} + +pub(crate) fn format_duration(duration: Duration) -> String { + if duration.subsec_nanos() == 0 { + format!("{}s", duration.as_secs()) + } else { + format!("{duration:?}") + } +} + +pub(crate) fn run_command_with_timeout( + mut command: Command, + display_command: impl Into, + timeout: Duration, +) -> Result { + let display_command = display_command.into(); + command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + #[cfg(unix)] + { + use std::os::unix::process::CommandExt; + command.process_group(0); + } + + let mut child = command.spawn().map_err(|source| CommandError::Spawn { + command: display_command.clone(), + source, + })?; + + let stdout = child.stdout.take(); + let stderr = child.stderr.take(); + let stdout_thread = stdout.map(|mut stdout| { + thread::spawn(move || { + let mut bytes = Vec::new(); + let _ = stdout.read_to_end(&mut bytes); + bytes + }) + }); + let stderr_thread = stderr.map(|mut stderr| { + thread::spawn(move || { + let mut bytes = Vec::new(); + let _ = stderr.read_to_end(&mut bytes); + bytes + }) + }); + + let status = match child.wait_timeout(timeout) { + Ok(Some(status)) => status, + Ok(None) => { + clean_up_after_incomplete_wait(&mut child); + return Err(CommandError::Timeout { + command: display_command, + timeout, + }); + } + Err(source) => { + clean_up_after_incomplete_wait(&mut child); + return Err(CommandError::Wait { + command: display_command, + source, + }); + } + }; + + let stdout = stdout_thread.map(join_reader).unwrap_or_default(); + let stderr = stderr_thread.map(join_reader).unwrap_or_default(); + + Ok(Output { + status, + stdout, + stderr, + }) +} + +fn join_reader(handle: JoinHandle>) -> Vec { + handle.join().unwrap_or_default() +} + +fn clean_up_after_incomplete_wait(child: &mut Child) { + kill_child_process_group_or_child(child); + let _ = child.wait(); +} + +fn kill_child_process_group_or_child(child: &mut Child) { + if kill_child_process_group(child) { + return; + } + + let _ = child.kill(); +} + +#[cfg(unix)] +fn kill_child_process_group(child: &Child) -> bool { + let Ok(pid) = i32::try_from(child.id()) else { + return false; + }; + + nix::sys::signal::kill( + nix::unistd::Pid::from_raw(-pid), + nix::sys::signal::Signal::SIGKILL, + ) + .is_ok() +} + +#[cfg(not(unix))] +fn kill_child_process_group(_child: &Child) -> bool { + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn command_runner_captures_large_output() { + let mut command = Command::new("sh"); + command + .arg("-c") + .arg("yes | head -c 200000; printf stderr >&2"); + + let output = + run_command_with_timeout(command, "sh -c large output", Duration::from_secs(2)) + .unwrap(); + + assert!(output.status.success()); + assert_eq!(output.stdout.len(), 200_000); + assert_eq!(String::from_utf8_lossy(&output.stderr), "stderr"); + } + + #[cfg(unix)] + #[test] + fn command_runner_returns_when_escaped_descendant_keeps_pipes_open() { + let mut command = Command::new("sh"); + command + .arg("-c") + .arg("perl -MPOSIX=setsid -e 'setsid(); sleep 2' & wait"); + let timeout = Duration::from_millis(250); + let started = std::time::Instant::now(); + let err = + run_command_with_timeout(command, "sh -c escaped descendant", timeout).unwrap_err(); + + match err { + CommandError::Timeout { + command, + timeout: actual, + } => { + assert_eq!(command, "sh -c escaped descendant"); + assert_eq!(actual, timeout); + } + other => panic!("expected timeout, got {other:?}"), + } + assert!( + started.elapsed() < Duration::from_secs(1), + "timeout path waited for escaped descendant pipe EOF" + ); + } +} diff --git a/crates/doctor/src/freshness.rs b/crates/doctor/src/freshness.rs index 5c548f9d..f2480837 100644 --- a/crates/doctor/src/freshness.rs +++ b/crates/doctor/src/freshness.rs @@ -13,6 +13,9 @@ use std::time::{SystemTime, UNIX_EPOCH}; use serde::{Deserialize, Serialize}; use serde_json::Value; +use crate::command::{ + run_command_with_timeout, CommandError, CommandTimeout, FRESHNESS_PROBE_TIMEOUT, +}; use crate::package_ids::LatestSource; use crate::types::InstallSource; @@ -27,6 +30,13 @@ pub(crate) struct VersionInfo { pub installed: Option, pub latest: Option, pub update_available: Option, + pub command_timeouts: Vec, +} + +#[derive(Debug, Default)] +struct ProbeResult { + value: Option, + timeouts: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -296,14 +306,31 @@ fn installed_version_from_package_json( /// Run ` ` and parse the first semver-shaped token out of the /// combined stdout/stderr. Errors and missing tokens both yield `None`. -fn installed_version(binary_path: &Path, version_args: &[&str]) -> Option { - let output = Command::new(binary_path).args(version_args).output().ok()?; +fn installed_version(binary_path: &Path, version_args: &[&str]) -> ProbeResult { + let mut command = Command::new(binary_path); + command.args(version_args); + let display_command = format!("{} {}", binary_path.display(), version_args.join(" ")) + .trim() + .to_string(); + let output = match run_command_with_timeout(command, display_command, FRESHNESS_PROBE_TIMEOUT) { + Ok(output) => output, + Err(CommandError::Timeout { command, timeout }) => { + return ProbeResult { + value: None, + timeouts: vec![CommandTimeout::new("installed version", command, timeout)], + }; + } + Err(_) => return ProbeResult::default(), + }; let combined = format!( "{}\n{}", String::from_utf8_lossy(&output.stdout), String::from_utf8_lossy(&output.stderr), ); - extract_version(&combined) + ProbeResult { + value: extract_version(&combined), + timeouts: Vec::new(), + } } /// Whether a tool keeps itself up to date and therefore should never raise an @@ -321,24 +348,42 @@ fn latest_version( source: LatestSource, package_id: &str, npm_registry: Option<&str>, -) -> Option { +) -> ProbeResult { match source { LatestSource::Brew => latest_brew(package_id), LatestSource::Npm => latest_npm(package_id, npm_registry), - LatestSource::CratesIo => latest_crates_io(package_id), - LatestSource::GitHubReleases => latest_github_releases(package_id), + LatestSource::CratesIo => ProbeResult { + value: latest_crates_io(package_id), + timeouts: Vec::new(), + }, + LatestSource::GitHubReleases => ProbeResult { + value: latest_github_releases(package_id), + timeouts: Vec::new(), + }, } } -fn latest_brew(package_id: &str) -> Option { - let output = Command::new("brew") - .args(["info", "--json=v2", package_id]) - .output() - .ok()?; +fn latest_brew(package_id: &str) -> ProbeResult { + let mut command = Command::new("brew"); + command.args(["info", "--json=v2", package_id]); + let display_command = format!("brew info --json=v2 {package_id}"); + let output = match run_command_with_timeout(command, display_command, FRESHNESS_PROBE_TIMEOUT) { + Ok(output) => output, + Err(CommandError::Timeout { command, timeout }) => { + return ProbeResult { + value: None, + timeouts: vec![CommandTimeout::new("brew latest version", command, timeout)], + }; + } + Err(_) => return ProbeResult::default(), + }; if !output.status.success() { - return None; + return ProbeResult::default(); + } + ProbeResult { + value: parse_brew_info_v2(&output.stdout, package_id), + timeouts: Vec::new(), } - parse_brew_info_v2(&output.stdout, package_id) } /// Pull `formulae[0].versions.stable` or `casks[0].version` out of a @@ -367,21 +412,38 @@ pub(crate) fn parse_brew_info_v2(bytes: &[u8], _package_id: &str) -> Option) -> Option { +fn latest_npm(package_id: &str, npm_registry: Option<&str>) -> ProbeResult { let mut cmd = Command::new("npm"); cmd.args(["view", package_id, "version"]); if let Some(registry) = npm_registry { cmd.args(["--registry", registry]); } - let output = cmd.output().ok()?; + let display_command = if let Some(registry) = npm_registry { + format!("npm view {package_id} version --registry {registry}") + } else { + format!("npm view {package_id} version") + }; + let output = match run_command_with_timeout(cmd, display_command, FRESHNESS_PROBE_TIMEOUT) { + Ok(output) => output, + Err(CommandError::Timeout { command, timeout }) => { + return ProbeResult { + value: None, + timeouts: vec![CommandTimeout::new("npm latest version", command, timeout)], + }; + } + Err(_) => return ProbeResult::default(), + }; if !output.status.success() { - return None; + return ProbeResult::default(); } let raw = String::from_utf8_lossy(&output.stdout).trim().to_string(); if raw.is_empty() { - None + ProbeResult::default() } else { - Some(raw) + ProbeResult { + value: Some(raw), + timeouts: Vec::new(), + } } } @@ -470,14 +532,19 @@ pub(crate) async fn fetch_version_info( let npm_registry = npm_registry.map(|s| s.to_string()); let result = tokio::task::spawn_blocking(move || { + let mut command_timeouts = Vec::new(); let installed = if path.as_os_str().is_empty() { None } else { match &probe { - OwnedProbe::Cli(args) => installed_version( - &path, - &args.iter().map(|s| s.as_str()).collect::>(), - ), + OwnedProbe::Cli(args) => { + let result = installed_version( + &path, + &args.iter().map(|s| s.as_str()).collect::>(), + ); + command_timeouts.extend(result.timeouts); + result.value + } OwnedProbe::NpmPackageJson { package_id } => { installed_version_from_package_json(&path, package_id.as_deref()) } @@ -495,13 +562,17 @@ pub(crate) async fn fetch_version_info( }; if let Some(v) = cached { Some(v) - } else if let Some(v) = latest_version(source, &pkg, npm_registry.as_deref()) { - if let Ok(mut guard) = cache.lock() { - guard.insert(source, &pkg, v.clone(), now); - } - Some(v) } else { - None + let result = latest_version(source, &pkg, npm_registry.as_deref()); + command_timeouts.extend(result.timeouts); + if let Some(v) = result.value { + if let Ok(mut guard) = cache.lock() { + guard.insert(source, &pkg, v.clone(), now); + } + Some(v) + } else { + None + } } } else { None @@ -512,6 +583,7 @@ pub(crate) async fn fetch_version_info( installed, latest, update_available, + command_timeouts, } }) .await; diff --git a/crates/doctor/src/lib.rs b/crates/doctor/src/lib.rs index 30f9b519..917495c6 100644 --- a/crates/doctor/src/lib.rs +++ b/crates/doctor/src/lib.rs @@ -6,24 +6,29 @@ pub mod agents; pub mod checks; +mod command; pub(crate) mod freshness; pub(crate) mod package_ids; pub mod resolve; +mod timeout_check; pub mod types; pub use types::{AgentVersionInfo, CheckStatus, DoctorCheck, DoctorReport, FixType}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use std::sync::{Arc, Mutex}; use agents::{check_single_ai_agent, derive_update_command, lookup_fix_command, AI_AGENT_CHECKS}; use checks::{check_clonefile, check_gh, check_gh_auth, check_git, check_git_lfs}; +use command::{ + format_duration, run_command_with_timeout, CommandError, CommandTimeout, DEFAULT_PROBE_TIMEOUT, +}; use freshness::{ fetch_version_info, is_self_updating, load_cache, save_cache, select_installed_probe, }; use package_ids::{lookup_package_id, LatestSource, Role}; -use resolve::resolve_binary; +use resolve::resolve_binary_with_diagnostics; use types::{InstallSource, ResolvedBinary}; /// Fallback check returned when a spawn_blocking task panics. @@ -105,13 +110,20 @@ async fn collect_base_report(npm_registry: Option<&str>) -> DoctorReport { let handles: Vec<_> = binary_names .iter() - .map(|&name| tokio::task::spawn_blocking(move || (name, resolve_binary(name)))) + .map(|&name| { + tokio::task::spawn_blocking(move || { + let (resolved, timeouts) = resolve_binary_with_diagnostics(name); + (name, resolved, timeouts) + }) + }) .collect(); let mut resolved: HashMap<&str, ResolvedBinary> = HashMap::new(); + let mut resolution_timeouts = Vec::new(); for handle in handles { - if let Ok((name, rb)) = handle.await { + if let Ok((name, rb, timeouts)) = handle.await { resolved.insert(name, rb); + resolution_timeouts.extend(timeouts); } } @@ -195,9 +207,115 @@ async fn collect_base_report(npm_registry: Option<&str>) -> DoctorReport { ); } + checks.extend(timeout_diagnostic_checks(resolution_timeouts)); + DoctorReport { checks } } +fn timeout_diagnostic_checks(timeouts: Vec) -> Vec { + let mut seen_timeouts = HashSet::new(); + let mut used_ids = HashSet::new(); + let mut checks = Vec::new(); + for timeout in timeouts { + let fingerprint = ( + timeout.label.clone(), + timeout.command.clone(), + timeout.timeout, + ); + if !seen_timeouts.insert(fingerprint) { + continue; + } + let id = unique_timeout_diagnostic_id(&timeout, &mut used_ids); + checks.push(timeout_diagnostic_check(timeout, id)); + } + checks +} + +fn unique_timeout_diagnostic_id( + timeout: &CommandTimeout, + used_ids: &mut HashSet, +) -> String { + let slug_source = format!("{} {}", timeout.label, timeout.command); + let base = format!("subprocess-timeout-{}", slug(&slug_source)); + if used_ids.insert(base.clone()) { + return base; + } + + let suffixed = format!("{base}-{}", stable_timeout_suffix(timeout)); + if used_ids.insert(suffixed.clone()) { + return suffixed; + } + + let mut index = 2; + loop { + let candidate = format!("{suffixed}-{index}"); + if used_ids.insert(candidate.clone()) { + return candidate; + } + index += 1; + } +} + +fn stable_timeout_suffix(timeout: &CommandTimeout) -> String { + const FNV_OFFSET: u64 = 0xcbf29ce484222325; + const FNV_PRIME: u64 = 0x00000100000001b3; + + fn update(mut hash: u64, bytes: &[u8]) -> u64 { + for byte in bytes { + hash ^= u64::from(*byte); + hash = hash.wrapping_mul(FNV_PRIME); + } + hash + } + + let mut hash = FNV_OFFSET; + hash = update(hash, timeout.label.as_bytes()); + hash = update(hash, &[0]); + hash = update(hash, timeout.command.as_bytes()); + hash = update(hash, &[0]); + hash = update(hash, timeout.timeout.as_nanos().to_string().as_bytes()); + format!("{hash:016x}") +} + +fn timeout_diagnostic_check(timeout: CommandTimeout, id: String) -> DoctorCheck { + DoctorCheck { + id, + label: timeout.label.clone(), + status: CheckStatus::Warn, + message: timeout.message(), + fix_url: None, + fix_command: None, + fix_type: None, + path: None, + bridge_path: None, + raw_output: Some(format!( + "# Check: {}\n{}", + timeout.label, + timeout.raw_output() + )), + auth_status: None, + installed_version: None, + latest_version: None, + update_available: None, + install_source: None, + self_updating: None, + main: None, + bridge: None, + } +} + +fn slug(value: &str) -> String { + let mut out = String::new(); + for ch in value.chars() { + if ch.is_ascii_alphanumeric() { + out.push(ch.to_ascii_lowercase()); + } else if !out.ends_with('-') { + out.push('-'); + } + } + out.trim_matches('-').to_string() +} + /// Which binary behind a check a freshness probe targets. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] enum ReadoutSlot { @@ -265,6 +383,44 @@ fn apply_freshness( } } +fn apply_freshness_timeouts(check: &mut DoctorCheck, timeouts: &[CommandTimeout]) { + if timeouts.is_empty() { + return; + } + + if check.status == CheckStatus::Pass { + check.status = CheckStatus::Warn; + } + + let first = &timeouts[0]; + let summary = if timeouts.len() == 1 { + format!( + "freshness timed out after {} running {}", + format_duration(first.timeout), + first.command + ) + } else { + format!( + "{} freshness probes timed out, first after {} running {}", + timeouts.len(), + format_duration(first.timeout), + first.command + ) + }; + check.message = format!("{}; {summary}", check.message); + + let mut raw = check.raw_output.take().unwrap_or_default(); + if !raw.is_empty() { + raw.push('\n'); + } + raw.push_str("# Freshness subprocess timeouts:"); + for timeout in timeouts { + raw.push('\n'); + raw.push_str(&timeout.raw_output()); + } + check.raw_output = Some(raw); +} + /// Post-hoc pass: for every check that has a usable binary path and a known /// package id, run the installed/latest version probes in parallel and update /// the corresponding fields on the report. The on-disk cache is read once at @@ -361,18 +517,20 @@ async fn populate_freshness( for check in &mut report.checks { let is_agent = check.main.is_some() || check.bridge.is_some(); if is_agent { - if let (Some(readout), Some((info, pkg))) = ( - check.main.as_mut(), - by_target.remove(&(check.id.clone(), ReadoutSlot::Main)), - ) { - apply_freshness(readout, &info, ReadoutSlot::Main, pkg.as_deref()); + let mut freshness_timeouts = Vec::new(); + if let Some((info, pkg)) = by_target.remove(&(check.id.clone(), ReadoutSlot::Main)) { + if let Some(readout) = check.main.as_mut() { + apply_freshness(readout, &info, ReadoutSlot::Main, pkg.as_deref()); + } + freshness_timeouts.extend(info.command_timeouts); } - if let (Some(readout), Some((info, pkg))) = ( - check.bridge.as_mut(), - by_target.remove(&(check.id.clone(), ReadoutSlot::Bridge)), - ) { - apply_freshness(readout, &info, ReadoutSlot::Bridge, pkg.as_deref()); + if let Some((info, pkg)) = by_target.remove(&(check.id.clone(), ReadoutSlot::Bridge)) { + if let Some(readout) = check.bridge.as_mut() { + apply_freshness(readout, &info, ReadoutSlot::Bridge, pkg.as_deref()); + } + freshness_timeouts.extend(info.command_timeouts); } + apply_freshness_timeouts(check, &freshness_timeouts); // Mirror the headline readout (bridge if present, else main) into the // flat fields for backward-compatible consumers. let headline = check.bridge.as_ref().or(check.main.as_ref()).map(|r| { @@ -391,6 +549,7 @@ async fn populate_freshness( } } else if let Some((info, _pkg)) = by_target.remove(&(check.id.clone(), ReadoutSlot::Flat)) { + apply_freshness_timeouts(check, &info.command_timeouts); check.installed_version = info.installed; check.latest_version = info.latest; let self_updating = is_self_updating(check.install_source.as_ref()); @@ -497,6 +656,9 @@ where // writes into `raw_output`. on_line(&format!("$ {command}")); + // Fixes are intentionally not routed through the bounded probe runner: + // these are user-triggered install/auth/update actions and can reasonably + // be interactive or long-running. run_command_streaming(command, on_line).await } @@ -571,6 +733,11 @@ pub(crate) enum ExecOutcome { /// The shell itself couldn't be spawned (or `wait()` failed). Rare; means /// we have no signal at all from the inner command. Spawn(std::io::Error), + /// The command ran longer than the bounded doctor probe timeout. + Timeout { + command: String, + timeout: std::time::Duration, + }, /// The shell ran, the inner command exited non-zero. Code is `Some(127)` /// when the shell reports "command not found". Exit { @@ -586,16 +753,15 @@ pub(crate) fn execute_command_with_path_prefix( command: &str, path_prefix: &[PathBuf], ) -> ExecOutcome { - let mut cmd = build_shell_command(command, path_prefix); - cmd.stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()); - let child = match cmd.spawn() { - Ok(c) => c, - Err(e) => return ExecOutcome::Spawn(e), - }; - let output = match child.wait_with_output() { - Ok(o) => o, - Err(e) => return ExecOutcome::Spawn(e), + let cmd = build_shell_command(command, path_prefix); + let output = match run_command_with_timeout(cmd, command, DEFAULT_PROBE_TIMEOUT) { + Ok(output) => output, + Err(CommandError::Spawn { source, .. } | CommandError::Wait { source, .. }) => { + return ExecOutcome::Spawn(source) + } + Err(CommandError::Timeout { command, timeout }) => { + return ExecOutcome::Timeout { command, timeout } + } }; if output.status.success() { ExecOutcome::Ok @@ -608,9 +774,11 @@ pub(crate) fn execute_command_with_path_prefix( } /// Spawn `command` through a login shell, stream stdout/stderr lines to -/// `on_line`, and return based on the process exit status. Stderr lines are -/// also accumulated so a non-zero exit can surface a useful error message -/// (matching the non-streaming behavior of the previous `execute_command`). +/// `on_line`, and return based on the process exit status. This path is +/// deliberately unbounded: fix commands are user-triggered install/auth/update +/// actions and may prompt or run package managers. Stderr lines are also +/// accumulated so a non-zero exit can surface a useful error message (matching +/// the non-streaming behavior of the previous `execute_command`). fn run_command_streaming_blocking(command: &str, mut on_line: F) -> Result<(), String> where F: FnMut(&str), @@ -686,6 +854,51 @@ mod tests { use super::*; use std::sync::{Arc, Mutex}; + use std::time::Duration; + + fn timeout(label: &str, command: &str) -> CommandTimeout { + CommandTimeout::new(label, command, Duration::from_secs(15)) + } + + #[test] + fn timeout_diagnostic_ids_are_collision_safe() { + let checks = timeout_diagnostic_checks(vec![ + timeout("A B", "c"), + timeout("A", "B C"), + timeout("A B", "c"), + ]); + + assert_eq!(checks.len(), 2, "exact duplicate timeout should be deduped"); + assert_eq!(checks[0].id, "subprocess-timeout-a-b-c"); + assert!( + checks[1].id.starts_with("subprocess-timeout-a-b-c-"), + "slug collision should get deterministic suffix: {:?}", + checks.iter().map(|c| &c.id).collect::>(), + ); + } + + #[test] + fn apply_freshness_timeouts_appends_one_combined_block() { + let mut check = DoctorCheck { + status: CheckStatus::Pass, + message: "Installed".into(), + raw_output: Some("base raw".into()), + ..empty_check("ai-agent-test", "Test Agent") + }; + let timeouts = [ + timeout("installed version", "agent --version"), + timeout("npm latest version", "npm view agent-acp version"), + ]; + + apply_freshness_timeouts(&mut check, &timeouts); + + assert_eq!(check.status, CheckStatus::Warn); + assert!(check.message.contains("2 freshness probes timed out")); + let raw = check.raw_output.unwrap(); + assert_eq!(raw.matches("# Freshness subprocess timeouts:").count(), 1); + assert!(raw.contains("$ agent --version")); + assert!(raw.contains("$ npm view agent-acp version")); + } /// Streaming helper must invoke `on_line` for each output line of a /// successful command. Lines from `.zshrc` etc. may also appear; we only @@ -803,6 +1016,7 @@ mod tests { installed: Some("0.1.0".into()), latest: Some("0.2.0".into()), update_available: Some(true), + command_timeouts: Vec::new(), }; apply_freshness( &mut readout, @@ -829,6 +1043,7 @@ mod tests { installed: Some("0.1.0".into()), latest: Some("0.2.0".into()), update_available: Some(true), + command_timeouts: Vec::new(), }; apply_freshness(&mut readout, &info, ReadoutSlot::Bridge, Some("amp")); assert_eq!(readout.update_command.as_deref(), Some("brew upgrade amp"),); @@ -848,6 +1063,7 @@ mod tests { installed: Some("1.0.0".into()), latest: Some("2.0.0".into()), update_available: Some(true), + command_timeouts: Vec::new(), }; apply_freshness( &mut readout, @@ -875,6 +1091,7 @@ mod tests { installed: Some("0.2.0".into()), latest: Some("0.2.0".into()), update_available: Some(false), + command_timeouts: Vec::new(), }; apply_freshness(&mut readout, &info, ReadoutSlot::Main, Some("amp-acp")); assert!(readout.update_command.is_none()); diff --git a/crates/doctor/src/resolve.rs b/crates/doctor/src/resolve.rs index d0e1d368..f162969a 100644 --- a/crates/doctor/src/resolve.rs +++ b/crates/doctor/src/resolve.rs @@ -3,17 +3,29 @@ use std::path::{Path, PathBuf}; use std::process::Command; +use crate::command::{ + format_duration, run_command_with_timeout, CommandError, CommandTimeout, DEFAULT_PROBE_TIMEOUT, +}; + use super::types::{InstallSource, ResolvedBinary}; /// Resolve a binary by trying login shell path lookup, common install paths, /// then npm global install dirs. pub fn resolve_binary(cmd: &str) -> ResolvedBinary { + resolve_binary_with_diagnostics(cmd).0 +} + +pub(crate) fn resolve_binary_with_diagnostics(cmd: &str) -> (ResolvedBinary, Vec) { let mut lines = vec![format!("resolve '{cmd}':")]; + let mut timeouts = Vec::new(); // Strategy 1: Login shell path lookup (primary) lines.push(" strategy 1 — login shell path lookup:".to_string()); for (shell, lookup_cmd) in shell_lookup_commands(cmd) { - match Command::new(shell).args(["-l", "-c", &lookup_cmd]).output() { + let display_command = format!("{shell} -l -c '{lookup_cmd}'"); + let mut command = Command::new(shell); + command.args(["-l", "-c", &lookup_cmd]); + match run_command_with_timeout(command, &display_command, DEFAULT_PROBE_TIMEOUT) { Ok(output) => { let stdout = String::from_utf8_lossy(&output.stdout); if !output.status.success() { @@ -31,12 +43,7 @@ pub fn resolve_binary(cmd: &str) -> ResolvedBinary { " {shell} -l -c '{lookup_cmd}' => {} (resolved)", path.display() )); - let install_source = Some(detect_install_source(path)); - return ResolvedBinary { - path: Some(path.clone()), - search_output: lines.join("\n"), - install_source, - }; + return resolved_binary(path.clone(), &lines, timeouts); } if let Some(path) = candidate_paths.first() { @@ -53,6 +60,17 @@ pub fn resolve_binary(cmd: &str) -> ResolvedBinary { )); } } + Err(CommandError::Timeout { command, timeout }) => { + lines.push(format!( + " {display_command} => timed out after {}", + format_duration(timeout), + )); + timeouts.push(CommandTimeout::new( + format!("{cmd} binary resolution"), + command, + timeout, + )); + } Err(e) => { lines.push(format!(" {shell} -l -c '{lookup_cmd}' => error: {e}")); } @@ -70,12 +88,7 @@ pub fn resolve_binary(cmd: &str) -> ResolvedBinary { let path = PathBuf::from(dir).join(cmd); if is_executable_file(&path) { lines.push(format!(" {} => found (resolved)", path.display())); - let install_source = Some(detect_install_source(&path)); - return ResolvedBinary { - path: Some(path), - search_output: lines.join("\n"), - install_source, - }; + return resolved_binary(path, &lines, timeouts); } lines.push(format!(" {} => not found", path.display())); } @@ -94,12 +107,7 @@ pub fn resolve_binary(cmd: &str) -> ResolvedBinary { let path = dir.join(cmd); if path.exists() { lines.push(format!(" {} => found (resolved)", path.display())); - let install_source = Some(detect_install_source(&path)); - return ResolvedBinary { - path: Some(path), - search_output: lines.join("\n"), - install_source, - }; + return resolved_binary(path, &lines, timeouts); } lines.push(format!(" {} => not found", path.display())); } @@ -111,26 +119,40 @@ pub fn resolve_binary(cmd: &str) -> ResolvedBinary { // but costs one subprocess — only invoked when the static probes above // didn't find the binary). `npm prefix -g` is the version-stable // equivalent of the older `npm bin -g`; the bin dir is `/bin`. - if let Some(npm_bin_dir) = npm_global_bin_dir(&mut lines) { + if let Some(npm_bin_dir) = npm_global_bin_dir(&mut lines, &mut timeouts) { let path = npm_bin_dir.join(cmd); if path.exists() { lines.push(format!(" {} => found (resolved)", path.display())); - let install_source = Some(detect_install_source(&path)); - return ResolvedBinary { - path: Some(path), - search_output: lines.join("\n"), - install_source, - }; + return resolved_binary(path, &lines, timeouts); } lines.push(format!(" {} => not found", path.display())); } lines.push(" not found in any location".to_string()); - ResolvedBinary { - path: None, - search_output: lines.join("\n"), - install_source: None, - } + ( + ResolvedBinary { + path: None, + search_output: lines.join("\n"), + install_source: None, + }, + timeouts, + ) +} + +fn resolved_binary( + path: PathBuf, + lines: &[String], + timeouts: Vec, +) -> (ResolvedBinary, Vec) { + let install_source = Some(detect_install_source(&path)); + ( + ResolvedBinary { + path: Some(path), + search_output: lines.join("\n"), + install_source, + }, + timeouts, + ) } fn shell_lookup_commands(cmd: &str) -> [(&'static str, String); 2] { @@ -216,8 +238,27 @@ fn read_subdirs(parent: &Path) -> Vec { } } -fn npm_global_bin_dir(lines: &mut Vec) -> Option { - let output = Command::new("npm").args(["prefix", "-g"]).output().ok()?; +fn npm_global_bin_dir( + lines: &mut Vec, + timeouts: &mut Vec, +) -> Option { + let mut command = Command::new("npm"); + command.args(["prefix", "-g"]); + let output = match run_command_with_timeout(command, "npm prefix -g", DEFAULT_PROBE_TIMEOUT) { + Ok(output) => output, + Err(CommandError::Timeout { command, timeout }) => { + lines.push(format!( + " npm prefix -g => timed out after {}", + format_duration(timeout), + )); + timeouts.push(CommandTimeout::new("npm global prefix", command, timeout)); + return None; + } + Err(e) => { + lines.push(format!(" npm prefix -g => error: {e}")); + return None; + } + }; if !output.status.success() { lines.push(" npm prefix -g => failed".to_string()); return None; diff --git a/crates/doctor/src/timeout_check.rs b/crates/doctor/src/timeout_check.rs new file mode 100644 index 00000000..e4bfa607 --- /dev/null +++ b/crates/doctor/src/timeout_check.rs @@ -0,0 +1,97 @@ +use std::time::Duration; + +use crate::command::CommandTimeout; +use crate::types::{AgentVersionInfo, AuthStatus, CheckStatus, DoctorCheck, InstallSource}; + +pub(crate) struct TimeoutCheck<'a> { + id: String, + label: String, + status: CheckStatus, + header: &'a str, + command: String, + timeout: Duration, + path: Option, + bridge_path: Option, + install_source: Option, + auth_status: Option, + main: Option, + bridge: Option, + raw_suffix: Option<&'a str>, +} + +impl<'a> TimeoutCheck<'a> { + pub(crate) fn new( + id: impl Into, + label: impl Into, + status: CheckStatus, + header: &'a str, + command: impl Into, + timeout: Duration, + ) -> Self { + Self { + id: id.into(), + label: label.into(), + status, + header, + command: command.into(), + timeout, + path: None, + bridge_path: None, + install_source: None, + auth_status: None, + main: None, + bridge: None, + raw_suffix: None, + } + } + + pub(crate) fn path(mut self, path: Option) -> Self { + self.path = path; + self + } + + pub(crate) fn install_source(mut self, install_source: Option) -> Self { + self.install_source = install_source; + self + } + + pub(crate) fn main(mut self, main: Option) -> Self { + self.main = main; + self + } + + pub(crate) fn raw_suffix(mut self, raw_suffix: Option<&'a str>) -> Self { + self.raw_suffix = raw_suffix; + self + } +} + +pub(crate) fn command_timeout_check(input: TimeoutCheck<'_>) -> DoctorCheck { + let timeout = CommandTimeout::new(input.label.clone(), input.command, input.timeout); + let mut raw = format!("{}\n{}", input.header, timeout.raw_output()); + if let Some(suffix) = input.raw_suffix { + raw.push('\n'); + raw.push_str(suffix); + } + + DoctorCheck { + id: input.id, + label: input.label, + status: input.status, + message: timeout.message(), + fix_url: None, + fix_command: None, + fix_type: None, + path: input.path, + bridge_path: input.bridge_path, + raw_output: Some(raw), + auth_status: input.auth_status, + installed_version: None, + latest_version: None, + update_available: None, + install_source: input.install_source, + self_updating: None, + main: input.main, + bridge: input.bridge, + } +}