From 126b210b4dcab0ecbf078fe5a7ad781c4c2abd50 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 10 Apr 2026 11:27:59 -0400 Subject: [PATCH] Add failure diagnostics for cloud-hypervisor startup --- lib/hypervisor/cloudhypervisor/process.go | 103 ++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/lib/hypervisor/cloudhypervisor/process.go b/lib/hypervisor/cloudhypervisor/process.go index 4fe1e10d..0618de43 100644 --- a/lib/hypervisor/cloudhypervisor/process.go +++ b/lib/hypervisor/cloudhypervisor/process.go @@ -2,7 +2,13 @@ package cloudhypervisor import ( "context" + "errors" "fmt" + "log/slog" + "net" + "os" + "path/filepath" + "strings" "syscall" "time" @@ -55,6 +61,8 @@ func (s *Starter) GetVersion(p *paths.Paths) (string, error) { // StartVM launches Cloud Hypervisor, configures the VM, and boots it. // Returns the process ID and a Hypervisor client for subsequent operations. func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { + log := logger.FromContext(ctx) + // Validate version chVersion := vmm.CHVersion(version) if !vmm.IsVersionSupported(chVersion) { @@ -85,18 +93,22 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s vmConfig := ToVMConfig(config) resp, err := hv.client.CreateVMWithResponse(ctx, vmConfig) if err != nil { + logStartVMFailureDiagnostics(ctx, log, socketPath, pid, "create_vm", err, 0, "") return 0, nil, fmt.Errorf("create vm: %w", err) } if resp.StatusCode() != 204 { + logStartVMFailureDiagnostics(ctx, log, socketPath, pid, "create_vm", nil, resp.StatusCode(), string(resp.Body)) return 0, nil, fmt.Errorf("create vm failed with status %d: %s", resp.StatusCode(), string(resp.Body)) } // 4. Boot the VM via HTTP API bootResp, err := hv.client.BootVMWithResponse(ctx) if err != nil { + logStartVMFailureDiagnostics(ctx, log, socketPath, pid, "boot_vm", err, 0, "") return 0, nil, fmt.Errorf("boot vm: %w", err) } if bootResp.StatusCode() != 204 { + logStartVMFailureDiagnostics(ctx, log, socketPath, pid, "boot_vm", nil, bootResp.StatusCode(), string(bootResp.Body)) return 0, nil, fmt.Errorf("boot vm failed with status %d: %s", bootResp.StatusCode(), string(bootResp.Body)) } @@ -164,3 +176,94 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, func ptr[T any](v T) *T { return &v } + +func logStartVMFailureDiagnostics(ctx context.Context, log *slog.Logger, socketPath string, pid int, operation string, requestErr error, statusCode int, responseBody string) { + if log == nil { + return + } + + socketExists := false + if _, err := os.Stat(socketPath); err == nil { + socketExists = true + } + + socketDialable, socketDialErr := canDialUnixSocket(socketPath) + processRunning := false + if pid > 0 && syscall.Kill(pid, 0) == nil { + processRunning = true + } + + ctxErr := ctx.Err() + ctxCause := context.Cause(ctx) + deadline, hasDeadline := ctx.Deadline() + + attrs := []any{ + "operation", operation, + "socket_path", socketPath, + "socket_exists", socketExists, + "socket_dialable", socketDialable, + "pid", pid, + "process_running", processRunning, + "ctx_err", errorString(ctxErr), + "ctx_cause", errorString(ctxCause), + "vmm_log_tail", tailFile(filepath.Join(filepath.Dir(socketPath), "logs", "vmm.log"), 4096), + } + if hasDeadline { + attrs = append(attrs, + "ctx_deadline", deadline.UTC().Format(time.RFC3339Nano), + "ctx_deadline_in_ms", time.Until(deadline).Milliseconds(), + ) + } + if socketDialErr != nil { + attrs = append(attrs, "socket_dial_error", socketDialErr.Error()) + } + if requestErr != nil { + attrs = append(attrs, + "request_error", requestErr.Error(), + "request_error_is_context_canceled", errors.Is(requestErr, context.Canceled), + "request_error_is_deadline_exceeded", errors.Is(requestErr, context.DeadlineExceeded), + ) + } + if statusCode > 0 { + attrs = append(attrs, "response_status_code", statusCode) + } + if responseBody != "" { + attrs = append(attrs, "response_body", truncateString(responseBody, 1024)) + } + + log.ErrorContext(ctx, "cloud-hypervisor start_vm diagnostics", attrs...) +} + +func canDialUnixSocket(socketPath string) (bool, error) { + conn, err := net.DialTimeout("unix", socketPath, 100*time.Millisecond) + if err != nil { + return false, err + } + _ = conn.Close() + return true, nil +} + +func tailFile(path string, maxBytes int) string { + data, err := os.ReadFile(path) + if err != nil { + return fmt.Sprintf("unavailable: %v", err) + } + if len(data) > maxBytes { + data = data[len(data)-maxBytes:] + } + return truncateString(strings.TrimSpace(string(data)), maxBytes) +} + +func truncateString(s string, max int) string { + if max <= 0 || len(s) <= max { + return s + } + return s[:max] + "...(truncated)" +} + +func errorString(err error) string { + if err == nil { + return "" + } + return err.Error() +}