From 69bcc0cbd2b1e1d36b0929f998515356a3ff7c5f Mon Sep 17 00:00:00 2001 From: Fabian Wiesel Date: Tue, 17 Mar 2026 12:40:52 +0100 Subject: [PATCH] Onboarding: Timeout on active instance eventually The code currently assumes that an instance would report the expected string eventually, which assumes that the metadata service lookup works. The active state only reflects that the hypervisor could boot the VM. As the instance usually boots within seconds, a five minute timeout seems safe enough to catch this eventuality. --- go.mod | 2 +- internal/controller/onboarding_controller.go | 24 ++++ .../controller/onboarding_controller_test.go | 104 +++++++++++++++++- 3 files changed, 124 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 6953213..5f34b70 100644 --- a/go.mod +++ b/go.mod @@ -104,7 +104,7 @@ require ( k8s.io/component-base v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect - k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 // indirect sigs.k8s.io/gateway-api v1.4.0 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect diff --git a/internal/controller/onboarding_controller.go b/internal/controller/onboarding_controller.go index e92a198..faac00a 100644 --- a/internal/controller/onboarding_controller.go +++ b/internal/controller/onboarding_controller.go @@ -31,6 +31,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/clock" ctrl "sigs.k8s.io/controller-runtime" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" logger "sigs.k8s.io/controller-runtime/pkg/log" @@ -51,6 +52,7 @@ var errRequeue = errors.New("requeue requested") const ( defaultWaitTime = 1 * time.Minute + smokeTestTimeout = 5 * time.Minute testProjectName = "test" testDomainName = 
"cc3test" testImageName = "cirros-kvm" @@ -63,6 +65,7 @@ type OnboardingController struct { k8sclient.Client Scheme *runtime.Scheme TestFlavorID string + Clock clock.Clock computeClient *gophercloud.ServiceClient testComputeClient *gophercloud.ServiceClient testImageClient *gophercloud.ServiceClient @@ -286,6 +289,23 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis } if !strings.Contains(consoleOutput, server.Name) { + if !server.LaunchedAt.IsZero() && r.Clock.Now().After(server.LaunchedAt.Add(smokeTestTimeout)) { + base := hv.DeepCopy() + meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeOnboarding, + Status: metav1.ConditionTrue, + Reason: kvmv1.ConditionReasonTesting, + Message: fmt.Sprintf("timeout waiting for console output since %v", server.LaunchedAt), + }) + if err := r.patchStatus(ctx, hv, base); err != nil { + return ctrl.Result{}, err + } + if err = servers.Delete(ctx, r.testComputeClient, server.ID).ExtractErr(); err != nil { + if !gophercloud.ResponseCodeIs(err, http.StatusNotFound) { + return ctrl.Result{}, fmt.Errorf("failed to delete timed out test instance %v: %w", server.ID, err) + } + } + } return ctrl.Result{RequeueAfter: defaultWaitTime}, nil } @@ -613,6 +633,10 @@ func (r *OnboardingController) SetupWithManager(mgr ctrl.Manager) error { } r.testNetworkClient.ResourceBase = fmt.Sprintf("%vv2.0/", r.testNetworkClient.Endpoint) + if r.Clock == nil { + r.Clock = clock.RealClock{} + } + return ctrl.NewControllerManagedBy(mgr). Named(OnboardingControllerName). For(&kvmv1.Hypervisor{}). 
diff --git a/internal/controller/onboarding_controller_test.go b/internal/controller/onboarding_controller_test.go index 2a74376..962d66e 100644 --- a/internal/controller/onboarding_controller_test.go +++ b/internal/controller/onboarding_controller_test.go @@ -21,6 +21,7 @@ import ( "fmt" "net/http" "os" + "time" "github.com/gophercloud/gophercloud/v2/testhelper" "github.com/gophercloud/gophercloud/v2/testhelper/client" @@ -30,6 +31,8 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/clock" + clocktesting "k8s.io/utils/clock/testing" ctrl "sigs.k8s.io/controller-runtime" kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" @@ -221,6 +224,7 @@ var _ = Describe("Onboarding Controller", func() { Client: k8sClient, Scheme: k8sClient.Scheme(), TestFlavorID: "1", + Clock: clock.RealClock{}, computeClient: client.ServiceClient(fakeServer), testComputeClient: client.ServiceClient(fakeServer), testImageClient: client.ServiceClient(fakeServer), @@ -340,6 +344,12 @@ var _ = Describe("Onboarding Controller", func() { }) Context("running tests after initial setup", func() { + var ( + serverActionHandler func(http.ResponseWriter, *http.Request) + serverDeleteHandler func(http.ResponseWriter, *http.Request) + serverDetailHandler func(http.ResponseWriter, *http.Request) + ) + BeforeEach(func(ctx SpecContext) { hv := &kvmv1.Hypervisor{} Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed()) @@ -371,11 +381,14 @@ var _ = Describe("Onboarding Controller", func() { Expect(err).NotTo(HaveOccurred()) }) + serverDetailHandler = func(w http.ResponseWriter, _ *http.Request) { + _, err := fmt.Fprint(w, emptyServersBody) + Expect(err).NotTo(HaveOccurred()) + } fakeServer.Mux.HandleFunc("GET /servers/detail", func(w http.ResponseWriter, r *http.Request) { w.Header().Add("Content-Type", "application/json") w.WriteHeader(http.StatusOK) - _, err := fmt.Fprint(w, emptyServersBody) - 
Expect(err).NotTo(HaveOccurred()) + serverDetailHandler(w, r) }) fakeServer.Mux.HandleFunc("POST /instance-ha", func(w http.ResponseWriter, r *http.Request) { @@ -406,15 +419,21 @@ var _ = Describe("Onboarding Controller", func() { Expect(err).NotTo(HaveOccurred()) }) - fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) { + serverActionHandler = func(w http.ResponseWriter, _ *http.Request) { w.Header().Add("Content-Type", "application/json") w.WriteHeader(http.StatusOK) _, err := fmt.Fprintf(w, `{"output": "FAKE CONSOLE OUTPUT\nANOTHER\nLAST LINE\nohooc--%v-%v\n"}`, hv.Name, hv.UID) Expect(err).NotTo(HaveOccurred()) - + } + fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) { + serverActionHandler(w, r) }) - fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) { + + serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusAccepted) + } + fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) { + serverDeleteHandler(w, r) }) }) @@ -571,5 +590,80 @@ var _ = Describe("Onboarding Controller", func() { }) }) + When("smoke test times out waiting for console output", func() { + var serverDeletedCalled bool + + BeforeEach(func(ctx SpecContext) { + By("Overriding HV status to Testing state") + hv := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed()) + meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeOnboarding, + Status: metav1.ConditionTrue, + Reason: kvmv1.ConditionReasonTesting, + }) + Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed()) + + // Construct the server name the controller will look for. 
+ serverName := fmt.Sprintf("%s-%s-%s", testPrefixName, hypervisorName, string(hv.UID)) + detailCallCount := 0 + + // On the first reconcile GET /servers/detail returns empty so the controller + // creates the server via POST (no launched_at → timeout cannot fire yet). + // On the second reconcile GET /servers/detail returns the ACTIVE server with a + // stale launched_at so createOrGetTestServer takes the "already-running" path and + // smokeTest fires the timeout. + serverDetailHandler = func(w http.ResponseWriter, _ *http.Request) { + if detailCallCount == 0 { + detailCallCount++ + _, err := fmt.Fprint(w, emptyServersBody) + Expect(err).NotTo(HaveOccurred()) + } else { + _, err := fmt.Fprintf(w, + `{"servers": [{"id": "server-id", "name": %q, "status": "ACTIVE", "OS-SRV-USG:launched_at": "2025-01-01T12:00:00.000000"}], "servers_links": []}`, + serverName) + Expect(err).NotTo(HaveOccurred()) + } + } + + // Set the clock to 6 minutes after the launched_at above (past the 5-minute deadline). + onboardingReconciler.Clock = clocktesting.NewFakeClock(time.Date(2025, 1, 1, 12, 6, 0, 0, time.UTC)) + serverDeletedCalled = false + + // Console output that does NOT contain the server name, so the timeout path is exercised. 
+ serverActionHandler = func(w http.ResponseWriter, _ *http.Request) { + w.Header().Add("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, err := fmt.Fprint(w, `{"output": "some unrelated console output\n"}`) + Expect(err).NotTo(HaveOccurred()) + } + serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) { + serverDeletedCalled = true + w.WriteHeader(http.StatusAccepted) + } + }) + + It("should delete the stalled server and record a timeout in the status", func(ctx SpecContext) { + By("First reconcile: controller creates the ACTIVE server; launched_at is absent so timeout does not fire yet") + _, err := onboardingReconciler.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + + By("Second reconcile: GET /servers/detail returns the stale server; timeout fires and the server is deleted") + _, err = onboardingReconciler.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying the timed-out server was deleted") + Expect(serverDeletedCalled).To(BeTrue()) + + By("Verifying the onboarding condition message indicates a timeout") + hv := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed()) + onboardingCond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding) + Expect(onboardingCond).NotTo(BeNil()) + Expect(onboardingCond.Reason).To(Equal(kvmv1.ConditionReasonTesting)) + Expect(onboardingCond.Message).To(ContainSubstring("timeout")) + }) + }) + }) })