diff --git a/go.mod b/go.mod index 3661248..afdefa9 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/project-codeflare/appwrapper -go 1.22.4 +go 1.22.2 require ( github.com/distribution/reference v0.5.0 @@ -21,6 +21,9 @@ require ( sigs.k8s.io/yaml v1.4.0 ) +// On the CF release branch, we want to use ODH fork of Kueue +replace sigs.k8s.io/kueue v0.8.3 => github.com/opendatahub-io/kueue v0.8.3 + require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect diff --git a/go.sum b/go.sum index 8f34f0d..1bdfbe1 100644 --- a/go.sum +++ b/go.sum @@ -136,6 +136,8 @@ github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4/go.mod h1:54/KzLMvA5ndBVpm7B1OjLeV0cUtTLTz2bZ2OtydLpU= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opendatahub-io/kueue v0.8.3 h1:MLkHCmIrQR1KM1IcPiGuoEAT3Y+ZTs7493sMkmSUMow= +github.com/opendatahub-io/kueue v0.8.3/go.mod h1:jzRyUhAXHIpEPjt4pMx79t/Cg1g29GlNZY6wiLJE2YI= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -320,8 +322,6 @@ sigs.k8s.io/jobset v0.5.2 h1:276q5Pi/ErLYj+GQ0ydEXR6tx3LwBhEzHLQv+k8bYF4= sigs.k8s.io/jobset v0.5.2/go.mod h1:Vg99rj/6OoGvy1uvywGEHOcVLCWWJYkJtisKqdWzcFw= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/kueue v0.8.3 h1:H4ZUSWYWnbnUrchunkaeubNVbSEhO8FhyxVlI+IsNLA= -sigs.k8s.io/kueue v0.8.3/go.mod h1:JO87rmNX7d71ZQvCX+TS5H5g1NtSWHDaXXEzbJi6Muk= sigs.k8s.io/kustomize/api v0.17.2 h1:E7/Fjk7V5fboiuijoZHgs4aHuexi5Y2loXlVOAVAG5g= sigs.k8s.io/kustomize/api v0.17.2/go.mod h1:UWTz9Ct+MvoeQsHcJ5e+vziRRkwimm3HytpZgIYqye0= sigs.k8s.io/kustomize/kustomize/v5 v5.3.0 h1:OUKaQwArd1udTz3ykibOjaUwdfly6FnkQiDSSft6+Fg= diff --git a/internal/controller/appwrapper/appwrapper_controller.go b/internal/controller/appwrapper/appwrapper_controller.go index 2a91850..2f57e96 100644 --- a/internal/controller/appwrapper/appwrapper_controller.go +++ b/internal/controller/appwrapper/appwrapper_controller.go @@ -266,21 +266,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Detect externally deleted components and transition to Failed with no GracePeriod or retry - detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) if compStatus.deployed != compStatus.expected { - meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ - Type: string(workloadv1beta2.Unhealthy), - Status: metav1.ConditionTrue, - Reason: "MissingComponent", - Message: detailMsg, - }) - r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) - return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + // There may be a lag before created resources become visible in the cache; don't react too quickly. + whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime + graceDuration := r.admissionGraceDuration(ctx, aw) + if time.Now().After(whenDeployed.Add(graceDuration)) { + detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) + meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ + Type: string(workloadv1beta2.Unhealthy), + Status: metav1.ConditionTrue, + Reason: "MissingComponent", + Message: detailMsg, + }) + r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) + return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + } } // If a component's controller has put it into a failed state, we do not need // to allow a grace period. The situation will not self-correct. - detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed) + detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed) if compStatus.failed > 0 { meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ Type: string(workloadv1beta2.Unhealthy), diff --git a/test/e2e/appwrapper_test.go b/test/e2e/appwrapper_test.go index 362f507..6b808a4 100644 --- a/test/e2e/appwrapper_test.go +++ b/test/e2e/appwrapper_test.go @@ -297,8 +297,13 @@ var _ = Describe("AppWrapper E2E Test", func() { Expect(aw.Status.Retries).Should(Equal(int32(2))) }) - It("Deleting a Running Component yields a failed AppWrapper", func() { - aw := createAppWrapper(ctx, pytorchjob(2, 500)) + It("Deleting a Running Component yields a failed AppWrapper", Label("slow"), func() { + aw := toAppWrapper(pytorchjob(2, 500)) + if aw.Annotations == nil { + aw.Annotations = make(map[string]string) + } + aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation] = "5s" + Expect(getClient(ctx).Create(ctx, aw)).To(Succeed()) appwrappers = append(appwrappers, aw) Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning)) aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace})