Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
aab106b
backlevel go version in go.mod for codeflare
dgrove-oss Jul 12, 2024
ad486fb
Merge branch 'main' into codeflare-releases
dgrove-oss Jul 31, 2024
bfaac68
merge main up to v0.24.0
dgrove-oss Aug 28, 2024
30a8af1
merge main up to v0.25.0
dgrove-oss Sep 26, 2024
d3825c2
Merge main up to 0.26.0 into codeflare-releases
dgrove-oss Oct 16, 2024
f71c632
Merge branch 'main' into codeflare-releases for cf0.27.0
dgrove-oss Nov 5, 2024
d2ee6aa
add replace directive for kueue to make code buildable
dgrove-oss Nov 15, 2024
3ef1028
run main CI job on codeflare-releases branch (#270)
dgrove-oss Nov 15, 2024
7cbccf8
Use ODH fork of Kueue on the codeflare-releases branch (#272)
dgrove-oss Nov 18, 2024
94d6de5
update mlbatch step of release instructions (#265)
dgrove-oss Nov 5, 2024
d126ebb
bump version of action-gh-release to upgrade to Node.js 20 (#266)
dgrove-oss Nov 5, 2024
b0a3eb9
bump kind to 0.25; simplify using different k8s versions for cluster …
dgrove-oss Nov 11, 2024
86987f1
avoid blanket override of namespace when kustomizing kueue (#269)
dgrove-oss Nov 12, 2024
d5a100b
add initial grace period before triggering a failure due to missing c…
dgrove-oss Nov 27, 2024
fbb0c6b
rename user-queue to default-queue to match mlbatch conventions (#274)
dgrove-oss Nov 27, 2024
515c0a3
Ensure consistent resource status if create errors (#277)
dgrove-oss Dec 12, 2024
b6bd577
flag config error if admissionGP exceeds warmupGP (#278)
dgrove-oss Dec 12, 2024
82cca1b
prepare for 0.28 release (#279)
dgrove-oss Dec 12, 2024
b7813ce
Merge branch 'main' into cf-029
dgrove-oss Dec 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/project-codeflare/appwrapper

go 1.22.4
go 1.22.2

require (
github.com/distribution/reference v0.5.0
Expand All @@ -21,6 +21,9 @@ require (
sigs.k8s.io/yaml v1.4.0
)

// On the CF release branch, we want to use the ODH (opendatahub-io) fork of Kueue
replace sigs.k8s.io/kueue v0.8.3 => github.com/opendatahub-io/kueue v0.8.3

require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe
github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4/go.mod h1:54/KzLMvA5ndBVpm7B1OjLeV0cUtTLTz2bZ2OtydLpU=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opendatahub-io/kueue v0.8.3 h1:MLkHCmIrQR1KM1IcPiGuoEAT3Y+ZTs7493sMkmSUMow=
github.com/opendatahub-io/kueue v0.8.3/go.mod h1:jzRyUhAXHIpEPjt4pMx79t/Cg1g29GlNZY6wiLJE2YI=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
Expand Down Expand Up @@ -320,8 +322,6 @@ sigs.k8s.io/jobset v0.5.2 h1:276q5Pi/ErLYj+GQ0ydEXR6tx3LwBhEzHLQv+k8bYF4=
sigs.k8s.io/jobset v0.5.2/go.mod h1:Vg99rj/6OoGvy1uvywGEHOcVLCWWJYkJtisKqdWzcFw=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/kueue v0.8.3 h1:H4ZUSWYWnbnUrchunkaeubNVbSEhO8FhyxVlI+IsNLA=
sigs.k8s.io/kueue v0.8.3/go.mod h1:JO87rmNX7d71ZQvCX+TS5H5g1NtSWHDaXXEzbJi6Muk=
sigs.k8s.io/kustomize/api v0.17.2 h1:E7/Fjk7V5fboiuijoZHgs4aHuexi5Y2loXlVOAVAG5g=
sigs.k8s.io/kustomize/api v0.17.2/go.mod h1:UWTz9Ct+MvoeQsHcJ5e+vziRRkwimm3HytpZgIYqye0=
sigs.k8s.io/kustomize/kustomize/v5 v5.3.0 h1:OUKaQwArd1udTz3ykibOjaUwdfly6FnkQiDSSft6+Fg=
Expand Down
25 changes: 15 additions & 10 deletions internal/controller/appwrapper/appwrapper_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,21 +266,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}

// Detect externally deleted components and, once the admission grace period has elapsed, transition to Failed with no retry
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
if compStatus.deployed != compStatus.expected {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "MissingComponent",
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
// There may be a lag before created resources become visible in the cache; don't react too quickly.
whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
graceDuration := r.admissionGraceDuration(ctx, aw)
if time.Now().After(whenDeployed.Add(graceDuration)) {
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "MissingComponent",
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
}
}

// If a component's controller has put it into a failed state, we do not need
// to allow a grace period. The situation will not self-correct.
detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed)
detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed)
if compStatus.failed > 0 {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Expand Down
9 changes: 7 additions & 2 deletions test/e2e/appwrapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,13 @@ var _ = Describe("AppWrapper E2E Test", func() {
Expect(aw.Status.Retries).Should(Equal(int32(2)))
})

It("Deleting a Running Component yields a failed AppWrapper", func() {
aw := createAppWrapper(ctx, pytorchjob(2, 500))
It("Deleting a Running Component yields a failed AppWrapper", Label("slow"), func() {
aw := toAppWrapper(pytorchjob(2, 500))
if aw.Annotations == nil {
aw.Annotations = make(map[string]string)
}
aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation] = "5s"
Expect(getClient(ctx).Create(ctx, aw)).To(Succeed())
appwrappers = append(appwrappers, aw)
Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning))
aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace})
Expand Down
Loading