Skip to content

Commit 1669faa

Browse files
authored
fix: commitment change API integration test (#592)
- fix hypervisor crd Tilt reference - make test with proper summary based on gotestsum - Adding integration tests for commitment change API - Commitment change API handles more corner cases - commitment config added - moving endpoint to "/v1/commitments/..."
1 parent 03638cf commit 1669faa

10 files changed

Lines changed: 1791 additions & 217 deletions

File tree

Makefile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
2828
test: ## Run all tests.
2929
go test ./...
3030

31+
.PHONY: testsum
32+
testsum: gotestsum ## Run all tests (clean output for passing, verbose for failing). Options: WATCH=1, RUN=<pattern>, PACKAGE=<pkg>, FORMAT=<fmt> (e.g., standard-verbose for all output)
33+
$(GOTESTSUM) \
34+
$(if $(WATCH),--watch) \
35+
--format $(if $(FORMAT),$(FORMAT),testname) \
36+
--hide-summary=all \
37+
-- \
38+
$(if $(VERBOSE),-v) \
39+
$(if $(RUN),-run $(RUN)) \
40+
$(if $(PACKAGE),$(PACKAGE),./...)
41+
3142
.PHONY: generate
3243
generate: deepcopy crds ## Regenerate CRDs and DeepCopy after API type changes.
3344

@@ -45,9 +56,11 @@ $(LOCALBIN):
4556

4657
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
4758
GOLANGCI_LINT = $(LOCALBIN)/golangci-lint
59+
GOTESTSUM = $(LOCALBIN)/gotestsum
4860

4961
CONTROLLER_TOOLS_VERSION ?= v0.20.0
5062
GOLANGCI_LINT_VERSION ?= v2.9.0
63+
GOTESTSUM_VERSION ?= v1.13.0
5164

5265
.PHONY: controller-gen
5366
controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary.
@@ -59,6 +72,11 @@ golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
5972
$(GOLANGCI_LINT): $(LOCALBIN)
6073
$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/v2/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))
6174

75+
.PHONY: gotestsum
76+
gotestsum: $(GOTESTSUM) ## Download gotestsum locally if necessary.
77+
$(GOTESTSUM): $(LOCALBIN)
78+
$(call go-install-tool,$(GOTESTSUM),gotest.tools/gotestsum,$(GOTESTSUM_VERSION))
79+
6280
# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
6381
# $1 - target path with name of binary
6482
# $2 - package url which can be installed

api/v1alpha1/reservation_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ type CommittedResourceReservationSpec struct {
5454
// +kubebuilder:validation:Optional
5555
ResourceName string `json:"resourceName,omitempty"`
5656

57+
// CommitmentUUID is the UUID of the commitment that this reservation corresponds to.
58+
// +kubebuilder:validation:Optional
59+
CommitmentUUID string `json:"commitmentUUID,omitempty"`
60+
5761
// ResourceGroup is the group/category of the resource (e.g., flavor group for Nova)
5862
// +kubebuilder:validation:Optional
5963
ResourceGroup string `json:"resourceGroup,omitempty"`

docs/develop.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,21 @@ Cortex is developed using the Go programming language. To get started with the d
3434

3535
Run `make` in your terminal from the cortex root directory to perform linting and testing tasks.
3636

37+
### Working on Tests
38+
39+
```bash
40+
# Watch mode for continuous testing; print logs for failed tests only
41+
make testsum WATCH=1
42+
```
43+
44+
The `testsum` target provides cleaner output by showing only full verbose output for failing tests.
45+
46+
**Available options:**
47+
- `WATCH=1` - Automatically re-run tests when files change
48+
- `RUN=<pattern>` - Run specific tests matching the pattern
49+
- `PACKAGE=<pkg>` - Test specific package(s)
50+
- `FORMAT=<fmt>` - Change output format (e.g., `standard-verbose` for verbose output on all tests)
51+
3752
## Helm Charts
3853

3954
Helm charts bundle the application into a package, containing all the [Kubernetes](https://kubernetes.io/docs/tutorials/hello-minikube/) resources needed to run the application. The configuration for the application is specified in the [Helm `values.yaml`](cortex.secrets.example.yaml).

helm/library/cortex/files/crds/cortex.cloud_reservations.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ spec:
8787
Key: Workload UUID (VM UUID for Nova, Pod UID for Pods, Machine UID for IronCore, etc.)
8888
Value: allocation state and metadata
8989
type: object
90+
commitmentUUID:
91+
description: CommitmentUUID is the UUID of the commitment that
92+
this reservation corresponds to.
93+
type: string
9094
creator:
9195
description: |-
9296
Creator identifies the system or component that created this reservation.

internal/scheduling/reservations/commitments/api.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,26 @@ import (
1414
// HTTPAPI implements Limes LIQUID commitment validation endpoints.
1515
type HTTPAPI struct {
1616
client client.Client
17+
config Config
1718
// Mutex to serialize change-commitments requests
1819
changeMutex sync.Mutex
1920
}
2021

2122
func NewAPI(client client.Client) *HTTPAPI {
23+
return NewAPIWithConfig(client, DefaultConfig())
24+
}
25+
26+
func NewAPIWithConfig(client client.Client, config Config) *HTTPAPI {
2227
return &HTTPAPI{
2328
client: client,
29+
config: config,
2430
}
2531
}
2632

2733
func (api *HTTPAPI) Init(mux *http.ServeMux) {
28-
mux.HandleFunc("/v1/change-commitments", api.HandleChangeCommitments)
34+
mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
2935
// mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity)
30-
mux.HandleFunc("/v1/info", api.HandleInfo)
36+
mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
3137
}
3238

3339
var commitmentApiLog = ctrl.Log.WithName("commitment_api")

internal/scheduling/reservations/commitments/api_change_commitments.go

Lines changed: 77 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -9,27 +9,32 @@ import (
99
"errors"
1010
"fmt"
1111
"net/http"
12+
"sort"
13+
"strings"
1214
"time"
1315

1416
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
1517
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
1618
"github.com/go-logr/logr"
1719
. "github.com/majewsky/gg/option"
1820
"github.com/sapcc/go-api-declarations/liquid"
19-
apierrors "k8s.io/apimachinery/pkg/api/errors"
2021
"k8s.io/apimachinery/pkg/api/meta"
2122
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2223
"k8s.io/apimachinery/pkg/types"
2324
"sigs.k8s.io/controller-runtime/pkg/client"
2425
)
2526

26-
const (
27-
// watchTimeout is how long to wait for all reservations to become ready
28-
watchTimeout = 20 * time.Second
29-
30-
// pollInterval is how frequently to poll reservation status
31-
pollInterval = 1 * time.Second
32-
)
27+
// sortedKeys returns map keys sorted alphabetically for deterministic iteration.
28+
func sortedKeys[K ~string, V any](m map[K]V) []K {
29+
keys := make([]K, 0, len(m))
30+
for k := range m {
31+
keys = append(keys, k)
32+
}
33+
sort.Slice(keys, func(i, j int) bool {
34+
return string(keys[i]) < string(keys[j])
35+
})
36+
return keys
37+
}
3338

3439
// implements POST /v1/change-commitments from Limes LIQUID API:
3540
// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
@@ -99,6 +104,7 @@ func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Log
99104
ctx := context.Background()
100105
manager := NewReservationManager(api.client)
101106
requireRollback := false
107+
failedCommitments := make(map[string]string) // commitmentUUID to reason for failure, for better response messages in case of rollback
102108
log.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject))
103109

104110
knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client}
@@ -135,8 +141,10 @@ func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Log
135141
}
136142

137143
ProcessLoop:
138-
for projectID, projectChanges := range req.ByProject {
139-
for resourceName, resourceChanges := range projectChanges.ByResource {
144+
for _, projectID := range sortedKeys(req.ByProject) {
145+
projectChanges := req.ByProject[projectID]
146+
for _, resourceName := range sortedKeys(projectChanges.ByResource) {
147+
resourceChanges := projectChanges.ByResource[resourceName]
140148
// Validate resource name pattern (instances_group_*)
141149
flavorGroupName, err := getFlavorGroupNameFromResource(string(resourceName))
142150
if err != nil {
@@ -157,14 +165,16 @@ ProcessLoop:
157165
// Additional per-commitment validation if needed
158166
log.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none"))
159167

168+
// TODO add configurable upper limit validation for commitment size (number of instances) to prevent excessive reservation creation
160169
// TODO add domain
161170

162171
// List all committed resource reservations, then filter by name prefix
163172
var all_reservations v1alpha1.ReservationList
164173
if err := api.client.List(ctx, &all_reservations, client.MatchingLabels{
165174
v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
166175
}); err != nil {
167-
resp.RejectionReason = fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err)
176+
failedCommitments[string(commitment.UUID)] = "failed to list reservations"
177+
log.Info(fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err))
168178
requireRollback = true
169179
break ProcessLoop
170180
}
@@ -189,7 +199,8 @@ ProcessLoop:
189199
} else {
190200
stateBefore, err = FromReservations(existing_reservations.Items)
191201
if err != nil {
192-
resp.RejectionReason = fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err)
202+
failedCommitments[string(commitment.UUID)] = "failed to parse existing commitment reservations"
203+
log.Info(fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err))
193204
requireRollback = true
194205
break ProcessLoop
195206
}
@@ -199,7 +210,8 @@ ProcessLoop:
199210
// get desired state
200211
stateDesired, err := FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ))
201212
if err != nil {
202-
resp.RejectionReason = fmt.Sprintf("failed to get desired state for commitment %s: %v", commitment.UUID, err)
213+
failedCommitments[string(commitment.UUID)] = "failed to determine desired commitment state"
214+
log.Info(fmt.Sprintf("failed to get desired state for commitment %s: %v", commitment.UUID, err))
203215
requireRollback = true
204216
break ProcessLoop
205217
}
@@ -208,7 +220,8 @@ ProcessLoop:
208220

209221
touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, log, stateDesired, flavorGroups, "changeCommitmentsApi")
210222
if err != nil {
211-
resp.RejectionReason = fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err)
223+
failedCommitments[string(commitment.UUID)] = "failed to apply commitment state"
224+
log.Info(fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err))
212225
requireRollback = true
213226
break ProcessLoop
214227
}
@@ -224,17 +237,34 @@ ProcessLoop:
224237

225238
time_start := time.Now()
226239

227-
if err := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, watchTimeout); err != nil {
240+
if failedReservations, errors := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, api.config.ChangeAPIWatchReservationsTimeout, api.config.ChangeAPIWatchReservationsPollInterval); len(failedReservations) > 0 || len(errors) > 0 {
228241
log.Info("reservations failed to become ready, initiating rollback",
229-
"reason", err.Error())
230-
resp.RejectionReason = fmt.Sprintf("Not all reservations can be fulfilled: %v", err)
242+
"failedReservations", len(failedReservations),
243+
"errors", errors)
244+
245+
for _, res := range failedReservations {
246+
failedCommitments[res.Spec.CommittedResourceReservation.CommitmentUUID] = "not sufficient capacity"
247+
}
248+
if len(failedReservations) == 0 {
249+
resp.RejectionReason += "timeout reached while processing commitment changes"
250+
}
231251
requireRollback = true
232252
}
233253

234254
log.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds())
235255
}
236256

237257
if requireRollback {
258+
// Build rejection reason from failed commitments
259+
if len(failedCommitments) > 0 {
260+
var reasonBuilder strings.Builder
261+
reasonBuilder.WriteString(fmt.Sprintf("%d commitment(s) failed to apply: ", len(failedCommitments)))
262+
for commitmentUUID, reason := range failedCommitments {
263+
reasonBuilder.WriteString(fmt.Sprintf("\n- commitment %s: %s", commitmentUUID, reason))
264+
}
265+
resp.RejectionReason = reasonBuilder.String()
266+
}
267+
238268
log.Info("rollback of commitment changes")
239269
for commitmentUUID, state := range statesBefore {
240270
// Rollback to statesBefore for this commitment
@@ -247,16 +277,10 @@ ProcessLoop:
247277
}
248278

249279
log.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason)
250-
251-
// TODO improve human-readable reasoning based on actual failure, i.e. polish resp.RejectionReason
252280
return nil
253281
}
254282

255283
log.Info("commitment changes accepted")
256-
if resp.RejectionReason != "" {
257-
log.Info("unexpected non-empty rejection reason without rollback", "reason", resp.RejectionReason)
258-
resp.RejectionReason = ""
259-
}
260284
return nil
261285
}
262286

@@ -267,23 +291,28 @@ func watchReservationsUntilReady(
267291
k8sClient client.Client,
268292
reservations []v1alpha1.Reservation,
269293
timeout time.Duration,
270-
) error {
294+
pollInterval time.Duration,
295+
) (failedReservations []v1alpha1.Reservation, errors []error) {
271296

272297
if len(reservations) == 0 {
273-
return nil
298+
return failedReservations, nil
274299
}
275300

276301
deadline := time.Now().Add(timeout)
277302

303+
reservationsToWatch := make([]v1alpha1.Reservation, len(reservations))
304+
copy(reservationsToWatch, reservations)
305+
278306
for {
307+
var stillWaiting []v1alpha1.Reservation
279308
if time.Now().After(deadline) {
280-
return fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout)
309+
errors = append(errors, fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout))
310+
return failedReservations, errors
281311
}
282312

283-
allReady := true
284-
var notReadyReasons []string
313+
allChecked := true
285314

286-
for _, res := range reservations {
315+
for _, res := range reservationsToWatch {
287316
// Fetch current state
288317
var current v1alpha1.Reservation
289318
nn := types.NamespacedName{
@@ -292,12 +321,11 @@ func watchReservationsUntilReady(
292321
}
293322

294323
if err := k8sClient.Get(ctx, nn, &current); err != nil {
295-
if apierrors.IsNotFound(err) {
296-
// Reservation is still in process of being created
297-
allReady = false
298-
continue
299-
}
300-
return fmt.Errorf("failed to get reservation %s: %w", res.Name, err)
324+
allChecked = false
325+
// Reservation is still in process of being created, or there is a transient error, continue waiting for it
326+
log.V(1).Info("transient error getting reservation, will retry", "reservation", res.Name, "error", err)
327+
stillWaiting = append(stillWaiting, res)
328+
continue
301329
}
302330

303331
// Check Ready condition
@@ -308,37 +336,33 @@ func watchReservationsUntilReady(
308336

309337
if readyCond == nil {
310338
// Condition not set yet, keep waiting
311-
allReady = false
312-
notReadyReasons = append(notReadyReasons,
313-
res.Name+": condition not set")
339+
allChecked = false
340+
stillWaiting = append(stillWaiting, res)
314341
continue
315342
}
316343

317344
switch readyCond.Status {
318345
case metav1.ConditionTrue:
319-
// This reservation is ready
320-
continue
346+
// TODO use more than readyCondition
321347
case metav1.ConditionFalse:
322-
// Explicit failure - stop immediately
323-
return fmt.Errorf("reservation %s failed: %s (reason: %s)",
324-
res.Name, readyCond.Message, readyCond.Reason)
348+
allChecked = false
349+
failedReservations = append(failedReservations, res)
325350
case metav1.ConditionUnknown:
326-
// Still processing
327-
allReady = false
328-
notReadyReasons = append(notReadyReasons,
329-
fmt.Sprintf("%s: %s", res.Name, readyCond.Message))
351+
allChecked = false
352+
stillWaiting = append(stillWaiting, res)
330353
}
331354
}
332355

333-
if allReady {
334-
log.Info("all reservations are ready",
335-
"count", len(reservations))
336-
return nil
356+
if allChecked || len(stillWaiting) == 0 {
357+
log.Info("all reservations checked",
358+
"failed", len(failedReservations))
359+
return failedReservations, errors
337360
}
338361

362+
reservationsToWatch = stillWaiting
339363
// Log progress
340364
log.Info("waiting for reservations to become ready",
341-
"notReady", len(notReadyReasons),
365+
"notReady", len(reservationsToWatch),
342366
"total", len(reservations),
343367
"timeRemaining", time.Until(deadline).Round(time.Second))
344368

@@ -347,7 +371,7 @@ func watchReservationsUntilReady(
347371
case <-time.After(pollInterval):
348372
// Continue polling
349373
case <-ctx.Done():
350-
return fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err())
374+
return failedReservations, append(errors, fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err()))
351375
}
352376
}
353377
}

0 commit comments

Comments
 (0)