From 84f871216a6f077f4d59976ad105fcd0ae44eff6 Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 19 Mar 2026 08:03:58 +0100 Subject: [PATCH 1/3] commitment package updated: - commitment uuid test - reservation deletion respects allocations and targetHost - fix pipeline - set time correctly - refactor package and controller - refactor logging --- cmd/main.go | 11 +- .../cortex-nova/templates/pipelines_kvm.yaml | 253 ++++++++++++ helm/bundles/cortex-nova/values.yaml | 6 + .../commitments/api_change_commitments.go | 100 +++-- .../api_change_commitments_test.go | 117 +++++- .../reservations/commitments/config.go | 56 ++- .../reservations/commitments/context.go | 25 ++ .../{controller => commitments}/controller.go | 296 +++++++------- .../controller_test.go | 44 +- .../commitments/reservation_manager.go | 18 +- .../commitments/reservation_manager_test.go | 387 ++++++++++++++---- .../reservations/commitments/state.go | 28 +- .../reservations/{controller => }/monitor.go | 4 +- .../{controller => }/monitor_test.go | 16 +- 14 files changed, 1027 insertions(+), 334 deletions(-) create mode 100644 internal/scheduling/reservations/commitments/context.go rename internal/scheduling/reservations/{controller => commitments}/controller.go (61%) rename internal/scheduling/reservations/{controller => commitments}/controller_test.go (86%) rename internal/scheduling/reservations/{controller => }/monitor.go (97%) rename internal/scheduling/reservations/{controller => }/monitor_test.go (97%) diff --git a/cmd/main.go b/cmd/main.go index f8188f93a..5b04c251d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -51,7 +51,6 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/pods" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" - reservationscontroller "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/controller" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/monitoring" @@ -487,16 +486,16 @@ func main() { } if slices.Contains(mainConfig.EnabledControllers, "reservations-controller") { setupLog.Info("enabling controller", "controller", "reservations-controller") - monitor := reservationscontroller.NewControllerMonitor(multiclusterClient) + monitor := reservations.NewMonitor(multiclusterClient) metrics.Registry.MustRegister(&monitor) - reservationsControllerConfig := conf.GetConfigOrDie[reservationscontroller.Config]() + commitmentsConfig := conf.GetConfigOrDie[commitments.Config]() - if err := (&reservationscontroller.ReservationReconciler{ + if err := (&commitments.CommitmentReservationController{ Client: multiclusterClient, Scheme: mgr.GetScheme(), - Conf: reservationsControllerConfig, + Conf: commitmentsConfig, }).SetupWithManager(mgr, multiclusterClient); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Reservation") + setupLog.Error(err, "unable to create controller", "controller", "CommitmentReservation") os.Exit(1) } } diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 5e5190ab3..934cc6dcb 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -664,4 +664,257 @@ spec: requested by the nova flavor extra specs, like `{"arch": "x86_64", "maxphysaddr:bits": 46, ...}`. weighers: [] +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Pipeline +metadata: + name: kvm-committed-resource-reservation-general-purpose +spec: + schedulingDomain: nova + description: | + This pipeline is used for placing committed resource (CR) reservations + for general purpose workloads. It uses the same filtering + as regular VM scheduling to ensure proper placement, considering all + existing VMs, capacity constraints, traits, and other requirements. + + Key difference from regular VM scheduling: reserved capacity is kept + locked (lockReserved: true) to prevent CR reservations from overlapping + with each other, even for the same project. + + This is the pipeline used for KVM hypervisors (qemu and cloud-hypervisor). + Specifically, this pipeline uses load balancing for general purpose workloads. + type: filter-weigher + createDecisions: true + # Fetch all placement candidates, ignoring nova's preselection. + ignorePreselection: true + filters: + - name: filter_host_instructions + description: | + This step will consider the `ignore_hosts` and `force_hosts` instructions + from the nova scheduler request spec to filter out or exclusively allow + certain hosts. + - name: filter_has_enough_capacity + description: | + This step will filter out hosts that do not have enough available capacity + to host the requested flavor. Reserved space is kept locked to avoid + CR reservations overlapping. + params: + - {key: lockReserved, boolValue: true} + - name: filter_has_requested_traits + description: | + This step filters hosts that do not have the requested traits given by the + nova flavor extra spec: "trait:": "forbidden" means the host must + not have the specified trait. "trait:": "required" means the host + must have the specified trait. + - name: filter_has_accelerators + description: | + This step will filter out hosts without the trait `COMPUTE_ACCELERATORS` if + the nova flavor extra specs request accelerators via "accel:device_profile". + - name: filter_correct_az + description: | + This step will filter out hosts whose aggregate information indicates they + are not placed in the requested availability zone. + - name: filter_status_conditions + description: | + This step will filter out hosts for which the hypervisor status conditions + do not meet the expected values, for example, that the hypervisor is ready + and not disabled. + - name: filter_external_customer + description: | + This step prefix-matches the domain name for external customer domains and + filters out hosts that are not intended for external customers. It considers + the `CUSTOM_EXTERNAL_CUSTOMER_SUPPORTED` trait on hosts as well as the + `domain_name` scheduler hint from the nova request spec. + params: + - {key: domainNamePrefixes, stringListValue: ["iaas-"]} + - name: filter_allowed_projects + description: | + This step filters hosts based on allowed projects defined in the + hypervisor resource. Note that hosts allowing all projects are still + accessible and will not be filtered out. In this way some hypervisors + are made accessible to some projects only. + - name: filter_capabilities + description: | + This step will filter out hosts that do not meet the compute capabilities + requested by the nova flavor extra specs, like `{"arch": "x86_64", + "maxphysaddr:bits": 46, ...}`. + + Note: currently, advanced boolean/numeric operators for the capabilities + like `>`, `!`, ... are not supported because they are not used by any of our + flavors in production. + - name: filter_instance_group_affinity + description: | + This step selects hosts in the instance group specified in the nova + scheduler request spec. + - name: filter_instance_group_anti_affinity + description: | + This step selects hosts not in the instance group specified in the nova + scheduler request spec, but only until the max_server_per_host limit is + reached (default = 1). + - name: filter_live_migratable + description: | + This step ensures that the target host of a live migration can accept + the migrating VM, by checking cpu architecture, cpu features, emulated + devices, and cpu modes. + - name: filter_requested_destination + params: {{ .Values.kvm.filterRequestedDestinationParams | toYaml | nindent 8 }} + description: | + This step filters hosts based on the `requested_destination` instruction + from the nova scheduler request spec. It supports filtering by host and + by aggregates. + weighers: + - name: kvm_prefer_smaller_hosts + params: + - {key: resourceWeights, floatMapValue: {"memory": 1.0}} + description: | + This step pulls virtual machines onto smaller hosts (by capacity). This + ensures that larger hosts are not overly fragmented with small VMs, + and can still accommodate larger VMs when they need to be scheduled. + - name: kvm_instance_group_soft_affinity + description: | + This weigher implements the "soft affinity" and "soft anti-affinity" policy + for instance groups in nova. + + It assigns a weight to each host based on how many instances of the same + instance group are already running on that host. The more instances of the + same group on a host, the lower (for soft-anti-affinity) or higher + (for soft-affinity) the weight, which makes it less likely or more likely, + respectively, for the scheduler to choose that host for new instances of + the same group. + - name: kvm_binpack + multiplier: -1.0 # inverted = balancing + params: + - {key: resourceWeights, floatMapValue: {"memory": 1.0}} + description: | + This step implements a balancing weigher for workloads on kvm hypervisors, + which is the opposite of binpacking. Instead of pulling the requested vm + into the smallest gaps possible, it spreads the load to ensure + workloads are balanced across hosts. In this pipeline, the balancing will + focus on general purpose virtual machines. +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Pipeline +metadata: + name: kvm-committed-resource-reservation-hana +spec: + schedulingDomain: nova + description: | + This pipeline is used for placing committed resource (CR) reservations + for HANA workloads. It uses the same comprehensive filtering as regular + VM scheduling to ensure proper placement, considering all existing VMs, + capacity constraints, traits, and other requirements. + + Key difference from regular VM scheduling: reserved capacity is kept + locked (lockReserved: true) to prevent CR reservations from overlapping + with each other, even for the same project. + + This is the pipeline used for KVM hypervisors (qemu and cloud-hypervisor). + Specifically, this pipeline uses bin-packing for HANA workloads to consolidate + them on fewer hosts, leaving larger hosts available for future large VMs. + type: filter-weigher + createDecisions: true + # Fetch all placement candidates, ignoring nova's preselection. + ignorePreselection: true + filters: + - name: filter_host_instructions + description: | + This step will consider the `ignore_hosts` and `force_hosts` instructions + from the nova scheduler request spec to filter out or exclusively allow + certain hosts. + - name: filter_has_enough_capacity + description: | + This step will filter out hosts that do not have enough available capacity + to host the requested flavor. Reserved space is kept locked to avoid + CR reservations overlapping. + params: + - {key: lockReserved, boolValue: true} + - name: filter_has_requested_traits + description: | + This step filters hosts that do not have the requested traits given by the + nova flavor extra spec: "trait:": "forbidden" means the host must + not have the specified trait. "trait:": "required" means the host + must have the specified trait. + - name: filter_has_accelerators + description: | + This step will filter out hosts without the trait `COMPUTE_ACCELERATORS` if + the nova flavor extra specs request accelerators via "accel:device_profile". + - name: filter_correct_az + description: | + This step will filter out hosts whose aggregate information indicates they + are not placed in the requested availability zone. + - name: filter_status_conditions + description: | + This step will filter out hosts for which the hypervisor status conditions + do not meet the expected values, for example, that the hypervisor is ready + and not disabled. + - name: filter_external_customer + description: | + This step prefix-matches the domain name for external customer domains and + filters out hosts that are not intended for external customers. It considers + the `CUSTOM_EXTERNAL_CUSTOMER_SUPPORTED` trait on hosts as well as the + `domain_name` scheduler hint from the nova request spec. + params: + - {key: domainNamePrefixes, stringListValue: ["iaas-"]} + - name: filter_allowed_projects + description: | + This step filters hosts based on allowed projects defined in the + hypervisor resource. Note that hosts allowing all projects are still + accessible and will not be filtered out. In this way some hypervisors + are made accessible to some projects only. + - name: filter_capabilities + description: | + This step will filter out hosts that do not meet the compute capabilities + requested by the nova flavor extra specs, like `{"arch": "x86_64", + "maxphysaddr:bits": 46, ...}`. + + Note: currently, advanced boolean/numeric operators for the capabilities + like `>`, `!`, ... are not supported because they are not used by any of our + flavors in production. + - name: filter_instance_group_affinity + description: | + This step selects hosts in the instance group specified in the nova + scheduler request spec. + - name: filter_instance_group_anti_affinity + description: | + This step selects hosts not in the instance group specified in the nova + scheduler request spec, but only until the max_server_per_host limit is + reached (default = 1). + - name: filter_live_migratable + description: | + This step ensures that the target host of a live migration can accept + the migrating VM, by checking cpu architecture, cpu features, emulated + devices, and cpu modes. + - name: filter_requested_destination + params: {{ .Values.kvm.filterRequestedDestinationParams | toYaml | nindent 8 }} + description: | + This step filters hosts based on the `requested_destination` instruction + from the nova scheduler request spec. It supports filtering by host and + by aggregates. + weighers: + - name: kvm_prefer_smaller_hosts + params: + - {key: resourceWeights, floatMapValue: {"memory": 1.0}} + description: | + This step pulls virtual machines onto smaller hosts (by capacity). This + ensures that larger hosts are not overly fragmented with small VMs, + and can still accommodate larger VMs when they need to be scheduled. + - name: kvm_instance_group_soft_affinity + description: | + This weigher implements the "soft affinity" and "soft anti-affinity" policy + for instance groups in nova. + + It assigns a weight to each host based on how many instances of the same + instance group are already running on that host. The more instances of the + same group on a host, the lower (for soft-anti-affinity) or higher + (for soft-affinity) the weight, which makes it less likely or more likely, + respectively, for the scheduler to choose that host for new instances of + the same group. + - name: kvm_binpack + params: + - {key: resourceWeights, floatMapValue: {"memory": 1.0}} + description: | + This step implements a binpacking weigher for workloads on kvm hypervisors. + It pulls the requested vm into the smallest gaps possible, to ensure + other hosts with less allocation stay free for bigger vms. + In this pipeline, the binpacking will focus on hana virtual machines. {{- end }} diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index b89045619..4be224d36 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -141,6 +141,12 @@ cortex-scheduling-controllers: # Endpoints configuration for reservations controller endpoints: novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" + # FlavorGroupPipelines maps flavor group IDs to pipeline names for CR reservations + # This allows different scheduling strategies per flavor group (e.g., HANA vs GP) + flavorGroupPipelines: + "2152": "kvm-committed-resource-reservation-hana" # HANA flavor group + "2101": "kvm-committed-resource-reservation-general-purpose" # General Purpose flavor group + "*": "kvm-committed-resource-reservation-general-purpose" # Catch-all fallback # OvercommitMappings is a list of mappings that map hypervisor traits to # overcommit ratios. Note that this list is applied in order, so if there # are multiple mappings applying to the same hypervisors, the last mapping diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go index 1c6276ade..d814f91e4 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -16,14 +16,18 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" + "github.com/google/uuid" . "github.com/majewsky/gg/option" "github.com/sapcc/go-api-declarations/liquid" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) +var apiLog = ctrl.Log.WithName("commitment-reservation-api") + // sortedKeys returns map keys sorted alphabetically for deterministic iteration. func sortedKeys[K ~string, V any](m map[K]V) []K { keys := make([]K, 0, len(m)) @@ -43,6 +47,12 @@ func sortedKeys[K ~string, V any](m map[K]V) []K { // This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle. // A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back. func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) { + // Check if API is enabled + if !api.config.EnableChangeCommitmentsAPI { + http.Error(w, "change-commitments API is disabled", http.StatusServiceUnavailable) + return + } + // Serialize all change-commitments requests api.changeMutex.Lock() defer api.changeMutex.Unlock() @@ -50,9 +60,10 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque // Extract or generate request ID for tracing requestID := r.Header.Get("X-Request-ID") if requestID == "" { - requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + requestID = uuid.New().String() } - log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/change-commitments") + ctx := reservations.WithGlobalRequestID(context.Background(), requestID) + logger := APILoggerFromContext(ctx).WithValues("endpoint", "/v1/change-commitments") // Only accept POST method if r.Method != http.MethodPost { @@ -63,12 +74,12 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque // Parse request body var req liquid.CommitmentChangeRequest if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - log.Error(err, "invalid request body") + logger.Error(err, "invalid request body") http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest) return } - log.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ) + logger.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ) // Initialize response resp := liquid.CommitmentChangeResponse{} @@ -76,7 +87,7 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque // Check for dry run -> early reject, not supported yet if req.DryRun { resp.RejectionReason = "Dry run not supported yet" - log.Info("rejecting dry run request") + logger.Info("rejecting dry run request") w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) if err := json.NewEncoder(w).Encode(resp); err != nil { @@ -87,7 +98,7 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque // Process commitment changes // For now, we'll implement a simplified path that checks capacity for immediate start CRs - if err := api.processCommitmentChanges(w, log, req, &resp); err != nil { + if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil { // Error already written to response by processCommitmentChanges return } @@ -100,17 +111,16 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque } } -func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Logger, req liquid.CommitmentChangeRequest, resp *liquid.CommitmentChangeResponse) error { - ctx := context.Background() +func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.ResponseWriter, logger logr.Logger, req liquid.CommitmentChangeRequest, resp *liquid.CommitmentChangeResponse) error { manager := NewReservationManager(api.client) requireRollback := false failedCommitments := make(map[string]string) // commitmentUUID to reason for failure, for better response messages in case of rollback - log.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject)) + logger.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject)) knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) if err != nil { - log.Info("failed to get flavor groups from knowledge extractor", "error", err) + logger.Info("failed to get flavor groups from knowledge extractor", "error", err) resp.RejectionReason = "caches not ready" retryTime := time.Now().Add(1 * time.Minute) resp.RetryAt = Some(retryTime) @@ -124,7 +134,7 @@ func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Log } if req.InfoVersion != currentVersion { - log.Info("version mismatch in commitment change request", + logger.Info("version mismatch in commitment change request", "requestVersion", req.InfoVersion, "currentVersion", currentVersion) http.Error(w, fmt.Sprintf("Version mismatch: request version %d, current version %d. Please refresh and retry.", @@ -163,7 +173,7 @@ ProcessLoop: for _, commitment := range resourceChanges.Commitments { // Additional per-commitment validation if needed - log.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) + logger.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) // TODO add configurable upper limit validation for commitment size (number of instances) to prevent excessive reservation creation // TODO add domain @@ -174,7 +184,7 @@ ProcessLoop: v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, }); err != nil { failedCommitments[string(commitment.UUID)] = "failed to list reservations" - log.Info(fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err)) + logger.Info("failed to list reservations for commitment", "commitmentUUID", commitment.UUID, "error", err) requireRollback = true break ProcessLoop } @@ -200,7 +210,7 @@ ProcessLoop: stateBefore, err = FromReservations(existing_reservations.Items) if err != nil { failedCommitments[string(commitment.UUID)] = "failed to parse existing commitment reservations" - log.Info(fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err)) + logger.Info("failed to get existing state for commitment", "commitmentUUID", commitment.UUID, "error", err) requireRollback = true break ProcessLoop } @@ -210,22 +220,22 @@ ProcessLoop: // get desired state stateDesired, err := FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ)) if err != nil { - failedCommitments[string(commitment.UUID)] = "failed to determine desired commitment state" - log.Info(fmt.Sprintf("failed to get desired state for commitment %s: %v", commitment.UUID, err)) + failedCommitments[string(commitment.UUID)] = err.Error() + logger.Info("failed to get desired state for commitment", "commitmentUUID", commitment.UUID, "error", err) requireRollback = true break ProcessLoop } - log.Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldState", stateBefore, "desiredState", stateDesired) + logger.Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldState", stateBefore, "desiredState", stateDesired) - touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, log, stateDesired, flavorGroups, "changeCommitmentsApi") + touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") if err != nil { failedCommitments[string(commitment.UUID)] = "failed to apply commitment state" - log.Info(fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err)) + logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err) requireRollback = true break ProcessLoop } - log.Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) + logger.Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) reservationsToWatch = append(reservationsToWatch, touchedReservations...) } } @@ -233,12 +243,12 @@ ProcessLoop: // TODO make the rollback defer safe if !requireRollback { - log.Info("applied commitment changes, now watching for reservation readiness", "reservationsToWatch", len(reservationsToWatch)) + logger.Info("applied commitment changes, now watching for reservation readiness", "reservationsToWatch", len(reservationsToWatch)) time_start := time.Now() - if failedReservations, errors := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, api.config.ChangeAPIWatchReservationsTimeout, api.config.ChangeAPIWatchReservationsPollInterval); len(failedReservations) > 0 || len(errors) > 0 { - log.Info("reservations failed to become ready, initiating rollback", + if failedReservations, errors := watchReservationsUntilReady(ctx, logger, api.client, reservationsToWatch, api.config.ChangeAPIWatchReservationsTimeout, api.config.ChangeAPIWatchReservationsPollInterval); len(failedReservations) > 0 || len(errors) > 0 { + logger.Info("reservations failed to become ready, initiating rollback", "failedReservations", len(failedReservations), "errors", errors) @@ -251,7 +261,7 @@ ProcessLoop: requireRollback = true } - log.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds()) + logger.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds()) } if requireRollback { @@ -265,29 +275,29 @@ ProcessLoop: resp.RejectionReason = reasonBuilder.String() } - log.Info("rollback of commitment changes") + logger.Info("rollback of commitment changes") for commitmentUUID, state := range statesBefore { // Rollback to statesBefore for this commitment - log.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) - _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, "changeCommitmentsApiRollback") + logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) + _, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") if err != nil { - log.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) + logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) // continue with best effort rollback for other projects } } - log.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason) + logger.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason) return nil } - log.Info("commitment changes accepted") + logger.Info("commitment changes accepted") return nil } // watchReservationsUntilReady polls until all reservations reach Ready=True or timeout. func watchReservationsUntilReady( ctx context.Context, - log logr.Logger, + logger logr.Logger, k8sClient client.Client, reservations []v1alpha1.Reservation, timeout time.Duration, @@ -310,7 +320,7 @@ func watchReservationsUntilReady( return failedReservations, errors } - allChecked := true + allAreReady := true for _, res := range reservationsToWatch { // Fetch current state @@ -321,9 +331,9 @@ func watchReservationsUntilReady( } if err := k8sClient.Get(ctx, nn, ¤t); err != nil { - allChecked = false + allAreReady = false // Reservation is still in process of being created, or there is a transient error, continue waiting for it - log.V(1).Info("transient error getting reservation, will retry", "reservation", res.Name, "error", err) + logger.V(1).Info("transient error getting reservation, will retry", "reservation", res.Name, "error", err) stillWaiting = append(stillWaiting, res) continue } @@ -336,32 +346,40 @@ func watchReservationsUntilReady( if readyCond == nil { // Condition not set yet, keep waiting - allChecked = false + allAreReady = false stillWaiting = append(stillWaiting, res) continue } switch readyCond.Status { case metav1.ConditionTrue: - // TODO use more than readyCondition + // check if host is not set in spec or status: if so, no capacity left to schedule the reservation + if current.Spec.TargetHost == "" || current.Status.Host == "" { + allAreReady = false + failedReservations = append(failedReservations, current) + logger.Info("insufficient capacity for reservation", "reservation", current.Name, "reason", readyCond.Reason, "message", readyCond.Message, "targetHostInSpec", current.Spec.TargetHost, "hostInStatus", current.Status.Host) + } else { + // Reservation is successfully scheduled, no further action needed + logger.Info("reservation ready", "reservation", current.Name, "host", current.Spec.TargetHost) + } + case metav1.ConditionFalse: - allChecked = false failedReservations = append(failedReservations, res) case metav1.ConditionUnknown: - allChecked = false + allAreReady = false stillWaiting = append(stillWaiting, res) } } - if allChecked || len(stillWaiting) == 0 { - log.Info("all reservations checked", + if allAreReady || len(stillWaiting) == 0 { + logger.Info("all reservations checked", "failed", len(failedReservations)) return failedReservations, errors } reservationsToWatch = stillWaiting // Log progress - log.Info("waiting for reservations to become ready", + logger.V(1).Info("waiting for reservations to become ready", "notReady", len(reservationsToWatch), "total", len(reservations), "timeRemaining", time.Until(deadline).Round(time.Second)) diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go index 871e72b54..06b792c4a 100644 --- a/internal/scheduling/reservations/commitments/api_change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -68,9 +68,33 @@ func TestCommitmentChangeIntegration(t *testing.T) { ExistingReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("ram_hana_1", "project-A", "uuid-456", "confirmed", 3)), AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, - ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "", Flavor: m1Small, ProjectID: "project-A"}}, + ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), }, + { + Name: "Invalid CR name - too long", + VMs: []*TestVM{}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", strings.Repeat("long-", 13), "confirmed", 3), + ), + AvailableResources: &AvailableResources{}, + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment long-long-long-long-long-long-long-long-long-long-long-long-long-: unexpected commitment format"), + }, + { + Name: "Invalid CR name - spaces", + VMs: []*TestVM{}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid with space", "confirmed", 3), + ), + AvailableResources: &AvailableResources{}, + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid with space: unexpected commitment format"), + }, { Name: "Swap capacity between CRs - order dependent - delete-first succeeds", Flavors: []*TestFlavor{m1Small}, @@ -404,6 +428,22 @@ func TestCommitmentChangeIntegration(t *testing.T) { }, EnvInfoVersion: -1, // Skip Knowledge CRD creation }, + { + Name: "API disabled - returns 503 Service Unavailable", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "uuid-disabled", "confirmed", 2), + ), + CustomConfig: func() *Config { + cfg := DefaultConfig() + cfg.EnableChangeCommitmentsAPI = false + return &cfg + }(), + ExpectedReservations: []*TestReservation{}, + ExpectedAPIResponse: APIResponseExpectation{ + StatusCode: 503, + }, + }, { Name: "Multiple commitments insufficient capacity - all listed in error", // Tests that multiple failed commitments are all mentioned in the rejection reason @@ -417,6 +457,30 @@ func TestCommitmentChangeIntegration(t *testing.T) { ExpectedReservations: []*TestReservation{}, ExpectedAPIResponse: newAPIResponse("2 commitment(s) failed", "commitment uuid-multi-fail-1: not sufficient capacity", "commitment uuid-multi-fail-2: not sufficient capacity"), }, + { + Name: "Deletion priority during rollback - unscheduled removed first", + // Tests that during rollback, unscheduled reservations (no TargetHost) are deleted first, + // preserving scheduled reservations (with TargetHost), especially those with VM allocations + VMs: []*TestVM{{UUID: "vm-priority", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Flavors: []*TestFlavor{m1Small}, + ExistingReservations: []*TestReservation{ + // Reservation with VM allocation - should be preserved (lowest deletion priority) + {CommitmentID: "commitment-1", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-priority"}}, + // Scheduled but unused - medium deletion priority + {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("ram_hana_1", "project-A", "commitment-1", "confirmed", 4), + ), + AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 1024}}, + ExpectedReservations: []*TestReservation{ + // After rollback, should preserve the scheduled reservations (especially with VMs) + // and remove unscheduled ones first + {CommitmentID: "commitment-1", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-priority"}}, + {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + }, + ExpectedAPIResponse: newAPIResponse("commitment commitment-1: not sufficient capacity"), + }, { Name: "Watch timeout with custom config - triggers rollback with timeout error", Flavors: []*TestFlavor{m1Small}, @@ -424,10 +488,12 @@ func TestCommitmentChangeIntegration(t *testing.T) { createCommitment("ram_hana_1", "project-A", "uuid-timeout", "confirmed", 2), ), // With 0ms timeout, the watch will timeout immediately before reservations become ready - CustomConfig: &Config{ - ChangeAPIWatchReservationsTimeout: 0 * time.Millisecond, - ChangeAPIWatchReservationsPollInterval: 100 * time.Millisecond, - }, + CustomConfig: func() *Config { + cfg := DefaultConfig() + cfg.ChangeAPIWatchReservationsTimeout = 0 * time.Millisecond + cfg.ChangeAPIWatchReservationsPollInterval = 100 * time.Millisecond + return &cfg + }(), ExpectedReservations: []*TestReservation{}, // Rollback removes all reservations ExpectedAPIResponse: newAPIResponse("timeout reached while processing commitment changes"), }, @@ -1121,9 +1187,16 @@ func (env *CommitmentTestEnv) processNewReservation(res *v1alpha1.Reservation) { env.processedReserv[res.Name] = true + if res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.ResourceGroup == "" || res.Spec.Resources == nil || res.Spec.Resources["memory"] == (resource.Quantity{}) { + env.markReservationFailedStatus(res, "invalid reservation spec") + env.T.Logf("✗ Invalid reservation spec for %s: marking as failed (resource group: %s, resources: %v)", res.Name, res.Spec.CommittedResourceReservation.ResourceGroup, res.Spec.Resources) + return + } + // If no available resources configured, accept all reservations without host assignment if env.availableResources == nil { - env.markReservationReady(res) + env.T.Logf("✓ Scheduled reservation %s - no resource tracking, simply accept", res.Name) + env.markReservationSchedulerProcessedStatus(res, "some-host") return } @@ -1167,19 +1240,29 @@ func (env *CommitmentTestEnv) processNewReservation(res *v1alpha1.Reservation) { env.T.Logf("Warning: Failed to update reservation status host: %v", err) } - env.markReservationReady(res) + env.markReservationSchedulerProcessedStatus(res, selectedHost) env.T.Logf("✓ Scheduled reservation %s on %s (%d MB used, %d MB remaining)", res.Name, selectedHost, memoryMB, env.availableResources[selectedHost]) } else { - // FAILURE: No host has enough capacity - env.markReservationFailed(res, "Insufficient capacity on all hosts") + env.markReservationSchedulerProcessedStatus(res, "") env.T.Logf("✗ Failed to schedule reservation %s (needs %d MB, no host has capacity)", res.Name, memoryMB) } } -// markReservationReady updates a reservation to have Ready=True status. -func (env *CommitmentTestEnv) markReservationReady(res *v1alpha1.Reservation) { +// markReservationSchedulerProcessedStatus updates a reservation to have Ready=True status (scheduling can be succeeded or not - depending on host status) +func (env *CommitmentTestEnv) markReservationSchedulerProcessedStatus(res *v1alpha1.Reservation, host string) { + ctx := context.Background() + + // Update spec first + res.Spec.TargetHost = host + if err := env.K8sClient.Update(ctx, res); err != nil { + env.T.Logf("Warning: Failed to update reservation spec: %v", err) + return + } + + // Then update status + res.Status.Host = host res.Status.Conditions = []metav1.Condition{ { Type: v1alpha1.ReservationConditionReady, @@ -1189,20 +1272,18 @@ func (env *CommitmentTestEnv) markReservationReady(res *v1alpha1.Reservation) { LastTransitionTime: metav1.Now(), }, } - - if err := env.K8sClient.Status().Update(context.Background(), res); err != nil { - // Ignore errors - might be deleted during update - return + if err := env.K8sClient.Status().Update(ctx, res); err != nil { + env.T.Logf("Warning: Failed to update reservation status: %v", err) } } -// markReservationFailed updates a reservation to have Ready=False status (scheduling failed). -func (env *CommitmentTestEnv) markReservationFailed(res *v1alpha1.Reservation, reason string) { +// markReservationFailedStatus updates a reservation to have Ready=False status +func (env *CommitmentTestEnv) markReservationFailedStatus(res *v1alpha1.Reservation, reason string) { res.Status.Conditions = []metav1.Condition{ { Type: v1alpha1.ReservationConditionReady, Status: metav1.ConditionFalse, - Reason: "SchedulingFailed", + Reason: "Reservation invalid", Message: reason, LastTransitionTime: metav1.Now(), }, diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index 95dc904d8..94a4b80b1 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -3,20 +3,64 @@ package commitments -import "time" +import ( + "time" + + corev1 "k8s.io/api/core/v1" +) -// Config defines the configuration for the commitments HTTP API. type Config struct { - // how long to wait for reservations to become ready before timing out and rolling back. + + // RequeueIntervalActive is the interval for requeueing active reservations for verification. + RequeueIntervalActive time.Duration `json:"requeueIntervalActive"` + // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready. + RequeueIntervalRetry time.Duration `json:"requeueIntervalRetry"` + // PipelineDefault is the default pipeline used for scheduling committed resource reservations. + PipelineDefault string `json:"pipelineDefault"` + + // The endpoint where to find the nova external scheduler endpoint. + Endpoints EndpointsConfig `json:"endpoints"` + + // Secret ref to SSO credentials stored in a k8s secret, if applicable. + SSOSecretRef *corev1.SecretReference `json:"ssoSecretRef"` + + // Secret ref to keystone credentials stored in a k8s secret. + KeystoneSecretRef corev1.SecretReference `json:"keystoneSecretRef"` + + // Secret ref to the database credentials for querying VM state. + DatabaseSecretRef *corev1.SecretReference `json:"databaseSecretRef,omitempty"` + + // FlavorGroupPipelines maps flavor group names to pipeline names. + // Example: {"2152": "kvm-commitments-hana", "2101": "kvm-commitments-gp", "*": "kvm-commitments-gp"} + // Used to select different scheduling pipelines based on flavor group characteristics. + FlavorGroupPipelines map[string]string `json:"flavorGroupPipelines,omitempty"` + + // API configuration + + // ChangeAPIWatchReservationsTimeout defines how long to wait for reservations to become ready before timing out and rolling back. ChangeAPIWatchReservationsTimeout time.Duration `json:"changeAPIWatchReservationsTimeout"` - // how frequently to poll reservation status during watch. + // ChangeAPIWatchReservationsPollInterval defines how frequently to poll reservation status during watch. ChangeAPIWatchReservationsPollInterval time.Duration `json:"changeAPIWatchReservationsPollInterval"` + + // EnableChangeCommitmentsAPI controls whether the change-commitments API endpoint is active. + // When false, the endpoint will return HTTP 503 Service Unavailable. + // The info endpoint remains available for health checks. + EnableChangeCommitmentsAPI bool `json:"enableChangeCommitmentsAPI"` +} + +type EndpointsConfig struct { + // The nova external scheduler endpoint. + NovaExternalScheduler string `json:"novaExternalScheduler"` } func DefaultConfig() Config { return Config{ - ChangeAPIWatchReservationsTimeout: 2 * time.Second, - ChangeAPIWatchReservationsPollInterval: 100 * time.Millisecond, + RequeueIntervalActive: 5 * time.Minute, + RequeueIntervalRetry: 1 * time.Minute, + PipelineDefault: "kvm-committed-resource-reservation-general-purpose", + ChangeAPIWatchReservationsTimeout: 10 * time.Second, + ChangeAPIWatchReservationsPollInterval: 500 * time.Millisecond, + EnableChangeCommitmentsAPI: true, } } diff --git a/internal/scheduling/reservations/commitments/context.go b/internal/scheduling/reservations/commitments/context.go new file mode 100644 index 000000000..10dc386ac --- /dev/null +++ b/internal/scheduling/reservations/commitments/context.go @@ -0,0 +1,25 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" +) + +func LoggerFromContext(ctx context.Context) logr.Logger { + return commitmentLog.WithValues( + "greq", reservations.GlobalRequestIDFromContext(ctx), + "req", reservations.RequestIDFromContext(ctx), + ) +} + +func APILoggerFromContext(ctx context.Context) logr.Logger { + return apiLog.WithValues( + "greq", reservations.GlobalRequestIDFromContext(ctx), + "req", reservations.RequestIDFromContext(ctx), + ) +} diff --git a/internal/scheduling/reservations/controller/controller.go b/internal/scheduling/reservations/commitments/controller.go similarity index 61% rename from internal/scheduling/reservations/controller/controller.go rename to internal/scheduling/reservations/commitments/controller.go index 20580bdff..611d6e1f6 100644 --- a/internal/scheduling/reservations/controller/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -1,19 +1,15 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package controller +package commitments import ( "context" - "encoding/json" "errors" "fmt" - "net/http" - "strings" "time" "k8s.io/apimachinery/pkg/api/meta" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" @@ -32,39 +28,13 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/cobaltcore-dev/cortex/pkg/multicluster" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/handler" + "github.com/go-logr/logr" ) -const ( - // RequeueIntervalActive is the interval for requeueing active reservations for verification. - RequeueIntervalActive = 5 * time.Minute - // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready. - RequeueIntervalRetry = 1 * time.Minute -) - -// Endpoints for the reservations operator. -type EndpointsConfig struct { - // The nova external scheduler endpoint. - NovaExternalScheduler string `json:"novaExternalScheduler"` -} - -type Config struct { - // The endpoint where to find the nova external scheduler endpoint. - Endpoints EndpointsConfig `json:"endpoints"` +var commitmentLog = ctrl.Log.WithName("commitment-reservation-controller") - // Secret ref to SSO credentials stored in a k8s secret, if applicable. - SSOSecretRef *corev1.SecretReference `json:"ssoSecretRef"` - - // Secret ref to keystone credentials stored in a k8s secret. - KeystoneSecretRef corev1.SecretReference `json:"keystoneSecretRef"` - - // Secret ref to the database credentials for querying VM state. - DatabaseSecretRef *corev1.SecretReference `json:"databaseSecretRef,omitempty"` -} - -// ReservationReconciler reconciles a Reservation object -type ReservationReconciler struct { +// CommitmentReservationController reconciles commitment Reservation objects +type CommitmentReservationController struct { // Client for the kubernetes API. client.Client // Kubernetes scheme to use for the reservations. @@ -73,14 +43,15 @@ type ReservationReconciler struct { Conf Config // Database connection for querying VM state from Knowledge cache. DB *db.DB + // SchedulerClient for making scheduler API calls. + SchedulerClient *reservations.SchedulerClient } // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. -// Note: Failover reservations are filtered out at the watch level by the predicate -// in SetupWithManager, so this function only handles non-failover reservations. -func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - log := logf.FromContext(ctx) +// Note: This controller only handles commitment reservations, as filtered by the predicate. +func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := LoggerFromContext(ctx).WithValues("reservation", req.Name, "namespace", req.Namespace) // Fetch the reservation object. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { @@ -89,16 +60,16 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { - log.Info("reservation is active, verifying allocations", "reservation", req.Name) + logger.Info("reservation is active, verifying allocations") // Verify all allocations in Spec against actual VM state from database if err := r.reconcileAllocations(ctx, &res); err != nil { - log.Error(err, "failed to reconcile allocations") + logger.Error(err, "failed to reconcile allocations") return ctrl.Result{}, err } // Requeue periodically to keep verifying allocations - return ctrl.Result{RequeueAfter: RequeueIntervalActive}, nil + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalActive}, nil } // TODO trigger re-placement of unused reservations over time @@ -108,8 +79,7 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) len(res.Spec.CommittedResourceReservation.Allocations) > 0 && res.Spec.TargetHost != "" { // mark as ready without calling the placement API - log.Info("detected pre-allocated reservation", - "reservation", req.Name, + logger.Info("detected pre-allocated reservation", "targetHost", res.Spec.TargetHost, "allocatedVMs", len(res.Spec.CommittedResourceReservation.Allocations)) @@ -125,14 +95,14 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion if client.IgnoreNotFound(err) != nil { - log.Error(err, "failed to patch pre-allocated reservation status") + logger.Error(err, "failed to patch pre-allocated reservation status") return ctrl.Result{}, err } // Object was deleted, no need to continue return ctrl.Result{}, nil } - log.Info("marked pre-allocated reservation as ready", "reservation", req.Name, "host", res.Status.Host) + logger.Info("marked pre-allocated reservation as ready", "host", res.Status.Host) // Requeue immediately to run verification in next reconcile loop return ctrl.Result{Requeue: true}, nil } @@ -150,13 +120,13 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion if client.IgnoreNotFound(err) != nil { - log.Error(err, "failed to sync spec to status") + logger.Error(err, "failed to sync spec to status") return ctrl.Result{}, err } // Object was deleted, no need to continue return ctrl.Result{}, nil } - log.Info("synced spec to status", "reservation", req.Name, "host", res.Status.Host) + logger.Info("synced spec to status", "host", res.Status.Host) } // filter for CR reservations @@ -165,7 +135,7 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) resourceName = res.Spec.CommittedResourceReservation.ResourceName } if resourceName == "" { - log.Info("reservation has no resource name, skipping", "reservation", req.Name) + logger.Info("reservation has no resource name, skipping") old := res.DeepCopy() meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ Type: v1alpha1.ReservationConditionReady, @@ -177,7 +147,7 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion if client.IgnoreNotFound(err) != nil { - log.Error(err, "failed to patch reservation status") + logger.Error(err, "failed to patch reservation status") return ctrl.Result{}, err } // Object was deleted, no need to continue @@ -186,25 +156,6 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil // Don't need to requeue. } - // Convert resource.Quantity to integers for the API - var memoryMB uint64 - if memory, ok := res.Spec.Resources[hv1.ResourceMemory]; ok { - memoryValue := memory.ScaledValue(resource.Mega) - if memoryValue < 0 { - return ctrl.Result{}, fmt.Errorf("invalid memory value: %d", memoryValue) - } - memoryMB = uint64(memoryValue) - } - - var cpu uint64 - if cpuQuantity, ok := res.Spec.Resources[hv1.ResourceCPU]; ok { - cpuValue := cpuQuantity.ScaledValue(resource.Milli) - if cpuValue < 0 { - return ctrl.Result{}, fmt.Errorf("invalid cpu value: %d", cpuValue) - } - cpu = uint64(cpuValue) - } - // Get project ID from CommittedResourceReservation spec if available. projectID := "" if res.Spec.CommittedResourceReservation != nil { @@ -221,18 +172,21 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) if err != nil { - log.Info("flavor knowledge not ready, requeueing", + logger.Info("flavor knowledge not ready, requeueing", "resourceName", resourceName, "error", err) - return ctrl.Result{RequeueAfter: RequeueIntervalRetry}, nil + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, nil } // Search for the flavor across all flavor groups + // Also capture the flavor group name for pipeline selection var flavorDetails *compute.FlavorInGroup - for _, fg := range flavorGroups { + var flavorGroupName string + for groupName, fg := range flavorGroups { for _, flavor := range fg.Flavors { if flavor.Name == resourceName { flavorDetails = &flavor + flavorGroupName = groupName break } } @@ -243,67 +197,70 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Check if flavor was found if flavorDetails == nil { - log.Error(errors.New("flavor not found"), "flavor not found in any flavor group", + logger.Error(errors.New("flavor not found"), "flavor not found in any flavor group", "resourceName", resourceName) return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil } - // Call the external scheduler delegation API to get a host for the reservation. - // Cortex will fetch candidate hosts and weights itself from its knowledge state. - externalSchedulerRequest := schedulerdelegationapi.ExternalSchedulerRequest{ - Reservation: true, - Spec: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaSpec]{ - Data: schedulerdelegationapi.NovaSpec{ - InstanceUUID: res.Name, - NumInstances: 1, // One for each reservation. - ProjectID: projectID, - AvailabilityZone: availabilityZone, - Flavor: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaFlavor]{ - Data: schedulerdelegationapi.NovaFlavor{ - Name: flavorDetails.Name, - MemoryMB: memoryMB, // take the memory from the reservation spec, not from the flavor - reservation might be bigger - VCPUs: cpu, // take the cpu from the reservation spec, not from the flavor - reservation might be bigger - ExtraSpecs: flavorDetails.ExtraSpecs, - // Disk is currently not considered. - - }, - }, - }, - }, - } - httpClient := http.Client{} - url := r.Conf.Endpoints.NovaExternalScheduler - reqBody, err := json.Marshal(externalSchedulerRequest) - if err != nil { - log.Error(err, "failed to marshal external scheduler request") + // Get hypervisors from the cluster + var hypervisorList hv1.HypervisorList + if err := r.List(ctx, &hypervisorList); err != nil { + logger.Error(err, "failed to list hypervisors") return ctrl.Result{}, err } - response, err := httpClient.Post(url, "application/json", strings.NewReader(string(reqBody))) - if err != nil { - log.Error(err, "failed to send external scheduler request", "url", url) - return ctrl.Result{}, err - } - defer response.Body.Close() - // Check HTTP status code before attempting to decode JSON - if response.StatusCode != http.StatusOK { - err := fmt.Errorf("unexpected HTTP status code: %d", response.StatusCode) - log.Error(err, "external scheduler returned non-OK status", - "url", url, - "statusCode", response.StatusCode, - "status", response.Status) - return ctrl.Result{}, err + // Build list of eligible hosts + eligibleHosts := make([]schedulerdelegationapi.ExternalSchedulerHost, 0, len(hypervisorList.Items)) + for _, hv := range hypervisorList.Items { + eligibleHosts = append(eligibleHosts, schedulerdelegationapi.ExternalSchedulerHost{ + ComputeHost: hv.Name, + }) } - var externalSchedulerResponse schedulerdelegationapi.ExternalSchedulerResponse - if err := json.NewDecoder(response.Body).Decode(&externalSchedulerResponse); err != nil { - log.Error(err, "failed to decode external scheduler response", - "url", url, - "statusCode", response.StatusCode) + if len(eligibleHosts) == 0 { + logger.Info("no hypervisors available for scheduling") + old := res.DeepCopy() + meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionFalse, + Reason: "NoHostsAvailable", + Message: "no hypervisors available for scheduling", + }) + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, &res, patch); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, nil + } + + // Select appropriate pipeline based on flavor group + pipelineName := r.getPipelineForFlavorGroup(flavorGroupName, logger) + logger.Info("selected pipeline for CR reservation", + "flavorName", resourceName, + "flavorGroup", flavorGroupName, + "pipeline", pipelineName) + + // Use the SchedulerClient to schedule the reservation + scheduleReq := reservations.ScheduleReservationRequest{ + InstanceUUID: res.Name, + ProjectID: projectID, + FlavorName: flavorDetails.Name, + FlavorExtraSpecs: flavorDetails.ExtraSpecs, + MemoryMB: flavorDetails.MemoryMB, + VCPUs: flavorDetails.VCPUs, + EligibleHosts: eligibleHosts, + Pipeline: pipelineName, + AvailabilityZone: availabilityZone, + } + + scheduleResp, err := r.SchedulerClient.ScheduleReservation(ctx, scheduleReq) + if err != nil { + logger.Error(err, "failed to schedule reservation") return ctrl.Result{}, err } - if len(externalSchedulerResponse.Hosts) == 0 { - log.Info("no hosts found for reservation", "reservation", req.Name) + + if len(scheduleResp.Hosts) == 0 { + logger.Info("no hosts found for reservation") old := res.DeepCopy() meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ Type: v1alpha1.ReservationConditionReady, @@ -315,7 +272,7 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion if client.IgnoreNotFound(err) != nil { - log.Error(err, "failed to patch reservation status") + logger.Error(err, "failed to patch reservation status") return ctrl.Result{}, err } // Object was deleted, no need to continue @@ -325,9 +282,24 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Update the reservation with the found host (idx 0) - host := externalSchedulerResponse.Hosts[0] - log.Info("found host for reservation", "reservation", req.Name, "host", host) + host := scheduleResp.Hosts[0] + logger.Info("found host for reservation", "host", host) + + // First update Spec old := res.DeepCopy() + res.Spec.TargetHost = host + if err := r.Patch(ctx, &res, client.MergeFrom(old)); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + logger.Error(err, "failed to patch reservation spec") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil + } + + // Then update Status + old = res.DeepCopy() meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ Type: v1alpha1.ReservationConditionReady, Status: metav1.ConditionTrue, @@ -339,7 +311,7 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion if client.IgnoreNotFound(err) != nil { - log.Error(err, "failed to patch reservation status") + logger.Error(err, "failed to patch reservation status") return ctrl.Result{}, err } // Object was deleted, no need to continue @@ -350,8 +322,8 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) // reconcileAllocations verifies all allocations in Spec against actual Nova VM state. // It updates Status.Allocations based on the actual host location of each VM. -func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error { - log := logf.FromContext(ctx) +func (r *CommitmentReservationController) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error { + logger := LoggerFromContext(ctx) // Skip if no CommittedResourceReservation if res.Spec.CommittedResourceReservation == nil { @@ -362,7 +334,7 @@ func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v // Skip if no allocations to verify if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { - log.Info("no allocations to verify", "reservation", res.Name) + logger.Info("no allocations to verify", "reservation", res.Name) return nil } @@ -388,14 +360,14 @@ func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v actualHost := server.OSEXTSRVATTRHost newStatusAllocations[vmUUID] = actualHost - log.Info("verified VM allocation", + logger.Info("verified VM allocation", "vm", vmUUID, "reservation", res.Name, "actualHost", actualHost, "expectedHost", res.Status.Host) } else { // VM not found in database - log.Info("VM not found in database", + logger.Info("VM not found in database", "vm", vmUUID, "reservation", res.Name, "projectID", projectID) @@ -420,7 +392,7 @@ func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v return fmt.Errorf("failed to patch reservation status: %w", err) } - log.Info("reconciled allocations", + logger.Info("reconciled allocations", "reservation", res.Name, "specAllocations", len(res.Spec.CommittedResourceReservation.Allocations), "statusAllocations", len(newStatusAllocations)) @@ -428,8 +400,24 @@ func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v return nil } +// getPipelineForFlavorGroup returns the pipeline name for a given flavor group. +func (r *CommitmentReservationController) getPipelineForFlavorGroup(flavorGroupName string, logger logr.Logger) string { + // Try exact match first (e.g., "2152" -> "kvm-cr-hana") + if pipeline, ok := r.Conf.FlavorGroupPipelines[flavorGroupName]; ok { + return pipeline + } + + // Try wildcard fallback + if pipeline, ok := r.Conf.FlavorGroupPipelines["*"]; ok { + return pipeline + } + + logger.Info("no pipeline configured for flavor group, using default", "flavorGroup", flavorGroupName, "defaultPipeline", r.Conf.PipelineDefault) + return r.Conf.PipelineDefault +} + // Init initializes the reconciler with required clients and DB connection. -func (r *ReservationReconciler) Init(ctx context.Context, client client.Client, conf Config) error { +func (r *CommitmentReservationController) Init(ctx context.Context, client client.Client, conf Config) error { // Initialize database connection if DatabaseSecretRef is provided. if conf.DatabaseSecretRef != nil { var err error @@ -437,13 +425,17 @@ func (r *ReservationReconciler) Init(ctx context.Context, client client.Client, if err != nil { return fmt.Errorf("failed to initialize database connection: %w", err) } - logf.FromContext(ctx).Info("database connection initialized for reservation controller") + logf.FromContext(ctx).Info("database connection initialized for commitment reservation controller") } + // Initialize scheduler client + r.SchedulerClient = reservations.NewSchedulerClient(conf.Endpoints.NovaExternalScheduler) + logf.FromContext(ctx).Info("scheduler client initialized for commitment reservation controller", "url", conf.Endpoints.NovaExternalScheduler) + return nil } -func (r *ReservationReconciler) listServersByProjectID(ctx context.Context, projectID string) (map[string]*nova.Server, error) { +func (r *CommitmentReservationController) listServersByProjectID(ctx context.Context, projectID string) (map[string]*nova.Server, error) { if r.DB == nil { return nil, errors.New("database connection not initialized") } @@ -472,43 +464,42 @@ func (r *ReservationReconciler) listServersByProjectID(ctx context.Context, proj return serverMap, nil } -// notFailoverReservationPredicate filters out failover reservations at the watch level. -// This prevents the controller from being notified about failover reservations, -// which are managed by the separate failover controller. -// Failover reservations are identified by the label v1alpha1.LabelReservationType. -var notFailoverReservationPredicate = predicate.Funcs{ +// commitmentReservationPredicate filters to only watch commitment reservations. +// This controller explicitly handles only commitment reservations (CR reservations), +// while failover reservations are handled by the separate failover controller. +var commitmentReservationPredicate = predicate.Funcs{ CreateFunc: func(e event.CreateEvent) bool { res, ok := e.Object.(*v1alpha1.Reservation) if !ok { return false } - return res.Labels[v1alpha1.LabelReservationType] != v1alpha1.ReservationTypeLabelFailover + return res.Spec.Type == v1alpha1.ReservationTypeCommittedResource }, UpdateFunc: func(e event.UpdateEvent) bool { res, ok := e.ObjectNew.(*v1alpha1.Reservation) if !ok { return false } - return res.Labels[v1alpha1.LabelReservationType] != v1alpha1.ReservationTypeLabelFailover + return res.Spec.Type == v1alpha1.ReservationTypeCommittedResource }, DeleteFunc: func(e event.DeleteEvent) bool { res, ok := e.Object.(*v1alpha1.Reservation) if !ok { return false } - return res.Labels[v1alpha1.LabelReservationType] != v1alpha1.ReservationTypeLabelFailover + return res.Spec.Type == v1alpha1.ReservationTypeCommittedResource }, GenericFunc: func(e event.GenericEvent) bool { res, ok := e.Object.(*v1alpha1.Reservation) if !ok { return false } - return res.Labels[v1alpha1.LabelReservationType] != v1alpha1.ReservationTypeLabelFailover + return res.Spec.Type == v1alpha1.ReservationTypeCommittedResource }, } // SetupWithManager sets up the controller with the Manager. -func (r *ReservationReconciler) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { +func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { if err := r.Init(ctx, mgr.GetClient(), r.Conf); err != nil { return err @@ -517,17 +508,10 @@ func (r *ReservationReconciler) SetupWithManager(mgr ctrl.Manager, mcl *multiclu })); err != nil { return err } - bldr := multicluster.BuildController(mcl, mgr) - // Watch reservation changes across all clusters. - bldr, err := bldr.WatchesMulticluster( - &v1alpha1.Reservation{}, - &handler.EnqueueRequestForObject{}, - notFailoverReservationPredicate, - ) - if err != nil { - return err - } - return bldr.Named("reservation"). + return multicluster.BuildController(mcl, mgr). + For(&v1alpha1.Reservation{}). + WithEventFilter(commitmentReservationPredicate). + Named("commitment-reservation"). WithOptions(controller.Options{ // We want to process reservations one at a time to avoid overbooking. MaxConcurrentReconciles: 1, diff --git a/internal/scheduling/reservations/controller/controller_test.go b/internal/scheduling/reservations/commitments/controller_test.go similarity index 86% rename from internal/scheduling/reservations/controller/controller_test.go rename to internal/scheduling/reservations/commitments/controller_test.go index 0ef3e253c..8ef4b3b95 100644 --- a/internal/scheduling/reservations/controller/controller_test.go +++ b/internal/scheduling/reservations/commitments/controller_test.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package controller +package commitments import ( "context" @@ -9,6 +9,7 @@ import ( "net/http" "net/http/httptest" "testing" + "time" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/meta" @@ -23,11 +24,14 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" ) -func TestReservationReconciler_Reconcile(t *testing.T) { +func TestCommitmentReservationController_Reconcile(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add scheme: %v", err) } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add hypervisor scheme: %v", err) + } tests := []struct { name string @@ -84,9 +88,12 @@ func TestReservationReconciler_Reconcile(t *testing.T) { WithStatusSubresource(&v1alpha1.Reservation{}). Build() - reconciler := &ReservationReconciler{ + reconciler := &CommitmentReservationController{ Client: client, Scheme: scheme, + Conf: Config{ + RequeueIntervalActive: 5 * time.Minute, + }, } req := ctrl.Request{ @@ -133,11 +140,14 @@ func TestReservationReconciler_Reconcile(t *testing.T) { } } -func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T) { +func TestCommitmentReservationController_reconcileInstanceReservation_Success(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add scheme: %v", err) } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add hypervisor scheme: %v", err) + } reservation := &v1alpha1.Reservation{ ObjectMeta: ctrl.ObjectMeta{ @@ -157,7 +167,6 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T } // Create flavor group knowledge CRD for the test - // Need to import compute package for FlavorGroupFeature flavorGroups := []struct { Name string `json:"name"` Flavors []struct { @@ -200,7 +209,7 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T Spec: v1alpha1.KnowledgeSpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, Extractor: v1alpha1.KnowledgeExtractorSpec{ - Name: "flavor_groups", // Note: underscore not hyphen + Name: "flavor_groups", }, Recency: metav1.Duration{Duration: 0}, }, @@ -217,9 +226,23 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T }, } + // Create mock hypervisors + hypervisor1 := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-host-1", + }, + Spec: hv1.HypervisorSpec{}, + } + hypervisor2 := &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-host-2", + }, + Spec: hv1.HypervisorSpec{}, + } + client := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(reservation, flavorGroupKnowledge). + WithObjects(reservation, flavorGroupKnowledge, hypervisor1, hypervisor2). WithStatusSubresource(&v1alpha1.Reservation{}, &v1alpha1.Knowledge{}). Build() @@ -254,12 +277,17 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T }, } - reconciler := &ReservationReconciler{ + reconciler := &CommitmentReservationController{ Client: client, Scheme: scheme, Conf: config, } + // Initialize the reconciler (this sets up SchedulerClient) + if err := reconciler.Init(context.Background(), client, config); err != nil { + t.Fatalf("Failed to initialize reconciler: %v", err) + } + req := ctrl.Request{ NamespacedName: types.NamespacedName{ Name: reservation.Name, diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 21ee1fee1..773f74122 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -119,15 +119,24 @@ func (m *ReservationManager) ApplyCommitmentState( // Phase 4 (DELETE): Remove reservations (capacity decreased) for deltaMemoryBytes < 0 && len(existing) > 0 { - // prefer unused reservation slot or simply remove last one + // prefer ones that are not scheduled, or alternatively, unused reservation slot, or simply remove last one var reservationToDelete *v1alpha1.Reservation for i, res := range existing { - if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + if res.Spec.TargetHost == "" { reservationToDelete = &res existing = append(existing[:i], existing[i+1:]...) // remove from existing list break } } + if reservationToDelete == nil { + for i, res := range existing { + if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + reservationToDelete = &res + existing = append(existing[:i], existing[i+1:]...) // remove from existing list + break + } + } + } if reservationToDelete == nil { reservationToDelete = &existing[len(existing)-1] existing = existing[:len(existing)-1] // remove from existing list @@ -160,6 +169,7 @@ func (m *ReservationManager) ApplyCommitmentState( log.Info("creating reservation", "commitmentUUID", desiredState.CommitmentUUID, "deltaMemoryBytes", deltaMemoryBytes, + "flavorName", reservation.Spec.CommittedResourceReservation.ResourceName, "name", reservation.Name, "memoryBytes", memValue.Value()) @@ -200,7 +210,7 @@ func (m *ReservationManager) ApplyCommitmentState( // syncReservationMetadata updates reservation metadata if it differs from desired state. func (m *ReservationManager) syncReservationMetadata( ctx context.Context, - log logr.Logger, + logger logr.Logger, reservation *v1alpha1.Reservation, state *CommitmentState, ) (*v1alpha1.Reservation, error) { @@ -211,7 +221,7 @@ func (m *ReservationManager) syncReservationMetadata( (state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) || (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) { // Apply patch - log.Info("syncing reservation metadata", + logger.Info("syncing reservation metadata", "reservation", reservation, "desired commitmentUUID", state.CommitmentUUID, "desired availabilityZone", state.AvailabilityZone, diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index 8022999fb..2f20f33cc 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -14,6 +14,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -178,110 +179,338 @@ func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { } } -func TestApplyCommitmentState_PreservesAllocatedReservations(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatal(err) - } - - // Create reservations: one with allocation, one without - existingReservations := []v1alpha1.Reservation{ +func TestApplyCommitmentState_DeletionPriority(t *testing.T) { + tests := []struct { + name string + existingReservations []v1alpha1.Reservation + desiredMemoryBytes int64 + expectedRemovedCount int + validateRemoved func(t *testing.T, removed []v1alpha1.Reservation) + validateRemaining func(t *testing.T, remaining []v1alpha1.Reservation) + }{ { - ObjectMeta: metav1.ObjectMeta{ - Name: "commitment-abc123-0", - Labels: map[string]string{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + name: "Priority 1: Unscheduled reservations (no TargetHost) deleted first", + existingReservations: []v1alpha1.Reservation{ + // Reservation 0: Has TargetHost and allocations - lowest priority (should remain) + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "host-1", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-123": {}, + }, + }, + }, + }, + // Reservation 1: No TargetHost and no allocations - highest priority (should be deleted) + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, }, }, - Spec: v1alpha1.ReservationSpec{ - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + desiredMemoryBytes: 8 * 1024 * 1024 * 1024, // Need to delete one + expectedRemovedCount: 1, + validateRemoved: func(t *testing.T, removed []v1alpha1.Reservation) { + // Should have removed the unscheduled one (no TargetHost) + if removed[0].Spec.TargetHost != "" { + t.Errorf("expected unscheduled reservation to be removed, but removed %s with TargetHost %s", + removed[0].Name, removed[0].Spec.TargetHost) + } + }, + validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) { + if len(remaining) != 1 { + t.Fatalf("expected 1 remaining reservation, got %d", len(remaining)) + } + // Should have kept the scheduled one with allocations + if remaining[0].Spec.TargetHost == "" { + t.Error("expected scheduled reservation to remain") + } + if len(remaining[0].Spec.CommittedResourceReservation.Allocations) == 0 { + t.Error("expected reservation with allocations to remain") + } + }, + }, + { + name: "Priority 2: Unused scheduled reservations (no allocations) deleted next", + existingReservations: []v1alpha1.Reservation{ + // Has TargetHost AND allocations - lowest priority for deletion + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "host-1", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-123": {}, + }, + }, + }, }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: "project-1", - ResourceGroup: "test-group", - Creator: "syncer", - Allocations: map[string]v1alpha1.CommittedResourceAllocation{ - "vm-123": {}, // Has allocation + // Has TargetHost but NO allocations - medium priority + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "host-2", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, }, }, }, + desiredMemoryBytes: 8 * 1024 * 1024 * 1024, + expectedRemovedCount: 1, + validateRemoved: func(t *testing.T, removed []v1alpha1.Reservation) { + // Should have removed the one without allocations + if len(removed[0].Spec.CommittedResourceReservation.Allocations) != 0 { + t.Error("expected reservation without allocations to be removed") + } + }, + validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) { + if len(remaining) != 1 { + t.Fatalf("expected 1 remaining reservation, got %d", len(remaining)) + } + // Should have kept the one with allocations + if len(remaining[0].Spec.CommittedResourceReservation.Allocations) == 0 { + t.Error("expected reservation with allocations to remain") + } + }, }, { - ObjectMeta: metav1.ObjectMeta{ - Name: "commitment-abc123-1", - Labels: map[string]string{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + name: "Mixed scenario: comprehensive deletion priority test", + existingReservations: []v1alpha1.Reservation{ + // Reservation 0: Has TargetHost + has allocations (lowest priority - should remain) + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "host-1", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-allocated": {}, + }, + }, + }, }, - }, - Spec: v1alpha1.ReservationSpec{ - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + // Reservation 1: Has TargetHost + no allocations (medium priority - should remain) + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "host-2", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: "project-1", - ResourceGroup: "test-group", - Creator: "syncer", - Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, // No allocation + // Reservation 2: No TargetHost + no allocations (highest priority - should be deleted) + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-2", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + }, + // Reservation 3: No TargetHost + no allocations (highest priority - should be deleted) + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-3", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + TargetHost: "", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, }, }, + desiredMemoryBytes: 16 * 1024 * 1024 * 1024, // Need to delete 2 out of 4 + expectedRemovedCount: 2, + validateRemoved: func(t *testing.T, removed []v1alpha1.Reservation) { + // Both removed should have no TargetHost (highest priority for deletion) + for _, res := range removed { + if res.Spec.TargetHost != "" { + t.Errorf("expected unscheduled reservations to be removed first, but removed %s with TargetHost %s", + res.Name, res.Spec.TargetHost) + } + } + }, + validateRemaining: func(t *testing.T, remaining []v1alpha1.Reservation) { + if len(remaining) != 2 { + t.Fatalf("expected 2 remaining reservations, got %d", len(remaining)) + } + // Both remaining should have TargetHost + for _, res := range remaining { + if res.Spec.TargetHost == "" { + t.Errorf("expected scheduled reservations to remain, but %s has no TargetHost", res.Name) + } + } + // At least one should have allocations (the one with lowest deletion priority) + hasAllocations := false + for _, res := range remaining { + if len(res.Spec.CommittedResourceReservation.Allocations) > 0 { + hasAllocations = true + break + } + } + if !hasAllocations { + t.Error("expected at least one remaining reservation to have allocations") + } + }, }, } - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(&existingReservations[0], &existingReservations[1]). - Build() + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } - manager := NewReservationManager(client) - flavorGroup := testFlavorGroup() - flavorGroups := map[string]compute.FlavorGroupFeature{ - "test-group": flavorGroup, - } + // Convert slice to individual objects for WithObjects + objects := make([]client.Object, len(tt.existingReservations)) + for i := range tt.existingReservations { + objects[i] = &tt.existingReservations[i] + } - // Desired state: only 16 GiB (need to reduce by one slot) - desiredState := &CommitmentState{ - CommitmentUUID: "abc123", - ProjectID: "project-1", - FlavorGroupName: "test-group", - TotalMemoryBytes: 16 * 1024 * 1024 * 1024, - } + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + Build() - _, removed, err := manager.ApplyCommitmentState( - context.Background(), - logr.Discard(), - desiredState, - flavorGroups, - "syncer", - ) + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } - if err != nil { - t.Fatalf("unexpected error: %v", err) - } + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: tt.desiredMemoryBytes, + } - // Should remove the unallocated reservation, not the allocated one - if len(removed) != 1 { - t.Fatalf("expected 1 removed reservation, got %d", len(removed)) - } + _, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) - // Verify the removed one had no allocations - if len(removed[0].Spec.CommittedResourceReservation.Allocations) != 0 { - t.Error("expected unallocated reservation to be removed first") - } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } - // Verify the allocated reservation still exists - var remainingList v1alpha1.ReservationList - if err := client.List(context.Background(), &remainingList); err != nil { - t.Fatal(err) - } + if len(removed) != tt.expectedRemovedCount { + t.Fatalf("expected %d removed reservations, got %d", tt.expectedRemovedCount, len(removed)) + } - if len(remainingList.Items) != 1 { - t.Fatalf("expected 1 remaining reservation, got %d", len(remainingList.Items)) - } + if tt.validateRemoved != nil { + tt.validateRemoved(t, removed) + } + + // Get remaining reservations + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } - // Verify the remaining one has the allocation - if len(remainingList.Items[0].Spec.CommittedResourceReservation.Allocations) == 0 { - t.Error("expected allocated reservation to be preserved") + if tt.validateRemaining != nil { + tt.validateRemaining(t, remainingList.Items) + } + }) } } diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index 50108beef..183c91553 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -6,16 +6,17 @@ package commitments import ( "errors" "fmt" + "regexp" "strings" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" "github.com/sapcc/go-api-declarations/liquid" - ctrl "sigs.k8s.io/controller-runtime" ) -var stateLog = ctrl.Log.WithName("commitment_state") +// commitmentUUIDPattern validates commitment UUID format +var commitmentUUIDPattern = regexp.MustCompile(`^[a-zA-Z0-9-]{6,64}$`) // Limes LIQUID resource naming convention: ram_ const commitmentResourceNamePrefix = "ram_" @@ -52,6 +53,10 @@ func FromCommitment( commitment Commitment, flavorGroup compute.FlavorGroupFeature, ) (*CommitmentState, error) { + // Validate commitment UUID format + if !commitmentUUIDPattern.MatchString(commitment.UUID) { + return nil, errors.New("unexpected commitment format") + } flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) if err != nil { @@ -99,6 +104,11 @@ func FromChangeCommitmentTargetState( flavorGroup compute.FlavorGroupFeature, az string, ) (*CommitmentState, error) { + // Validate commitment UUID format + commitmentUUID := string(commitment.UUID) + if !commitmentUUIDPattern.MatchString(commitmentUUID) { + return nil, errors.New("unexpected commitment format") + } amountMultiple := uint64(0) var startTime *time.Time @@ -108,9 +118,15 @@ func FromChangeCommitmentTargetState( // guaranteed and confirmed commitments are honored with start time now case liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed: amountMultiple = commitment.Amount - // Set start time to now for active commitments - now := time.Now() - startTime = &now + // Set start time: use ConfirmBy if available (when the commitment was confirmed), + // otherwise use time.Now() for immediate confirmation + confirmByTime := commitment.ConfirmBy.UnwrapOr(time.Time{}) + if !confirmByTime.IsZero() { + startTime = &confirmByTime + } else { + now := time.Now() + startTime = &now + } } // ConfirmBy is ignored for now @@ -184,7 +200,7 @@ func FromReservations(reservations []v1alpha1.Reservation) (*CommitmentState, er // check flavor group consistency, ignore if not matching to repair corrupted state in k8s if res.Spec.CommittedResourceReservation.ResourceGroup != state.FlavorGroupName { // log message - stateLog.Error(errors.New("inconsistent flavor group in reservation"), + commitmentLog.Error(errors.New("inconsistent flavor group in reservation"), "reservation belongs to same commitment but has different flavor group - ignoring reservation for capacity calculation", "reservationName", res.Name, "expectedFlavorGroup", state.FlavorGroupName, diff --git a/internal/scheduling/reservations/controller/monitor.go b/internal/scheduling/reservations/monitor.go similarity index 97% rename from internal/scheduling/reservations/controller/monitor.go rename to internal/scheduling/reservations/monitor.go index 0c0ad2875..2050cf880 100644 --- a/internal/scheduling/reservations/controller/monitor.go +++ b/internal/scheduling/reservations/monitor.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package controller +package reservations import ( "context" @@ -29,7 +29,7 @@ type Monitor struct { reservedResources *prometheus.GaugeVec } -func NewControllerMonitor(k8sClient client.Client) Monitor { +func NewMonitor(k8sClient client.Client) Monitor { return Monitor{ Client: k8sClient, numberOfReservations: prometheus.NewGaugeVec(prometheus.GaugeOpts{ diff --git a/internal/scheduling/reservations/controller/monitor_test.go b/internal/scheduling/reservations/monitor_test.go similarity index 97% rename from internal/scheduling/reservations/controller/monitor_test.go rename to internal/scheduling/reservations/monitor_test.go index eef11892e..cc479955a 100644 --- a/internal/scheduling/reservations/controller/monitor_test.go +++ b/internal/scheduling/reservations/monitor_test.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package controller +package reservations import ( "testing" @@ -20,7 +20,7 @@ import ( ) func TestMonitor_Init(t *testing.T) { - monitor := NewControllerMonitor(nil) + monitor := NewMonitor(nil) if monitor.numberOfReservations == nil { t.Error("numberOfReservations metric should be initialized") @@ -32,7 +32,7 @@ func TestMonitor_Init(t *testing.T) { } func TestMonitor_Describe(t *testing.T) { - monitor := NewControllerMonitor(nil) + monitor := NewMonitor(nil) ch := make(chan *prometheus.Desc, 10) go func() { @@ -62,7 +62,7 @@ func TestMonitor_Collect_EmptyList(t *testing.T) { WithScheme(scheme). Build() - monitor := NewControllerMonitor(k8sClient) + monitor := NewMonitor(k8sClient) ch := make(chan prometheus.Metric, 10) go func() { @@ -177,7 +177,7 @@ func TestMonitor_Collect_WithReservations(t *testing.T) { WithObjects(objects...). Build() - monitor := NewControllerMonitor(k8sClient) + monitor := NewMonitor(k8sClient) ch := make(chan prometheus.Metric, 100) go func() { @@ -266,7 +266,7 @@ func TestMonitor_Collect_ResourceMetrics(t *testing.T) { WithObjects(reservation). Build() - monitor := NewControllerMonitor(k8sClient) + monitor := NewMonitor(k8sClient) ch := make(chan prometheus.Metric, 100) go func() { @@ -332,7 +332,7 @@ func TestMonitor_Collect_ErrorHandling(t *testing.T) { WithScheme(scheme). Build() - monitor := NewControllerMonitor(k8sClient) + monitor := NewMonitor(k8sClient) ch := make(chan prometheus.Metric, 10) go func() { @@ -390,7 +390,7 @@ func TestMonitor_Collect_LabelSanitization(t *testing.T) { WithObjects(reservation). Build() - monitor := NewControllerMonitor(client) + monitor := NewMonitor(client) ch := make(chan prometheus.Metric, 100) go func() { From cbd4504bce58f0a263ec6c1cc1c56dacda195fbe Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 19 Mar 2026 16:19:52 +0100 Subject: [PATCH 2/3] config --- helm/bundles/cortex-nova/values.yaml | 5 ++--- internal/scheduling/reservations/commitments/config.go | 10 +++------- .../scheduling/reservations/commitments/controller.go | 4 ++-- .../reservations/commitments/controller_test.go | 4 +--- internal/scheduling/reservations/commitments/state.go | 4 ++-- 5 files changed, 10 insertions(+), 17 deletions(-) diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 4be224d36..b2ec15a37 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -138,9 +138,8 @@ cortex-scheduling-controllers: - reservations-controller enabledTasks: - nova-decisions-cleanup-task - # Endpoints configuration for reservations controller - endpoints: - novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" + # NovaExternalScheduler is the URL of the nova external scheduler API for CR reservations + novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" # FlavorGroupPipelines maps flavor group IDs to pipeline names for CR reservations # This allows different scheduling strategies per flavor group (e.g., HANA vs GP) flavorGroupPipelines: diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index 94a4b80b1..3b9e90ed3 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -18,8 +18,8 @@ type Config struct { // PipelineDefault is the default pipeline used for scheduling committed resource reservations. PipelineDefault string `json:"pipelineDefault"` - // The endpoint where to find the nova external scheduler endpoint. - Endpoints EndpointsConfig `json:"endpoints"` + // NovaExternalScheduler is the endpoint where to find the nova external scheduler endpoint. + NovaExternalScheduler string `json:"novaExternalScheduler"` // Secret ref to SSO credentials stored in a k8s secret, if applicable. SSOSecretRef *corev1.SecretReference `json:"ssoSecretRef"` @@ -49,16 +49,12 @@ type Config struct { EnableChangeCommitmentsAPI bool `json:"enableChangeCommitmentsAPI"` } -type EndpointsConfig struct { - // The nova external scheduler endpoint. - NovaExternalScheduler string `json:"novaExternalScheduler"` -} - func DefaultConfig() Config { return Config{ RequeueIntervalActive: 5 * time.Minute, RequeueIntervalRetry: 1 * time.Minute, PipelineDefault: "kvm-committed-resource-reservation-general-purpose", + NovaExternalScheduler: "http://localhost:8080/scheduler/nova/external", ChangeAPIWatchReservationsTimeout: 10 * time.Second, ChangeAPIWatchReservationsPollInterval: 500 * time.Millisecond, EnableChangeCommitmentsAPI: true, diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 611d6e1f6..3b11a94a8 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -429,8 +429,8 @@ func (r *CommitmentReservationController) Init(ctx context.Context, client clien } // Initialize scheduler client - r.SchedulerClient = reservations.NewSchedulerClient(conf.Endpoints.NovaExternalScheduler) - logf.FromContext(ctx).Info("scheduler client initialized for commitment reservation controller", "url", conf.Endpoints.NovaExternalScheduler) + r.SchedulerClient = reservations.NewSchedulerClient(conf.NovaExternalScheduler) + logf.FromContext(ctx).Info("scheduler client initialized for commitment reservation controller", "url", conf.NovaExternalScheduler) return nil } diff --git a/internal/scheduling/reservations/commitments/controller_test.go b/internal/scheduling/reservations/commitments/controller_test.go index 8ef4b3b95..951188777 100644 --- a/internal/scheduling/reservations/commitments/controller_test.go +++ b/internal/scheduling/reservations/commitments/controller_test.go @@ -272,9 +272,7 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t defer server.Close() config := Config{ - Endpoints: EndpointsConfig{ - NovaExternalScheduler: server.URL, - }, + NovaExternalScheduler: server.URL, } reconciler := &CommitmentReservationController{ diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index 183c91553..bfa6fdb28 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -15,8 +15,8 @@ import ( "github.com/sapcc/go-api-declarations/liquid" ) -// commitmentUUIDPattern validates commitment UUID format -var commitmentUUIDPattern = regexp.MustCompile(`^[a-zA-Z0-9-]{6,64}$`) +// commitmentUUIDPattern validates commitment UUID format. +var commitmentUUIDPattern = regexp.MustCompile(`^[a-zA-Z0-9-]{6,40}$`) // Limes LIQUID resource naming convention: ram_ const commitmentResourceNamePrefix = "ram_" From dd75f6fcb0b18dcdf0af17df22eed54aaa0d664f Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 19 Mar 2026 17:13:36 +0100 Subject: [PATCH 3/3] move check --- .../reservations/commitments/controller.go | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 3b11a94a8..94a2f1d94 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -59,6 +59,33 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, client.IgnoreNotFound(err) } + // filter for CR reservations + resourceName := "" + if res.Spec.CommittedResourceReservation != nil { + resourceName = res.Spec.CommittedResourceReservation.ResourceName + } + if resourceName == "" { + logger.Info("reservation has no resource name, skipping") + old := res.DeepCopy() + meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionFalse, + Reason: "MissingResourceName", + Message: "reservation has no resource name", + }) + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, &res, patch); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + logger.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil + } + return ctrl.Result{}, nil // Don't need to requeue. + } + if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { logger.Info("reservation is active, verifying allocations") @@ -129,33 +156,6 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr logger.Info("synced spec to status", "host", res.Status.Host) } - // filter for CR reservations - resourceName := "" - if res.Spec.CommittedResourceReservation != nil { - resourceName = res.Spec.CommittedResourceReservation.ResourceName - } - if resourceName == "" { - logger.Info("reservation has no resource name, skipping") - old := res.DeepCopy() - meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ - Type: v1alpha1.ReservationConditionReady, - Status: metav1.ConditionFalse, - Reason: "MissingResourceName", - Message: "reservation has no resource name", - }) - patch := client.MergeFrom(old) - if err := r.Status().Patch(ctx, &res, patch); err != nil { - // Ignore not-found errors during background deletion - if client.IgnoreNotFound(err) != nil { - logger.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err - } - // Object was deleted, no need to continue - return ctrl.Result{}, nil - } - return ctrl.Result{}, nil // Don't need to requeue. - } - // Get project ID from CommittedResourceReservation spec if available. projectID := "" if res.Spec.CommittedResourceReservation != nil {