From db282613d1a2d88b311a9639806360500f2dfe0a Mon Sep 17 00:00:00 2001 From: David Joshy Date: Thu, 14 May 2026 15:52:50 -0400 Subject: [PATCH 1/3] bootimage: rename patchSkipped to reconcileSkipped --- pkg/controller/bootimage/ms_helpers.go | 15 ++++++++++----- pkg/controller/bootimage/platform_helpers.go | 17 ++++++++++------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pkg/controller/bootimage/ms_helpers.go b/pkg/controller/bootimage/ms_helpers.go index 3380d07338..cfe6563966 100644 --- a/pkg/controller/bootimage/ms_helpers.go +++ b/pkg/controller/bootimage/ms_helpers.go @@ -80,7 +80,7 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing) for _, machineSet := range mapiMachineSets { - patchSkipped, err := ctrl.syncMAPIMachineSet(machineSet) + reconcileSkipped, err := ctrl.syncMAPIMachineSet(machineSet) if err == nil { ctrl.mapiStats.inProgress++ } else { @@ -88,7 +88,7 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { syncErrors = append(syncErrors, fmt.Errorf("error syncing MAPI MachineSet %s: %w", machineSet.Name, err)) ctrl.mapiStats.erroredCount++ } - if patchSkipped { + if reconcileSkipped { ctrl.mapiStats.skippedCount++ } // Update progressing conditions every step of the loop @@ -104,7 +104,12 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { } } -// syncMAPIMachineSet will attempt to reconcile the provided machineset +// syncMAPIMachineSet will attempt to reconcile the provided machineset. +// Returns (reconcileSkipped, error): reconcileSkipped=true means something blocked the +// boot image update that requires manual intervention; rather than returning an +// error immediately, the condition is surfaced via skew enforcement. +// reconcileSkipped=false means a patch was applied, the MachineSet was already up to +// date, or it is out of scope for the MAPI path (e.g. migrated to CAPI authority). func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet) (bool, error) { startTime := time.Now() @@ -190,7 +195,7 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet } // Check if the this MachineSet requires an update - patchRequired, patchSkipped, newMachineSet, err := checkMachineSet(infra, machineSet, configMap, arch, ctrl.kubeClient) + patchRequired, reconcileSkipped, newMachineSet, err := checkMachineSet(infra, machineSet, configMap, arch, ctrl.kubeClient) if err != nil { return false, fmt.Errorf("failed to reconcile machineset %s, err: %w", machineSet.Name, err) } @@ -208,7 +213,7 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet return false, nil } klog.Infof("No patching required for MAPI machineset %s", machineSet.Name) - return patchSkipped, nil + return reconcileSkipped, nil } // getMAPIBootImageValue returns the value used for hot loop detection. diff --git a/pkg/controller/bootimage/platform_helpers.go b/pkg/controller/bootimage/platform_helpers.go index ef354c4dc5..6581ab262d 100644 --- a/pkg/controller/bootimage/platform_helpers.go +++ b/pkg/controller/bootimage/platform_helpers.go @@ -42,9 +42,11 @@ type publisherOffer struct { publisher, offer string } -// This function calls the appropriate reconcile function based on the infra type -// On success, it will return a bool indicating if a patch is required, and an updated -// machineset object if any. It will return an error if any of the above steps fail. +// checkMachineSet calls the appropriate reconcile function based on the infra type. +// Returns (patchRequired, reconcileSkipped, newMachineSet, error). +// reconcileSkipped=true means the boot image could not be updated automatically (e.g. +// custom or unknown image) and requires manual intervention; the condition is surfaced +// via skew enforcement rather than returned as an error. func checkMachineSet(infra *osconfigv1.Infrastructure, machineSet *machinev1beta1.MachineSet, configMap *corev1.ConfigMap, arch string, secretClient clientset.Interface) (bool, bool, *machinev1beta1.MachineSet, error) { switch infra.Status.PlatformStatus.Type { case osconfigv1.AWSPlatformType: @@ -61,7 +63,8 @@ func checkMachineSet(infra *osconfigv1.Infrastructure, machineSet *machinev1beta } } -// Generic reconcile function that handles the common pattern across all platforms +// reconcilePlatform is a generic reconcile function that handles the common pattern across all platforms. +// Returns (patchRequired, reconcileSkipped, newMachineSet, error). See checkMachineSet for reconcileSkipped semantics. // nolint:dupl // I separated this from reconcilePlatformCPMS for readability func reconcilePlatform[T any]( machineSet *machinev1beta1.MachineSet, @@ -70,7 +73,7 @@ func reconcilePlatform[T any]( arch string, secretClient clientset.Interface, reconcileProviderSpec func(*stream.Stream, string, *osconfigv1.Infrastructure, *T, string, clientset.Interface) (bool, bool, *T, error), -) (patchRequired, patchSkipped bool, newMachineSet *machinev1beta1.MachineSet, err error) { +) (patchRequired, reconcileSkipped bool, newMachineSet *machinev1beta1.MachineSet, err error) { klog.Infof("Reconciling MAPI machineset %s on %s, with arch %s", machineSet.Name, string(infra.Status.PlatformStatus.Type), arch) // Unmarshal the provider spec @@ -86,14 +89,14 @@ func reconcilePlatform[T any]( } // Reconcile the provider spec - patchRequired, patchSkipped, newProviderSpec, err := reconcileProviderSpec(streamData, arch, infra, providerSpec, machineSet.Name, secretClient) + patchRequired, reconcileSkipped, newProviderSpec, err := reconcileProviderSpec(streamData, arch, infra, providerSpec, machineSet.Name, secretClient) if err != nil { return false, false, nil, err } // If no patch is required, exit early if !patchRequired { - return false, patchSkipped, nil, nil + return false, reconcileSkipped, nil, nil } // If patch is required, marshal the new providerspec into the machineset From fa12d23ea13cb5dddf84aa060bcf2e345946e41a Mon Sep 17 00:00:00 2001 From: David Joshy Date: Tue, 9 Jun 2026 13:24:11 -0400 Subject: [PATCH 2/3] bootimage: remove configmap version checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit isClusterStable() already gates the entire sync loop — the operator stamps the configmap with the MCO version hash and OCP release version as soon as the first master node updates, which always happens before ClusterVersion marks the upgrade complete. By the time syncMAPIMachineSets and syncControlPlaneMachineSet run, the configmap is guaranteed to be up to date. --- pkg/controller/bootimage/cpms_helpers.go | 25 +------------ pkg/controller/bootimage/ms_helpers.go | 47 ++++++++---------------- 2 files changed, 17 insertions(+), 55 deletions(-) diff --git a/pkg/controller/bootimage/cpms_helpers.go b/pkg/controller/bootimage/cpms_helpers.go index b362275395..13270630d2 100644 --- a/pkg/controller/bootimage/cpms_helpers.go +++ b/pkg/controller/bootimage/cpms_helpers.go @@ -14,7 +14,6 @@ import ( machinev1 "github.com/openshift/api/machine/v1" opv1 "github.com/openshift/api/operator/v1" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" - operatorversion "github.com/openshift/machine-config-operator/pkg/version" "sigs.k8s.io/yaml" corev1 "k8s.io/api/core/v1" @@ -145,31 +144,9 @@ func (ctrl *Controller) syncControlPlaneMachineSet(controlPlaneMachineSet *machi return fmt.Errorf("failed to fetch infra object during ControlPlaneMachineSet sync: %w", err) } - // Fetch the bootimage configmap & ensure it has been stamped by the operator. This is done by - // the operator when a master node successfully updates to a new image. This is - // to prevent machinesets from being updated before the operator itself has updated. - // If it hasn't been updated, exit and wait for a resync. configMap, err := ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName) if err != nil { - return fmt.Errorf("failed to fetch coreos-bootimages config map duringControlPlaneMachineSet sync: %w", err) - } - versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey] - if !versionHashFound { - klog.Infof("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) - return nil - } - if versionHashFromCM != operatorversion.Hash { - klog.Infof("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete") - return nil - } - releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.OCPReleaseVersionKey] - if !releaseVersionFound { - klog.Infof("failed to find OCP release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) - return nil - } - if releaseVersionFromCM != operatorversion.ReleaseVersion { - klog.Infof("mismatch between OCP release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete") - return nil + return fmt.Errorf("failed to fetch coreos-bootimages config map during ControlPlaneMachineSet sync: %w", err) } // Check if the this ControlPlaneMachineSet requires an update diff --git a/pkg/controller/bootimage/ms_helpers.go b/pkg/controller/bootimage/ms_helpers.go index cfe6563966..b13f5aa196 100644 --- a/pkg/controller/bootimage/ms_helpers.go +++ b/pkg/controller/bootimage/ms_helpers.go @@ -13,7 +13,6 @@ import ( machinev1beta1 "github.com/openshift/api/machine/v1beta1" opv1 "github.com/openshift/api/operator/v1" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" - operatorversion "github.com/openshift/machine-config-operator/pkg/version" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" kubeErrs "k8s.io/apimachinery/pkg/util/errors" @@ -69,6 +68,17 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { } } + var configMap *corev1.ConfigMap + // The configMap is not needed if no resources are being reconciled; so check that first before making the API call. + if len(mapiMachineSets) > 0 { + configMap, err = ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName) + if err != nil { + klog.Errorf("failed to fetch coreos-bootimages config map: %v", err) + ctrl.updateConditions(reason, fmt.Errorf("failed to fetch coreos-bootimages config map: %w", err), opv1.MachineConfigurationBootImageUpdateDegraded) + return + } + } + // Reset stats before initiating reconciliation loop ctrl.mapiStats.inProgress = 0 ctrl.mapiStats.totalCount = len(mapiMachineSets) @@ -80,7 +90,7 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing) for _, machineSet := range mapiMachineSets { - reconcileSkipped, err := ctrl.syncMAPIMachineSet(machineSet) + reconcileSkipped, err := ctrl.syncMAPIMachineSet(machineSet, configMap) if err == nil { ctrl.mapiStats.inProgress++ } else { @@ -167,40 +177,15 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet return false, fmt.Errorf("failed to fetch infra object during machineset sync: %w", err) } - // Fetch the bootimage configmap & ensure it has been stamped by the operator. This is done by - // the operator when a master node successfully updates to a new image. This is - // to prevent machinesets from being updated before the operator itself has updated. - // If it hasn't been updated, exit and wait for a resync. - configMap, err := ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName) - if err != nil { - return false, fmt.Errorf("failed to fetch coreos-bootimages config map during machineset sync: %w", err) - } - versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey] - if !versionHashFound { - klog.Infof("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) - return true, nil - } - if versionHashFromCM != operatorversion.Hash { - klog.Infof("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete") - return true, nil - } - releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.OCPReleaseVersionKey] - if !releaseVersionFound { - klog.Infof("failed to find OCP release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) - return true, nil - } - if releaseVersionFromCM != operatorversion.ReleaseVersion { - klog.Infof("mismatch between OCP release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete") - return true, nil - } - // Check if the this MachineSet requires an update patchRequired, reconcileSkipped, newMachineSet, err := checkMachineSet(infra, machineSet, configMap, arch, ctrl.kubeClient) if err != nil { return false, fmt.Errorf("failed to reconcile machineset %s, err: %w", machineSet.Name, err) } - // Patch the machineset if required + if reconcileSkipped { + return true, nil + } if patchRequired { if ctrl.checkMAPIMachineSetHotLoop(newMachineSet, configMap, infra, arch) { return false, fmt.Errorf("refusing to reconcile machineset %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", machineSet.Name) @@ -213,7 +198,7 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet return false, nil } klog.Infof("No patching required for MAPI machineset %s", machineSet.Name) - return reconcileSkipped, nil + return false, nil } // getMAPIBootImageValue returns the value used for hot loop detection. From f617a2c5443c6e8d998d4035f4daa2dbc2d5a467 Mon Sep 17 00:00:00 2001 From: David Joshy Date: Tue, 9 Jun 2026 14:02:04 -0400 Subject: [PATCH 3/3] bootimage: add reset cluster bootimage capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When one or more MachineSets are reconcile-skipped (e.g. due to an OwnerReference, missing arch annotation, or an unrecognisable boot image), the ClusterBootImageAutomatic record becomes stale: a newly added or modified MachineSet may be running an older image, but the recorded OCP version no longer reflects that. The skew check then silently passes when it should not. resetClusterBootImage() addresses this by writing the cluster's install version into ClusterBootImageAutomatic whenever a reconcile-skip occurs without an accompanying error — the most conservative value we can reliably claim — so the skew check correctly surfaces a violation if the cluster is genuinely out of date. --- pkg/apihelpers/apihelpers.go | 31 +++++ .../bootimage/boot_image_controller.go | 38 +++++- .../bootimage/boot_image_controller_test.go | 109 +++++++++++++++++- pkg/controller/bootimage/ms_helpers.go | 15 ++- pkg/operator/sync.go | 35 +----- 5 files changed, 193 insertions(+), 35 deletions(-) diff --git a/pkg/apihelpers/apihelpers.go b/pkg/apihelpers/apihelpers.go index 15277963fa..d62318e241 100644 --- a/pkg/apihelpers/apihelpers.go +++ b/pkg/apihelpers/apihelpers.go @@ -7,6 +7,7 @@ package apihelpers import ( "fmt" + configv1 "github.com/openshift/api/config/v1" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" opv1 "github.com/openshift/api/operator/v1" "github.com/openshift/machine-config-operator/pkg/daemon/constants" @@ -16,6 +17,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" + k8sversion "k8s.io/apimachinery/pkg/util/version" "k8s.io/klog/v2" ) @@ -657,3 +659,32 @@ func GetSkewEnforcementStatusNone() opv1.BootImageSkewEnforcementStatus { Mode: opv1.BootImageSkewEnforcementModeStatusNone, } } + +// GetOCPInstallVersion extracts the install-time OCP version from +// ClusterVersion history — the oldest completed update entry, normalised to Major.Minor.Patch. +// Returns "0.0.0" as a conservative fallback if the history is empty or the version cannot be parsed. +func GetOCPInstallVersion(clusterVersion *configv1.ClusterVersion) string { + // This is not possible, but return a sane value to avoid API validation failures. + if len(clusterVersion.Status.History) == 0 { + klog.Warning("ClusterVersion has no history, cannot determine install version") + return "0.0.0" + } + var installVersion string + for i := len(clusterVersion.Status.History) - 1; i >= 0; i-- { + if clusterVersion.Status.History[i].State == configv1.CompletedUpdate { + installVersion = clusterVersion.Status.History[i].Version + break + } + } + if installVersion == "" { + installVersion = clusterVersion.Status.History[len(clusterVersion.Status.History)-1].Version + } + parsedVersion, err := k8sversion.ParseGeneric(installVersion) + if err != nil { + // Version strings from ClusterVersion history are always valid semver in practice. + // Log a warning and return 0.0.0 as a sanity check. + klog.Warningf("Failed to parse install version %q: %v", installVersion, err) + return "0.0.0" + } + return fmt.Sprintf("%d.%d.%d", parsedVersion.Major(), parsedVersion.Minor(), parsedVersion.Patch()) +} diff --git a/pkg/controller/bootimage/boot_image_controller.go b/pkg/controller/bootimage/boot_image_controller.go index c69788f974..8e54b0e111 100644 --- a/pkg/controller/bootimage/boot_image_controller.go +++ b/pkg/controller/bootimage/boot_image_controller.go @@ -39,6 +39,7 @@ import ( mcopinformersv1 "github.com/openshift/client-go/operator/informers/externalversions/operator/v1" mcoplistersv1 "github.com/openshift/client-go/operator/listers/operator/v1" + apihelpers "github.com/openshift/machine-config-operator/pkg/apihelpers" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/pkg/osimagestream" ) @@ -628,7 +629,42 @@ func (ctrl *Controller) updateClusterBootImage() { } // Only make an API call if there is an update to the skew enforcement status - if !reflect.DeepEqual(mcop.Status.BootImageSkewEnforcementStatus, newBootImageSkewEnforcementStatus) { + if !reflect.DeepEqual(mcop.Status.BootImageSkewEnforcementStatus, *newBootImageSkewEnforcementStatus) { + mcop.Status.BootImageSkewEnforcementStatus = *newBootImageSkewEnforcementStatus + ctrl.updateMachineConfigurationStatus(mcop.Status) + } +} + +// resetClusterBootImage writes the cluster's install OCP version into ClusterBootImageAutomatic when +// one or more MachineSets were reconcileSkipped. The recorded value can no longer be trusted as an +// accurate description of the full cluster state (a newly-added or changed MachineSet may be on an +// older image), so we fall back to the install version — the oldest version we can reliably claim — +// so that the skew check correctly surfaces a violation if the cluster is genuinely out of date. +func (ctrl *Controller) resetClusterBootImage() { + mcop, err := ctrl.mcopClient.OperatorV1().MachineConfigurations().Get(context.TODO(), ctrlcommon.MCOOperatorKnobsObjectName, metav1.GetOptions{}) + if err != nil { + klog.Errorf("error resetting cluster boot image record: %s", err) + return + } + if mcop.Status.BootImageSkewEnforcementStatus.Mode != opv1.BootImageSkewEnforcementModeStatusAutomatic { + return + } + + // Default to a conservative value to be on the safe side. + ocpVersion := "0.0.0" + clusterVersion, err := ctrl.clusterVersionLister.Get("version") + if err != nil { + klog.Warningf("Failed to get ClusterVersion for boot image reset: %v", err) + } else { + ocpVersion = apihelpers.GetOCPInstallVersion(clusterVersion) + } + + newBootImageSkewEnforcementStatus := mcop.Status.BootImageSkewEnforcementStatus.DeepCopy() + newBootImageSkewEnforcementStatus.Automatic = opv1.ClusterBootImageAutomatic{ + OCPVersion: ocpVersion, + } + if !reflect.DeepEqual(mcop.Status.BootImageSkewEnforcementStatus, *newBootImageSkewEnforcementStatus) { + klog.Infof("Resetting cluster boot image record to install version %s due to reconcileSkipped MachineSets", ocpVersion) mcop.Status.BootImageSkewEnforcementStatus = *newBootImageSkewEnforcementStatus ctrl.updateMachineConfigurationStatus(mcop.Status) } diff --git a/pkg/controller/bootimage/boot_image_controller_test.go b/pkg/controller/bootimage/boot_image_controller_test.go index 6ae3b4f980..4bdc47171a 100644 --- a/pkg/controller/bootimage/boot_image_controller_test.go +++ b/pkg/controller/bootimage/boot_image_controller_test.go @@ -1,13 +1,16 @@ package bootimage import ( + "context" "testing" "github.com/coreos/stream-metadata-go/stream" "github.com/coreos/stream-metadata-go/stream/rhcos" osconfigv1 "github.com/openshift/api/config/v1" machinev1beta1 "github.com/openshift/api/machine/v1beta1" + opv1 "github.com/openshift/api/operator/v1" configlistersv1 "github.com/openshift/client-go/config/listers/config/v1" + fakemcopclient "github.com/openshift/client-go/operator/clientset/versioned/fake" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" @@ -16,6 +19,7 @@ import ( "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" ) func TestIsClusterStable(t *testing.T) { @@ -533,7 +537,7 @@ func TestReconcileAzureProviderSpec(t *testing.T) { }, } - fakeClient := fake.NewSimpleClientset(testSecret) + fakeClient := fake.NewClientset(testSecret) tests := []struct { name string @@ -969,3 +973,106 @@ func TestReconcileAzureProviderSpec(t *testing.T) { }) } } + +func TestResetClusterBootImage(t *testing.T) { + cases := []struct { + name string + history []osconfigv1.UpdateHistory + mode opv1.BootImageSkewEnforcementModeStatus + initialOCPVersion string + expectedVersion string + }{ + { + name: "resets to oldest completed entry (install version)", + history: []osconfigv1.UpdateHistory{ + {State: osconfigv1.CompletedUpdate, Version: "4.19.0"}, + {State: osconfigv1.CompletedUpdate, Version: "4.18.0"}, + }, + mode: opv1.BootImageSkewEnforcementModeStatusAutomatic, + initialOCPVersion: "4.19.0", + expectedVersion: "4.18.0", + }, + { + name: "no-op when mode is not automatic", + history: []osconfigv1.UpdateHistory{ + {State: osconfigv1.CompletedUpdate, Version: "4.18.0"}, + }, + mode: opv1.BootImageSkewEnforcementModeStatusNone, + initialOCPVersion: "4.19.0", + expectedVersion: "4.19.0", + }, + { + name: "no-op when already at install version", + history: []osconfigv1.UpdateHistory{ + {State: osconfigv1.CompletedUpdate, Version: "4.18.0"}, + }, + mode: opv1.BootImageSkewEnforcementModeStatusAutomatic, + initialOCPVersion: "4.18.0", + expectedVersion: "4.18.0", + }, + { + name: "falls back to 0.0.0 when history is empty", + history: []osconfigv1.UpdateHistory{}, + mode: opv1.BootImageSkewEnforcementModeStatusAutomatic, + initialOCPVersion: "4.19.0", + expectedVersion: "0.0.0", + }, + { + name: "uses last entry when no completed update exists", + history: []osconfigv1.UpdateHistory{ + {State: osconfigv1.PartialUpdate, Version: "4.18.0"}, + }, + mode: opv1.BootImageSkewEnforcementModeStatusAutomatic, + initialOCPVersion: "4.19.0", + expectedVersion: "4.18.0", + }, + { + // history==nil is the sentinel for "do not populate the lister", + // exercising the clusterVersionLister.Get() error path. + name: "falls back to 0.0.0 when ClusterVersion unavailable", + history: nil, + mode: opv1.BootImageSkewEnforcementModeStatusAutomatic, + initialOCPVersion: "4.19.0", + expectedVersion: "0.0.0", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + mcop := &opv1.MachineConfiguration{ + ObjectMeta: v1.ObjectMeta{Name: ctrlcommon.MCOOperatorKnobsObjectName}, + Status: opv1.MachineConfigurationStatus{ + BootImageSkewEnforcementStatus: opv1.BootImageSkewEnforcementStatus{ + Mode: tc.mode, + Automatic: opv1.ClusterBootImageAutomatic{ + OCPVersion: tc.initialOCPVersion, + }, + }, + }, + } + + cvIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) + if tc.history != nil { + cv := &osconfigv1.ClusterVersion{ + ObjectMeta: v1.ObjectMeta{Name: "version"}, + Status: osconfigv1.ClusterVersionStatus{History: tc.history}, + } + require.NoError(t, cvIndexer.Add(cv)) + } + + fakeMcopClient := fakemcopclient.NewClientset(mcop) + ctrl := &Controller{ + mcopClient: fakeMcopClient, + clusterVersionLister: configlistersv1.NewClusterVersionLister(cvIndexer), + } + + ctrl.resetClusterBootImage() + + updated, err := fakeMcopClient.OperatorV1().MachineConfigurations().Get( + context.TODO(), ctrlcommon.MCOOperatorKnobsObjectName, v1.GetOptions{}) + require.NoError(t, err) + assert.Equal(t, tc.expectedVersion, + updated.Status.BootImageSkewEnforcementStatus.Automatic.OCPVersion) + }) + } +} diff --git a/pkg/controller/bootimage/ms_helpers.go b/pkg/controller/bootimage/ms_helpers.go index b13f5aa196..a4bd05fb84 100644 --- a/pkg/controller/bootimage/ms_helpers.go +++ b/pkg/controller/bootimage/ms_helpers.go @@ -106,11 +106,20 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { } // Update/Clear degrade conditions based on errors from this loop ctrl.updateConditions(reason, kubeErrs.NewAggregate(syncErrors), opv1.MachineConfigurationBootImageUpdateDegraded) - // If no machinesets were skipped, update the cluster boot image record if ctrl.fgHandler.Enabled(features.FeatureGateBootImageSkewEnforcement) { - if ctrl.mapiStats.skippedCount == 0 { + switch { + case ctrl.mapiStats.skippedCount == 0 && len(syncErrors) == 0: + // All MachineSets reconciled cleanly — record the current OCP version. ctrl.updateClusterBootImage() + case ctrl.mapiStats.skippedCount > 0 && len(syncErrors) == 0: + // One or more MachineSets were reconcileSkipped without an error. The existing + // ClusterBootImageAutomatic value is no longer trustworthy (a new or changed + // MachineSet may be running an older image), so reset it to the cluster install + // version to ensure the skew check surfaces a violation if warranted. + ctrl.resetClusterBootImage() } + // Errors (syncErrors > 0) are already surfaced via the Degraded condition, which + // checkBootImageControllerReady checks first — no boot image record update needed. } } @@ -120,7 +129,7 @@ func (ctrl *Controller) syncMAPIMachineSets(reason string) { // error immediately, the condition is surfaced via skew enforcement. // reconcileSkipped=false means a patch was applied, the MachineSet was already up to // date, or it is out of scope for the MAPI path (e.g. migrated to CAPI authority). -func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet) (bool, error) { +func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet, configMap *corev1.ConfigMap) (bool, error) { startTime := time.Now() klog.V(4).Infof("Started syncing MAPI machineset %q (%v)", machineSet.Name, startTime) diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index cff069be95..c4c1a9d1ef 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -2560,7 +2560,7 @@ func (optr *Operator) syncBootImageSkewEnforcementStatus(mcop *opv1.MachineConfi } // Grab install time OCP version - ocpVersionAtInstall := optr.getOCPInstallVersionFromClusterVersion() + ocpVersionAtInstall := optr.getOCPInstallVersion() // Spec override takes priority over all platform defaults. if mcop.Spec.BootImageSkewEnforcement != (opv1.BootImageSkewEnforcementConfig{}) { @@ -2601,40 +2601,15 @@ func (optr *Operator) syncBootImageSkewEnforcementStatus(mcop *opv1.MachineConfi newMachineConfigurationStatus.BootImageSkewEnforcementStatus = apihelpers.GetSkewEnforcementStatusManualWithOCPVersion(ocpVersionAtInstall) } -// getOCPInstallVersionFromClusterVersion extracts the original install version from ClusterVersion history. -// It finds the oldest completed update in history and parses it to a clean version string. -// Returns an empty string if ClusterVersion cannot be retrieved or parsed. -func (optr *Operator) getOCPInstallVersionFromClusterVersion() string { +// getOCPInstallVersion extracts the original install version from ClusterVersion history. +// Returns "0.0.0" if the version cannot be determined, so that downstream skew checks always have a value to compare. +func (optr *Operator) getOCPInstallVersion() string { clusterVersion, err := optr.clusterVersionLister.Get("version") if err != nil { klog.Warningf("Failed to get ClusterVersion: %v, skipping boot image skew enforcement configuration", err) - return "" - } - if len(clusterVersion.Status.History) == 0 { - klog.Warningf("ClusterVersion has no history, skipping boot image skew enforcement configuration") - return "" - } - - // History is ordered by recency (newest first), so find the last completed entry (install version) - var installVersion string - for i := len(clusterVersion.Status.History) - 1; i >= 0; i-- { - if clusterVersion.Status.History[i].State == configv1.CompletedUpdate { - installVersion = clusterVersion.Status.History[i].Version - break - } - } - // ClusterVersion has no completed updates in history(install likely on-going), default to last value - if installVersion == "" { - klog.Warningf("ClusterVersion has no completed updates in history(install likely on-going), default to last value") - installVersion = clusterVersion.Status.History[len(clusterVersion.Status.History)-1].Version - } - // Scrape away CI/nightly tags if needed - parsedVersion, err := k8sversion.ParseGeneric(installVersion) - if err != nil { - klog.Warningf("Failed to parse install version %q: %v, use a placeholder for now", installVersion, err) return "0.0.0" } - return fmt.Sprintf("%d.%d.%d", parsedVersion.Major(), parsedVersion.Minor(), parsedVersion.Patch()) + return apihelpers.GetOCPInstallVersion(clusterVersion) } // getCurrentOCPVersionFromClusterVersion extracts the latest OCP version from ClusterVersion history.