diff --git a/api/v1alpha1/nodereadinessrule_types.go b/api/v1alpha1/nodereadinessrule_types.go index 98f0b43..b2a021f 100644 --- a/api/v1alpha1/nodereadinessrule_types.go +++ b/api/v1alpha1/nodereadinessrule_types.go @@ -112,33 +112,10 @@ type NodeReadinessRuleStatus struct { // +kubebuilder:validation:Minimum=1 ObservedGeneration int64 `json:"observedGeneration,omitempty"` - // appliedNodes lists the names of Nodes where the taint has been successfully managed. - // This provides a quick reference to the scope of impact for this rule. + // lastEvaluationTime is the timestamp when the rule was evaluated against all the nodes in the cluster. // - // +optional - // +listType=set - // +kubebuilder:validation:MaxItems=5000 - // +kubebuilder:validation:items:MaxLength=253 - AppliedNodes []string `json:"appliedNodes,omitempty"` - - // failedNodes lists the Nodes where the rule evaluation encountered an error. - // This is used for troubleshooting configuration issues, such as invalid selectors during node lookup. - // - // +optional - // +listType=map - // +listMapKey=nodeName - // +kubebuilder:validation:MaxItems=5000 - FailedNodes []NodeFailure `json:"failedNodes,omitempty"` - - // nodeEvaluations provides detailed insight into the rule's assessment for individual Nodes. - // This is primarily used for auditing and debugging why specific Nodes were or - // were not targeted by the rule. - // - // +optional - // +listType=map - // +listMapKey=nodeName - // +kubebuilder:validation:MaxItems=5000 - NodeEvaluations []NodeEvaluation `json:"nodeEvaluations,omitempty"` + // +required + LastEvaluationTime metav1.Time `json:"lastEvaluationTime,omitempty,omitzero"` // dryRunResults captures the outcome of the rule evaluation when DryRun is enabled. // This field provides visibility into the actions the controller would have taken, @@ -148,93 +125,6 @@ type NodeReadinessRuleStatus struct { DryRunResults DryRunResults `json:"dryRunResults,omitempty,omitzero"` } -// NodeFailure provides diagnostic details for Nodes that could not be successfully evaluated by the rule. -type NodeFailure struct { - // nodeName is the name of the failed Node. - // - // Following kubebuilder validation is referred from - // https://github.com/kubernetes/apimachinery/blob/84d740c9e27f3ccc94c8bc4d13f1b17f60f7080b/pkg/util/validation/validation.go#L198 - // - // +required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=253 - // +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` - NodeName string `json:"nodeName,omitempty"` - - // reason provides a brief explanation of the evaluation result. - // - // +optional - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=256 - Reason string `json:"reason,omitempty"` - - // message is a human-readable message indicating details about the evaluation. - // - // +optional - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=10240 - Message string `json:"message,omitempty"` - - // lastEvaluationTime is the timestamp of the last rule check failed for this Node. - // - // +required - LastEvaluationTime metav1.Time `json:"lastEvaluationTime,omitempty,omitzero"` -} - -// NodeEvaluation provides a detailed audit of a single Node's compliance with the rule. -type NodeEvaluation struct { - // nodeName is the name of the evaluated Node. - // - // +required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=253 - // +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` - NodeName string `json:"nodeName,omitempty"` - - // conditionResults provides a detailed breakdown of each condition evaluation - // for this Node. This allows for granular auditing of which specific - // criteria passed or failed during the rule assessment. - // - // +required - // +listType=map - // +listMapKey=type - // +kubebuilder:validation:MaxItems=5000 - ConditionResults []ConditionEvaluationResult `json:"conditionResults,omitempty"` - - // taintStatus represents the taint status on the Node, one of Present, Absent. - // - // +required - TaintStatus TaintStatus `json:"taintStatus,omitempty"` - - // lastEvaluationTime is the timestamp when the controller last assessed this Node. - // - // +required - LastEvaluationTime metav1.Time `json:"lastEvaluationTime,omitempty,omitzero"` -} - -// ConditionEvaluationResult provides a detailed report of the comparison between -// the Node's observed condition and the rule's requirement. -type ConditionEvaluationResult struct { - // type corresponds to the Node condition type being evaluated. - // - // +required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=316 - Type string `json:"type,omitempty"` - - // currentStatus is the actual status value observed on the Node, one of True, False, Unknown. - // - // +required - // +kubebuilder:validation:Enum=True;False;Unknown - CurrentStatus corev1.ConditionStatus `json:"currentStatus,omitempty"` - - // requiredStatus is the status value defined in the rule that must be matched, one of True, False, Unknown. - // - // +required - // +kubebuilder:validation:Enum=True;False;Unknown - RequiredStatus corev1.ConditionStatus `json:"requiredStatus,omitempty"` -} - // DryRunResults provides a summary of the actions the controller would perform if DryRun mode is enabled. // +kubebuilder:validation:MinProperties=1 type DryRunResults struct { diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index e8c4e61..4b16232 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -24,21 +24,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ConditionEvaluationResult) DeepCopyInto(out *ConditionEvaluationResult) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ConditionEvaluationResult. -func (in *ConditionEvaluationResult) DeepCopy() *ConditionEvaluationResult { - if in == nil { - return nil - } - out := new(ConditionEvaluationResult) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ConditionRequirement) DeepCopyInto(out *ConditionRequirement) { *out = *in @@ -89,43 +74,6 @@ func (in *DryRunResults) DeepCopy() *DryRunResults { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *NodeEvaluation) DeepCopyInto(out *NodeEvaluation) { - *out = *in - if in.ConditionResults != nil { - in, out := &in.ConditionResults, &out.ConditionResults - *out = make([]ConditionEvaluationResult, len(*in)) - copy(*out, *in) - } - in.LastEvaluationTime.DeepCopyInto(&out.LastEvaluationTime) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeEvaluation. -func (in *NodeEvaluation) DeepCopy() *NodeEvaluation { - if in == nil { - return nil - } - out := new(NodeEvaluation) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *NodeFailure) DeepCopyInto(out *NodeFailure) { - *out = *in - in.LastEvaluationTime.DeepCopyInto(&out.LastEvaluationTime) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeFailure. -func (in *NodeFailure) DeepCopy() *NodeFailure { - if in == nil { - return nil - } - out := new(NodeFailure) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeReadinessRule) DeepCopyInto(out *NodeReadinessRule) { *out = *in @@ -210,25 +158,7 @@ func (in *NodeReadinessRuleSpec) DeepCopy() *NodeReadinessRuleSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeReadinessRuleStatus) DeepCopyInto(out *NodeReadinessRuleStatus) { *out = *in - if in.AppliedNodes != nil { - in, out := &in.AppliedNodes, &out.AppliedNodes - *out = make([]string, len(*in)) - copy(*out, *in) - } - if in.FailedNodes != nil { - in, out := &in.FailedNodes, &out.FailedNodes - *out = make([]NodeFailure, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } - if in.NodeEvaluations != nil { - in, out := &in.NodeEvaluations, &out.NodeEvaluations - *out = make([]NodeEvaluation, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } + in.LastEvaluationTime.DeepCopyInto(&out.LastEvaluationTime) in.DryRunResults.DeepCopyInto(&out.DryRunResults) } diff --git a/config/crd/bases/readiness.node.x-k8s.io_nodereadinessrules.yaml b/config/crd/bases/readiness.node.x-k8s.io_nodereadinessrules.yaml index 0772190..b02311f 100644 --- a/config/crd/bases/readiness.node.x-k8s.io_nodereadinessrules.yaml +++ b/config/crd/bases/readiness.node.x-k8s.io_nodereadinessrules.yaml @@ -189,16 +189,6 @@ spec: description: status defines the observed state of NodeReadinessRule minProperties: 1 properties: - appliedNodes: - description: |- - appliedNodes lists the names of Nodes where the taint has been successfully managed. - This provides a quick reference to the scope of impact for this rule. - items: - maxLength: 253 - type: string - maxItems: 5000 - type: array - x-kubernetes-list-type: set dryRunResults: description: |- dryRunResults captures the outcome of the rule evaluation when DryRun is enabled. @@ -242,137 +232,19 @@ spec: required: - summary type: object - failedNodes: - description: |- - failedNodes lists the Nodes where the rule evaluation encountered an error. - This is used for troubleshooting configuration issues, such as invalid selectors during node lookup. - items: - description: NodeFailure provides diagnostic details for Nodes that - could not be successfully evaluated by the rule. - properties: - lastEvaluationTime: - description: lastEvaluationTime is the timestamp of the last - rule check failed for this Node. - format: date-time - type: string - message: - description: message is a human-readable message indicating - details about the evaluation. - maxLength: 10240 - minLength: 1 - type: string - nodeName: - description: |- - nodeName is the name of the failed Node. - - Following kubebuilder validation is referred from - https://github.com/kubernetes/apimachinery/blob/84d740c9e27f3ccc94c8bc4d13f1b17f60f7080b/pkg/util/validation/validation.go#L198 - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - reason: - description: reason provides a brief explanation of the evaluation - result. - maxLength: 256 - minLength: 1 - type: string - required: - - lastEvaluationTime - - nodeName - type: object - maxItems: 5000 - type: array - x-kubernetes-list-map-keys: - - nodeName - x-kubernetes-list-type: map - nodeEvaluations: - description: |- - nodeEvaluations provides detailed insight into the rule's assessment for individual Nodes. - This is primarily used for auditing and debugging why specific Nodes were or - were not targeted by the rule. - items: - description: NodeEvaluation provides a detailed audit of a single - Node's compliance with the rule. - properties: - conditionResults: - description: |- - conditionResults provides a detailed breakdown of each condition evaluation - for this Node. This allows for granular auditing of which specific - criteria passed or failed during the rule assessment. - items: - description: |- - ConditionEvaluationResult provides a detailed report of the comparison between - the Node's observed condition and the rule's requirement. - properties: - currentStatus: - description: currentStatus is the actual status value - observed on the Node, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - requiredStatus: - description: requiredStatus is the status value defined - in the rule that must be matched, one of True, False, - Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type corresponds to the Node condition type - being evaluated. - maxLength: 316 - minLength: 1 - type: string - required: - - currentStatus - - requiredStatus - - type - type: object - maxItems: 5000 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - lastEvaluationTime: - description: lastEvaluationTime is the timestamp when the controller - last assessed this Node. - format: date-time - type: string - nodeName: - description: nodeName is the name of the evaluated Node. - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - taintStatus: - description: taintStatus represents the taint status on the - Node, one of Present, Absent. - enum: - - Present - - Absent - type: string - required: - - conditionResults - - lastEvaluationTime - - nodeName - - taintStatus - type: object - maxItems: 5000 - type: array - x-kubernetes-list-map-keys: - - nodeName - x-kubernetes-list-type: map + lastEvaluationTime: + description: lastEvaluationTime is the timestamp when the rule was + evaluated against all the nodes in the cluster. + format: date-time + type: string observedGeneration: description: observedGeneration reflects the generation of the most recently observed NodeReadinessRule by the controller. format: int64 minimum: 1 type: integer + required: + - lastEvaluationTime type: object required: - spec diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 0d36c19..6358a13 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -4,6 +4,13 @@ kind: ClusterRole metadata: name: manager-role rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch - apiGroups: - "" resources: diff --git a/internal/controller/node_controller.go b/internal/controller/node_controller.go index 8d0e8fd..015d061 100644 --- a/internal/controller/node_controller.go +++ b/internal/controller/node_controller.go @@ -21,7 +21,6 @@ import ( "fmt" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" @@ -84,6 +83,7 @@ func (r *NodeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) // +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=core,resources=nodes/status,verbs=get;update;patch // +kubebuilder:rbac:groups=core,resources=nodes/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch // NodeReconciler handles node changes @@ -139,85 +139,18 @@ func (r *RuleReadinessController) processNodeAgainstAllRules(ctx context.Context continue } - log.Info("Evaluating rule for node", - "node", node.Name, - "rule", rule.Name, - "ruleResourceVersion", rule.ResourceVersion) - if err := r.evaluateRuleForNode(ctx, rule, node); err != nil { + // Log error and record event but continue with other rules. log.Error(err, "Failed to evaluate rule for node", "node", node.Name, "rule", rule.Name) - // Continue with other rules even if one fails - r.recordNodeFailure(rule, node.Name, "EvaluationError", err.Error()) + r.recorder.Eventf(node, corev1.EventTypeWarning, "RuleEvaluationError", "Failed to evaluate rule for node: %v", err) } // Persist the rule status - log.V(4).Info("Attempting to persist rule status", + log.V(4).Info("Successfully evaluated rule for node", "node", node.Name, "rule", rule.Name, "resourceVersion", rule.ResourceVersion) - - err := retry.RetryOnConflict(retry.DefaultRetry, func() error { - latestRule := &readinessv1alpha1.NodeReadinessRule{} - if err := r.Get(ctx, client.ObjectKey{Name: rule.Name}, latestRule); err != nil { - return err - } - - patch := client.MergeFrom(latestRule.DeepCopy()) - - // update only this specific node evaluation status - currEval := readinessv1alpha1.NodeEvaluation{} - for _, eval := range rule.Status.NodeEvaluations { - if eval.NodeName == node.Name { - currEval = eval - break - } - } - - found := false - for i := range latestRule.Status.NodeEvaluations { - if latestRule.Status.NodeEvaluations[i].NodeName == node.Name { - latestRule.Status.NodeEvaluations[i] = currEval - found = true - break - } - } - if !found { - latestRule.Status.NodeEvaluations = append( - latestRule.Status.NodeEvaluations, - currEval, - ) - } - - // handle status.FailedNodes for this node - var updatedFailedNodes []readinessv1alpha1.NodeFailure - for _, failure := range latestRule.Status.FailedNodes { - if failure.NodeName != node.Name { - updatedFailedNodes = append(updatedFailedNodes, failure) - } - } - for _, failure := range rule.Status.FailedNodes { - if failure.NodeName == node.Name { - updatedFailedNodes = append(updatedFailedNodes, failure) - } - } - latestRule.Status.FailedNodes = updatedFailedNodes - - return r.Status().Patch(ctx, latestRule, patch) - }) - - if err != nil { - log.Error(err, "Failed to update rule status after node evaluation", - "node", node.Name, - "rule", rule.Name, - "resourceVersion", rule.ResourceVersion) - // continue with other rules - } else { - log.V(4).Info("Successfully persisted rule status from node reconciler", - "node", node.Name, - "rule", rule.Name, - "newResourceVersion", rule.ResourceVersion) - } } } @@ -314,27 +247,3 @@ func (r *RuleReadinessController) markBootstrapCompleted(ctx context.Context, no metrics.BootstrapCompleted.WithLabelValues(ruleName).Inc() } } - -// recordNodeFailure records a failure for a specific node. -func (r *RuleReadinessController) recordNodeFailure( - rule *readinessv1alpha1.NodeReadinessRule, - nodeName, reason, message string, -) { - // Remove any existing failure for this node - var failedNodes []readinessv1alpha1.NodeFailure - for _, failure := range rule.Status.FailedNodes { - if failure.NodeName != nodeName { - failedNodes = append(failedNodes, failure) - } - } - - // Add new failure - failedNodes = append(failedNodes, readinessv1alpha1.NodeFailure{ - NodeName: nodeName, - Reason: reason, - Message: message, - LastEvaluationTime: metav1.Now(), - }) - - rule.Status.FailedNodes = failedNodes -} diff --git a/internal/controller/node_controller_test.go b/internal/controller/node_controller_test.go index b746a25..ef480cf 100644 --- a/internal/controller/node_controller_test.go +++ b/internal/controller/node_controller_test.go @@ -480,33 +480,6 @@ var _ = Describe("Node Controller", func() { Expect(hasTaint).To(BeFalse(), "Taint should not be added when rule is being deleted") }) - - It("should skip rule evaluation completely when DeletionTimestamp is set", func() { - By("Creating rule with DeletionTimestamp") - deletingRule := rule.DeepCopy() - now := metav1.Now() - deletingRule.DeletionTimestamp = &now - readinessController.updateRuleCache(ctx, deletingRule) - - By("Triggering reconciliation") - _, err := nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: namespacedName}) - Expect(err).NotTo(HaveOccurred()) - - By("Verifying rule was not evaluated") - // Check that no NodeEvaluation was added for this node - checkRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ruleName}, checkRule)).To(Succeed()) - - hasEval := false - for _, eval := range checkRule.Status.NodeEvaluations { - if eval.NodeName == nodeName { - hasEval = true - break - } - } - Expect(hasEval).To(BeFalse(), - "Rule with DeletionTimestamp should not create node evaluation") - }) }) // Test for status updates @@ -514,11 +487,9 @@ var _ = Describe("Node Controller", func() { var ( ctx context.Context readinessController *RuleReadinessController - nodeReconciler *NodeReconciler fakeClientset *fake.Clientset node *corev1.Node rule *nodereadinessiov1alpha1.NodeReadinessRule - namespacedName types.NamespacedName ) BeforeEach(func() { @@ -532,14 +503,6 @@ var _ = Describe("Node Controller", func() { ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), } - nodeReconciler = &NodeReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), - Controller: readinessController, - } - - namespacedName = types.NamespacedName{Name: "status-test-node"} - node = &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "status-test-node", @@ -603,86 +566,5 @@ var _ = Describe("Node Controller", func() { readinessController.removeRuleFromCache(ctx, "status-test-rule") }) - - It("should persist NodeEvaluation with expected structure to rule.status", func() { - By("Triggering NodeReconciler") - _, err := nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: namespacedName}) - Expect(err).NotTo(HaveOccurred()) - - By("Verifying NodeEvaluation is persisted in rule status") - Eventually(func() bool { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: "status-test-rule"}, updatedRule); err != nil { - return false - } - - for _, eval := range updatedRule.Status.NodeEvaluations { - if eval.NodeName == "status-test-node" { - return true - } - } - return false - }, time.Second*5).Should(BeTrue(), "NodeEvaluation should be persisted for the node") - - By("Verifying NodeEvaluation has all expected fields") - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "status-test-rule"}, updatedRule)).To(Succeed()) - - var nodeEval *nodereadinessiov1alpha1.NodeEvaluation - for i := range updatedRule.Status.NodeEvaluations { - if updatedRule.Status.NodeEvaluations[i].NodeName == "status-test-node" { - nodeEval = &updatedRule.Status.NodeEvaluations[i] - break - } - } - - Expect(nodeEval).NotTo(BeNil(), "NodeEvaluation should exist") - Expect(nodeEval.ConditionResults).To(HaveLen(1), "Should have evaluation for 1 condition") - Expect(nodeEval.ConditionResults[0].Type).To(Equal("StatusTestCondition")) - Expect(nodeEval.ConditionResults[0].CurrentStatus).To(Equal(corev1.ConditionFalse)) - Expect(nodeEval.ConditionResults[0].RequiredStatus).To(Equal(corev1.ConditionTrue)) - Expect(nodeEval.TaintStatus).To(Equal(nodereadinessiov1alpha1.TaintStatusPresent)) - Expect(nodeEval.LastEvaluationTime.IsZero()).To(BeFalse(), "LastEvaluationTime should be set") - }) - - It("should update existing NodeEvaluation when node is re-evaluated", func() { - By("First reconciliation - create initial evaluation") - _, err := nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: namespacedName}) - Expect(err).NotTo(HaveOccurred()) - - Eventually(func() int { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: "status-test-rule"}, updatedRule); err != nil { - return 0 - } - return len(updatedRule.Status.NodeEvaluations) - }, time.Second*5).Should(Equal(1)) - - By("Updating node condition to satisfy rule") - updatedNode := &corev1.Node{} - Expect(k8sClient.Get(ctx, namespacedName, updatedNode)).To(Succeed()) - updatedNode.Status.Conditions[0].Status = corev1.ConditionTrue - Expect(k8sClient.Status().Update(ctx, updatedNode)).To(Succeed()) - - By("Second reconciliation - update existing evaluation") - _, err = nodeReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: namespacedName}) - Expect(err).NotTo(HaveOccurred()) - - By("Verifying NodeEvaluation was updated") - Eventually(func() bool { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: "status-test-rule"}, updatedRule); err != nil { - return false - } - - if len(updatedRule.Status.NodeEvaluations) != 1 { - return false - } - - nodeEval := updatedRule.Status.NodeEvaluations[0] - return nodeEval.ConditionResults[0].CurrentStatus == corev1.ConditionTrue && - nodeEval.TaintStatus == nodereadinessiov1alpha1.TaintStatusAbsent - }, time.Second*5).Should(BeTrue(), "NodeEvaluation should be updated with new condition and taint status") - }) }) }) diff --git a/internal/controller/nodereadinessrule_controller.go b/internal/controller/nodereadinessrule_controller.go index 50b3255..0ab3d99 100644 --- a/internal/controller/nodereadinessrule_controller.go +++ b/internal/controller/nodereadinessrule_controller.go @@ -30,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -56,6 +57,8 @@ type RuleReadinessController struct { // Cache for efficient rule lookup ruleCacheMutex sync.RWMutex ruleCache map[string]*readinessv1alpha1.NodeReadinessRule // ruleName -> rule + + recorder record.EventRecorder } // RuleReconciler handles NodeReadinessRule reconciliation. @@ -72,6 +75,7 @@ func NewRuleReadinessController(mgr ctrl.Manager, clientset kubernetes.Interface Scheme: mgr.GetScheme(), clientset: clientset, ruleCache: make(map[string]*readinessv1alpha1.NodeReadinessRule), + recorder: mgr.GetEventRecorderFor("node-readiness-controller"), } } @@ -87,6 +91,7 @@ func (r *RuleReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) // +kubebuilder:rbac:groups=readiness.node.x-k8s.io,resources=nodereadinessrules,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=readiness.node.x-k8s.io,resources=nodereadinessrules/status,verbs=get;update;patch // +kubebuilder:rbac:groups=readiness.node.x-k8s.io,resources=nodereadinessrules/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch func (r *RuleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx) @@ -157,12 +162,6 @@ func (r *RuleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. return ctrl.Result{RequeueAfter: time.Minute}, err } - // Clean up status for deleted nodes - if err := r.Controller.cleanupDeletedNodes(ctx, rule); err != nil { - log.Error(err, "Failed to clean up deleted nodes", "rule", rule.Name) - return ctrl.Result{RequeueAfter: time.Minute}, err - } - return ctrl.Result{}, nil } @@ -196,62 +195,6 @@ func (r *RuleReconciler) reconcileDelete(ctx context.Context, rule *readinessv1a return ctrl.Result{}, nil } -// cleanupDeletedNodes removes status entries for nodes that no longer exist. -func (r *RuleReadinessController) cleanupDeletedNodes(ctx context.Context, rule *readinessv1alpha1.NodeReadinessRule) error { - log := ctrl.LoggerFrom(ctx) - - nodeList := &corev1.NodeList{} - if err := r.List(ctx, nodeList); err != nil { - return err - } - - existingNodes := make(map[string]bool) - for _, node := range nodeList.Items { - existingNodes[node.Name] = true - } - - // Filter out deleted nodes - var newNodeEvaluations []readinessv1alpha1.NodeEvaluation - for _, evaluation := range rule.Status.NodeEvaluations { - if existingNodes[evaluation.NodeName] { - newNodeEvaluations = append(newNodeEvaluations, evaluation) - } - } - - if len(newNodeEvaluations) == len(rule.Status.NodeEvaluations) { - log.V(4).Info("No deleted nodes to clean up", "rule", rule.Name) - return nil - } - - log.V(4).Info("Cleaning up deleted nodes from rule status", - "rule", rule.Name, - "before", len(rule.Status.NodeEvaluations), - "after", len(newNodeEvaluations)) - - // Use retry on conflict to update status to avoid race conditions from node updates - return retry.RetryOnConflict(retry.DefaultRetry, func() error { - fresh := &readinessv1alpha1.NodeReadinessRule{} - if err := r.Get(ctx, client.ObjectKey{Name: rule.Name}, fresh); err != nil { - return err - } - - var freshNodeEvaluations []readinessv1alpha1.NodeEvaluation - for _, evaluation := range fresh.Status.NodeEvaluations { - if existingNodes[evaluation.NodeName] { - freshNodeEvaluations = append(freshNodeEvaluations, evaluation) - } - } - - if len(freshNodeEvaluations) == len(fresh.Status.NodeEvaluations) { - return nil - } - - patch := client.MergeFrom(fresh.DeepCopy()) - fresh.Status.NodeEvaluations = freshNodeEvaluations - return r.Status().Patch(ctx, fresh, patch) - }) -} - // processAllNodesForRule processes all nodes when a rule changes. func (r *RuleReadinessController) processAllNodesForRule(ctx context.Context, rule *readinessv1alpha1.NodeReadinessRule) error { log := ctrl.LoggerFrom(ctx) @@ -269,17 +212,19 @@ func (r *RuleReadinessController) processAllNodesForRule(ctx context.Context, ru appliedNodes = append(appliedNodes, node.Name) log.Info("Processing node for rule", "rule", rule.Name, "node", node.Name) if err := r.evaluateRuleForNode(ctx, rule, &node); err != nil { - // Log error but continue with other nodes - log.Error(err, "Failed to evaluate node for rule", "rule", rule.Name, "node", node.Name) - r.recordNodeFailure(rule, node.Name, "EvaluationError", err.Error()) + // Log error and record event but continue with other nodes. + log.Error(err, "Failed to evaluate node for rule", + "rule", rule.Name, "node", node.Name) + r.recorder.Eventf(&node, corev1.EventTypeWarning, "NodeReadinessRuleEvaluationError", "Failed to evaluate rule for node: %v", err) metrics.Failures.WithLabelValues(rule.Name, "EvaluationError").Inc() } + r.recorder.Eventf(&node, corev1.EventTypeNormal, "NodeReadinessRuleApplied", "Rule applied to Node %s", node.Name) } } // Update status rule.Status.ObservedGeneration = rule.Generation - rule.Status.AppliedNodes = appliedNodes + rule.Status.LastEvaluationTime = metav1.NewTime(time.Now()) if !rule.Spec.DryRun { rule.Status.DryRunResults = readinessv1alpha1.DryRunResults{} @@ -297,7 +242,6 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule // Evaluate all conditions (ALL logic) allConditionsSatisfied := true - conditionResults := make([]readinessv1alpha1.ConditionEvaluationResult, 0, len(rule.Spec.Conditions)) for _, condReq := range rule.Spec.Conditions { currentStatus := r.getConditionStatus(node, condReq.Type) @@ -307,12 +251,6 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule allConditionsSatisfied = false } - conditionResults = append(conditionResults, readinessv1alpha1.ConditionEvaluationResult{ - Type: condReq.Type, - CurrentStatus: currentStatus, - RequiredStatus: condReq.RequiredStatus, - }) - log.V(1).Info("Condition evaluation", "node", node.Name, "rule", rule.Name, "conditionType", condReq.Type, "current", currentStatus, "required", condReq.RequiredStatus, "satisfied", satisfied) @@ -356,49 +294,9 @@ func (r *RuleReadinessController) evaluateRuleForNode(ctx context.Context, rule "shouldRemove", shouldRemoveTaint, "hasTaint", currentlyHasTaint) } - // Determine observed taint status after any actions - var taintStatus readinessv1alpha1.TaintStatus - if r.hasTaintBySpec(node, rule.Spec.Taint) { - taintStatus = readinessv1alpha1.TaintStatusPresent - } else { - taintStatus = readinessv1alpha1.TaintStatusAbsent - } - - // Update evaluation status - r.updateNodeEvaluationStatus(rule, node.Name, conditionResults, taintStatus) - return nil } -// updateNodeEvaluationStatus updates the evaluation status for a specific node. -func (r *RuleReadinessController) updateNodeEvaluationStatus( - rule *readinessv1alpha1.NodeReadinessRule, - nodeName string, - conditionResults []readinessv1alpha1.ConditionEvaluationResult, - taintStatus readinessv1alpha1.TaintStatus, -) { - // Find existing evaluation or create new - var nodeEval *readinessv1alpha1.NodeEvaluation - for i := range rule.Status.NodeEvaluations { - if rule.Status.NodeEvaluations[i].NodeName == nodeName { - nodeEval = &rule.Status.NodeEvaluations[i] - break - } - } - - if nodeEval == nil { - rule.Status.NodeEvaluations = append(rule.Status.NodeEvaluations, readinessv1alpha1.NodeEvaluation{ - NodeName: nodeName, - }) - nodeEval = &rule.Status.NodeEvaluations[len(rule.Status.NodeEvaluations)-1] - } - - // Update evaluation - nodeEval.ConditionResults = conditionResults - nodeEval.TaintStatus = taintStatus - nodeEval.LastEvaluationTime = metav1.Now() -} - // getApplicableRulesForNode returns all rules applicable to a node. func (r *RuleReadinessController) getApplicableRulesForNode(ctx context.Context, node *corev1.Node) []*readinessv1alpha1.NodeReadinessRule { r.ruleCacheMutex.RLock() @@ -471,9 +369,7 @@ func (r *RuleReadinessController) updateRuleStatus(ctx context.Context, rule *re log := ctrl.LoggerFrom(ctx) log.V(1).Info("Updating rule status", - "rule", rule.Name, - "nodeEvaluations", len(rule.Status.NodeEvaluations), - "appliedNodes", len(rule.Status.AppliedNodes)) + "rule", rule.Name) return retry.RetryOnConflict(retry.DefaultRetry, func() error { latestRule := &readinessv1alpha1.NodeReadinessRule{} @@ -483,9 +379,6 @@ func (r *RuleReadinessController) updateRuleStatus(ctx context.Context, rule *re patch := client.MergeFrom(latestRule.DeepCopy()) - latestRule.Status.NodeEvaluations = rule.Status.NodeEvaluations - latestRule.Status.AppliedNodes = rule.Status.AppliedNodes - latestRule.Status.FailedNodes = rule.Status.FailedNodes latestRule.Status.ObservedGeneration = rule.Status.ObservedGeneration latestRule.Status.DryRunResults = rule.Status.DryRunResults diff --git a/internal/controller/nodereadinessrule_controller_test.go b/internal/controller/nodereadinessrule_controller_test.go index bd8a0b5..e9825a2 100644 --- a/internal/controller/nodereadinessrule_controller_test.go +++ b/internal/controller/nodereadinessrule_controller_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -46,6 +47,7 @@ var _ = Describe("NodeReadinessRule Controller", func() { nodeReconciler *NodeReconciler scheme *runtime.Scheme fakeClientset *fake.Clientset + recorder *record.FakeRecorder ) BeforeEach(func() { @@ -55,11 +57,13 @@ var _ = Describe("NodeReadinessRule Controller", func() { Expect(corev1.AddToScheme(scheme)).To(Succeed()) fakeClientset = fake.NewSimpleClientset() + recorder = record.NewFakeRecorder(32) readinessController = &RuleReadinessController{ Client: k8sClient, Scheme: scheme, clientset: fakeClientset, ruleCache: make(map[string]*nodereadinessiov1alpha1.NodeReadinessRule), + recorder: recorder, } ruleReconciler = &RuleReconciler{ @@ -273,15 +277,7 @@ var _ = Describe("NodeReadinessRule Controller", func() { return false }, time.Second*5).Should(BeTrue()) - // Verify rule status includes the node - Eventually(func() []string { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - err := k8sClient.Get(ctx, types.NamespacedName{Name: "immediate-test-rule"}, updatedRule) - if err != nil { - return nil - } - return updatedRule.Status.AppliedNodes - }, time.Second*5).Should(ContainElement("immediate-test-node")) + Expect(recorder.Events).To(Receive(ContainSubstring("immediate-test-node"))) }) It("should handle dry run mode", func() { @@ -570,12 +566,8 @@ var _ = Describe("NodeReadinessRule Controller", func() { return false }, time.Second*5).Should(BeTrue()) - // Verify the status of the rule - Eventually(func() []string { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - _ = k8sClient.Get(ctx, types.NamespacedName{Name: "db-rule"}, updatedRule) - return updatedRule.Status.AppliedNodes - }, time.Second*5).Should(ContainElement("node1")) + // Verify the rule applied to the node. + Expect(recorder.Events).To(Receive(ContainSubstring("node1"))) }) }) @@ -624,16 +616,6 @@ var _ = Describe("NodeReadinessRule Controller", func() { }) Expect(err).NotTo(HaveOccurred()) - // Verify that the rule's status is updated to include the new node - Eventually(func() []string { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - err := k8sClient.Get(ctx, types.NamespacedName{Name: "new-node-rule"}, updatedRule) - if err != nil { - return nil - } - return updatedRule.Status.AppliedNodes - }, time.Second*5, time.Millisecond*250).Should(ContainElement("new-node")) - // Verify that the new node gets tainted Eventually(func() bool { updatedNode := &corev1.Node{} @@ -785,49 +767,6 @@ var _ = Describe("NodeReadinessRule Controller", func() { // node1 is already deleted in the test _ = k8sClient.Delete(ctx, node2) }) - - It("should remove the node from the rule's status", func() { - // Initial reconcile to populate status - _, err := ruleReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: "delete-node-rule"}}) - Expect(err).NotTo(HaveOccurred()) - - Eventually(func() int { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - _ = k8sClient.Get(ctx, types.NamespacedName{Name: "delete-node-rule"}, updatedRule) - return len(updatedRule.Status.NodeEvaluations) - }, time.Second*5).Should(Equal(2)) - - // Delete node1 - Expect(k8sClient.Delete(ctx, node1)).To(Succeed()) - - // Reconcile again to trigger cleanup - _, err = ruleReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: "delete-node-rule"}}) - Expect(err).NotTo(HaveOccurred()) - - // Verify node1 is removed from status - Eventually(func() bool { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - _ = k8sClient.Get(ctx, types.NamespacedName{Name: "delete-node-rule"}, updatedRule) - for _, eval := range updatedRule.Status.NodeEvaluations { - if eval.NodeName == "node1" { - return false - } - } - return true - }, time.Second*5).Should(BeTrue()) - - // Verify node2 is still in status - Eventually(func() bool { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - _ = k8sClient.Get(ctx, types.NamespacedName{Name: "delete-node-rule"}, updatedRule) - for _, eval := range updatedRule.Status.NodeEvaluations { - if eval.NodeName == "node2" { - return true - } - } - return false - }, time.Second*5).Should(BeTrue()) - }) }) Context("when a rule's nodeSelector changes", func() { @@ -1188,54 +1127,5 @@ var _ = Describe("NodeReadinessRule Controller", func() { return apierrors.IsNotFound(err) }, time.Second*10).Should(BeTrue()) }) - - It("should list only nodes matching the selector in AppliedNodes", func() { - By("Running reconciliation") - _, err := ruleReconciler.Reconcile(ctx, reconcile.Request{ - NamespacedName: types.NamespacedName{Name: "applied-nodes-rule"}, - }) - Expect(err).NotTo(HaveOccurred()) - - By("Verifying AppliedNodes contains only matching nodes") - Eventually(func() []string { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - _ = k8sClient.Get(ctx, types.NamespacedName{Name: "applied-nodes-rule"}, updatedRule) - return updatedRule.Status.AppliedNodes - }, time.Second*5).Should(And( - ContainElement("applied-node-1"), - ContainElement("applied-node-2"), - Not(ContainElement("applied-node-3")), - ), "AppliedNodes should only contain nodes matching selector") - }) - - It("should have matching NodeEvaluations for all AppliedNodes", func() { - By("Running reconciliation") - _, err := ruleReconciler.Reconcile(ctx, reconcile.Request{ - NamespacedName: types.NamespacedName{Name: "applied-nodes-rule"}, - }) - Expect(err).NotTo(HaveOccurred()) - - By("Verifying NodeEvaluations exist for all AppliedNodes") - Eventually(func() bool { - updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: "applied-nodes-rule"}, updatedRule); err != nil { - return false - } - - for _, appliedNode := range updatedRule.Status.AppliedNodes { - found := false - for _, eval := range updatedRule.Status.NodeEvaluations { - if eval.NodeName == appliedNode { - found = true - break - } - } - if !found { - return false - } - } - return len(updatedRule.Status.AppliedNodes) > 0 - }, time.Second*5).Should(BeTrue(), "All AppliedNodes should have corresponding NodeEvaluations") - }) }) })