From cf28cc4453d88cecf8b5ffc3835be4bc5eaeb740 Mon Sep 17 00:00:00 2001 From: Kyungho Kang Date: Wed, 24 Sep 2025 17:29:34 +0900 Subject: [PATCH 1/4] feat: Add node-not-ready event monitoring support - Add 'node-not-ready' event type to EventConfiguration enum - Implement mapNodeEventType function in event watcher - Update event type filtering to handle Node events separately from Pod events - Update CRD schemas to include node-not-ready event type - Update documentation and examples with node monitoring capabilities - Generate updated deepcopy code for API changes This enables khook to monitor Kubernetes node readiness events and trigger appropriate agent responses for node-level issues like kubelet failures, network problems, or resource pressure. Signed-off-by: Kyungho Kang --- .gitignore | 2 +- README.md | 18 ++++++++- api/v1alpha2/hook_types.go | 24 ++++++------ api/v1alpha2/zz_generated.deepcopy.go | 43 ++++++++++++++++++++++ config/crd/bases/kagent.dev_hooks.yaml | 1 + helm/khook-crds/crds/kagent.dev_hooks.yaml | 1 + internal/event/watcher.go | 32 +++++++++++++--- 7 files changed, 103 insertions(+), 18 deletions(-) create mode 100644 api/v1alpha2/zz_generated.deepcopy.go diff --git a/.gitignore b/.gitignore index 3bca95b..6ebe080 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,4 @@ helm/*/requirements.lock # Generated files *.pb.go -zz_generated.*.go \ No newline at end of file +zz_generated.*.gobuild-multiarch.sh diff --git a/README.md b/README.md index 6e8ff6b..69a5c32 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ The KAgent Hook Controller monitors Kubernetes events and triggers Kagent agents ### Key Features -- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed) in a single hook configuration +- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready) in a single hook configuration - **Basic Deduplication**: Prevents duplicate notifications with 10-minute timeout logic - **Kagent Integration**: Integrates with the Kagent platform for AI agent incident response. (Can in theory talk to any a2a-enabled agent) - **Status Tracking**: Provides real-time status updates and audit trails through Kubernetes events @@ -137,6 +137,7 @@ The controller supports monitoring the following Kubernetes event types: | `pod-pending` | Pod is stuck in pending state | Resource constraints, scheduling issues, image pull failures | | `oom-kill` | Pod was killed due to out-of-memory | Memory limits exceeded, memory leaks | | `probe-failed` | Liveness or readiness probe failed | Application not responding, configuration issues | +| `node-not-ready` | Node is not in Ready state | Kubelet issues, network problems, resource pressure, hardware failures | ## Future The controller will support reacting to additional Kubernetes event. @@ -259,6 +260,21 @@ spec: After analysis - use all available tools to try and resolve. Annotate the updated resources with "kagentFix: " - If a resource can't be patched - delete it and recreate as needed. Don't ask for permission. Assume autonomous execution. Autonomous remediation: proceed with the best possible way to remediate. Don't ask for approval. + - eventType: node-not-ready + agentId: node-recovery-specialist + prompt: | + CRITICAL: Node {{.ResourceName}} is not ready at {{.EventTime}}. + This is a cluster-level emergency requiring immediate attention. + + AUTONOMOUS MODE: Diagnose and resolve node issues immediately: + • Check node conditions (Ready, MemoryPressure, DiskPressure, PIDPressure) + • Analyze kubelet logs and system resources + • Verify network connectivity and DNS resolution + • Check for hardware or kernel issues + • Attempt node recovery procedures (restart kubelet, clear disk space, etc.) + • If node cannot be recovered, safely drain and replace + • Never ask for permission - assume autonomous execution + • Annotate fixed resources with: kagentFix= ``` ### Development Environment Monitoring diff --git a/api/v1alpha2/hook_types.go b/api/v1alpha2/hook_types.go index 8c64713..904356f 100644 --- a/api/v1alpha2/hook_types.go +++ b/api/v1alpha2/hook_types.go @@ -25,7 +25,7 @@ type HookSpec struct { // EventConfiguration defines a single event type configuration type EventConfiguration struct { // EventType specifies the type of Kubernetes event to monitor - // +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed + // +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed;node-not-ready // +kubebuilder:validation:Required EventType string `json:"eventType"` @@ -84,14 +84,15 @@ func (h *Hook) Validate() error { func (h *Hook) validateEventConfiguration(config EventConfiguration, index int) error { // Validate EventType validEventTypes := map[string]bool{ - "pod-restart": true, - "pod-pending": true, - "oom-kill": true, - "probe-failed": true, + "pod-restart": true, + "pod-pending": true, + "oom-kill": true, + "probe-failed": true, + "node-not-ready": true, } if !validEventTypes[config.EventType] { - return fmt.Errorf("event configuration %d: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed", index, config.EventType) + return fmt.Errorf("event configuration %d: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready", index, config.EventType) } // Validate AgentRef @@ -390,7 +391,7 @@ func validateHook(hook *Hook) (admission.Warnings, error) { // Validate event type if !isValidEventType(config.EventType) { - allErrs = append(allErrs, fmt.Sprintf("spec.eventConfigurations[%d].eventType: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed", i, config.EventType)) + allErrs = append(allErrs, fmt.Sprintf("spec.eventConfigurations[%d].eventType: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready", i, config.EventType)) } // Validate agentId is not empty @@ -419,10 +420,11 @@ func validateHook(hook *Hook) (admission.Warnings, error) { // isValidEventType checks if the provided event type is valid func isValidEventType(eventType string) bool { validTypes := map[string]bool{ - "pod-restart": true, - "pod-pending": true, - "oom-kill": true, - "probe-failed": true, + "pod-restart": true, + "pod-pending": true, + "oom-kill": true, + "probe-failed": true, + "node-not-ready": true, } return validTypes[eventType] } diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go new file mode 100644 index 0000000..23c300f --- /dev/null +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -0,0 +1,43 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha2 + +import () + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ObjectReference) DeepCopyInto(out *ObjectReference) { + *out = *in + if in.Namespace != nil { + in, out := &in.Namespace, &out.Namespace + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ObjectReference. +func (in *ObjectReference) DeepCopy() *ObjectReference { + if in == nil { + return nil + } + out := new(ObjectReference) + in.DeepCopyInto(out) + return out +} diff --git a/config/crd/bases/kagent.dev_hooks.yaml b/config/crd/bases/kagent.dev_hooks.yaml index de09607..0230cd0 100644 --- a/config/crd/bases/kagent.dev_hooks.yaml +++ b/config/crd/bases/kagent.dev_hooks.yaml @@ -71,6 +71,7 @@ spec: - pod-pending - oom-kill - probe-failed + - node-not-ready type: string prompt: description: Prompt specifies the prompt template to send to diff --git a/helm/khook-crds/crds/kagent.dev_hooks.yaml b/helm/khook-crds/crds/kagent.dev_hooks.yaml index de09607..0230cd0 100644 --- a/helm/khook-crds/crds/kagent.dev_hooks.yaml +++ b/helm/khook-crds/crds/kagent.dev_hooks.yaml @@ -71,6 +71,7 @@ spec: - pod-pending - oom-kill - probe-failed + - node-not-ready type: string prompt: description: Prompt specifies the prompt template to send to diff --git a/internal/event/watcher.go b/internal/event/watcher.go index e5b4807..457bd14 100644 --- a/internal/event/watcher.go +++ b/internal/event/watcher.go @@ -251,14 +251,18 @@ func (w *Watcher) mapKubernetesEvent(k8sEvent *eventsv1.Event) *interfaces.Event // mapEventType maps Kubernetes event reasons to our event types func (w *Watcher) mapEventType(k8sEvent *eventsv1.Event) string { - // Ignore Normal events entirely; only act on warnings/errors - if strings.ToLower(k8sEvent.Type) == "normal" { - return "" - } - // Map based on the regarding object kind and event reason + // Map based on the regarding object kind and event reason first switch k8sEvent.Regarding.Kind { case "Pod": + // For pods, ignore Normal events entirely; only act on warnings/errors + if strings.ToLower(k8sEvent.Type) == "normal" { + return "" + } return w.mapPodEventType(k8sEvent) + case "Node": + // For nodes, we need to check both Normal and Warning events + // as NodeNotReady events are typically Normal type + return w.mapNodeEventType(k8sEvent) default: return "" } @@ -317,3 +321,21 @@ func (w *Watcher) mapPodEventType(k8sEvent *eventsv1.Event) string { return "" } + +// mapNodeEventType maps node-related events to our event types +func (w *Watcher) mapNodeEventType(k8sEvent *eventsv1.Event) string { + reason := strings.ToLower(k8sEvent.Reason) + message := strings.ToLower(k8sEvent.Note) + eventType := strings.ToLower(k8sEvent.Type) + + switch { + // Node not ready events + case reason == "nodenotready": + return "node-not-ready" + + default: + // Log unknown node events for future enhancement + w.logger.V(1).Info("Unknown node event", "reason", reason, "type", eventType, "message", message) + return "" + } +} From ba5d960cb3608eaf7036793dfd87f92e67f32bd6 Mon Sep 17 00:00:00 2001 From: kyungho-for-ops Date: Thu, 16 Oct 2025 03:58:14 +0900 Subject: [PATCH 2/4] Update .gitignore Signed-off-by: kyungho-for-ops Signed-off-by: Kyungho Kang --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6ebe080..b608159 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,4 @@ helm/*/requirements.lock # Generated files *.pb.go -zz_generated.*.gobuild-multiarch.sh +zz_generated.*.gobuild From 77cbe2ab0b1f50be9fe1050aed41c46ebbc366bf Mon Sep 17 00:00:00 2001 From: kyungho-for-ops Date: Thu, 16 Oct 2025 03:58:36 +0900 Subject: [PATCH 3/4] Update .gitignore Signed-off-by: kyungho-for-ops Signed-off-by: Kyungho Kang --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b608159..6ef23aa 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,4 @@ helm/*/requirements.lock # Generated files *.pb.go -zz_generated.*.gobuild +zz_generated.*.go From b4351846727acaea5ba819e49108465e3e0ecd86 Mon Sep 17 00:00:00 2001 From: Kyungho Kang Date: Wed, 15 Oct 2025 21:08:17 +0200 Subject: [PATCH 4/4] Trigger final checks update Signed-off-by: Kyungho Kang