diff --git a/examples/npd-prober/README.md b/examples/npd-prober/README.md new file mode 100644 index 0000000..14d8130 --- /dev/null +++ b/examples/npd-prober/README.md @@ -0,0 +1,77 @@ +# NPD Prober — Custom Plugin for Node Problem Detector + +A lightweight Go binary that acts as a [node-problem-detector (NPD)](https://github.com/kubernetes/node-problem-detector) custom plugin. It performs HTTP or TCP probes using kubelet-style semantics and returns NPD-compatible exit codes. + +## How It Works + +``` +NPD executes npd-prober binary + │ + ▼ + Probe target (HTTP GET or TCP connect) + │ + ▼ + Exit code: 0=OK, 1=NonOK, 2=Unknown + │ + ▼ + NPD sets NodeCondition (e.g. ServiceReadiness=True/False) + │ + ▼ + Node Readiness Controller watches condition + │ + ▼ + NRC manages taint (e.g. readiness.k8s.io/ServiceReady) +``` + +## CLI Flags + +| Flag | Description | Default | +|------|-------------|---------| +| `--probe-type` | Probe type: `http` or `tcp` | (required) | +| `--http-url` | URL for HTTP probe | (required for `http`) | +| `--tcp-addr` | Address (`host:port`) for TCP probe | (required for `tcp`) | +| `--timeout` | Probe timeout | `5s` | + +## Exit Codes + +| Code | Meaning | NPD Interpretation | +|------|---------|-------------------| +| 0 | OK / Healthy | Condition transitions to healthy state | +| 1 | NonOK / Unhealthy | Condition transitions to unhealthy state | +| 2 | Unknown | Configuration error, condition unchanged | + +## Build + +```bash +go build -o npd-prober ./examples/npd-prober/ +``` + +## Usage + +HTTP probe: +```bash +./npd-prober --probe-type=http --http-url=http://localhost:8080/healthz +``` + +TCP probe: +```bash +./npd-prober --probe-type=tcp --tcp-addr=localhost:5432 --timeout=3s +``` + +## NPD Configuration + +See [`npd-config.json`](npd-config.json) for an example NPD custom plugin monitor configuration. Place it in your NPD config directory and ensure the prober binary is accessible at the configured path. 
+ +## Node Readiness Controller Integration + +See [`node-readiness-rule.yaml`](node-readiness-rule.yaml) for an example `NodeReadinessRule` that watches the condition NPD sets and manages a taint accordingly: + +```bash +kubectl apply -f examples/npd-prober/node-readiness-rule.yaml +``` + +This creates a rule that: +1. Watches nodes for the `ServiceReadiness` condition (set by NPD via the prober) +2. Manages the `readiness.k8s.io/ServiceReady=pending:NoSchedule` taint +3. Removes the taint when the condition becomes `True` +4. Re-adds the taint when the condition becomes `False` (continuous enforcement) diff --git a/examples/npd-prober/kind-cluster.yaml b/examples/npd-prober/kind-cluster.yaml new file mode 100644 index 0000000..9c26f68 --- /dev/null +++ b/examples/npd-prober/kind-cluster.yaml @@ -0,0 +1,6 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + - role: worker + diff --git a/examples/npd-prober/main.go b/examples/npd-prober/main.go new file mode 100644 index 0000000..e849d6a --- /dev/null +++ b/examples/npd-prober/main.go @@ -0,0 +1,144 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// npd-prober is a lightweight binary that performs HTTP or TCP probes +// and returns NPD-compatible exit codes (0 for success, 1 for failure, +// and 2 for unknown). 
It is designed to be used as a custom plugin for +// node-problem-detector (NPD), allowing operators to reuse kubelet-style +// probe semantics for node-level readiness checks. + +// Exit codes follow NPD convention: + +// +// 0 = OK (healthy) +// 1 = NonOK (unhealthy) +// 2 = Unknown (configuration error) + +package main + +import ( + "errors" + "flag" + "fmt" + "net" + "net/http" + "os" + "time" + + "k8s.io/klog/v2" +) + +// NPD custom plugin exit codes. +const ( + exitOK = 0 + exitNonOK = 1 + exitUnknown = 2 +) + +func main() { + probeType := flag.String("probe-type", "", "Probe type: http or tcp") + httpURL := flag.String("http-url", "", "URL for HTTP probe (required when probe-type=http)") + tcpAddr := flag.String("tcp-addr", "", "Address (host:port) for TCP probe (required when probe-type=tcp)") + timeout := flag.Duration("timeout", 5*time.Second, "Probe timeout") + allowNonLocalRedirects := flag.Bool("allow-non-local-redirects", false, + "Allow HTTP redirects to non-local hosts (default false, matching kubelet behavior)") + + klog.InitFlags(nil) + flag.Parse() + + code, msg := run(*probeType, *httpURL, *tcpAddr, *timeout, *allowNonLocalRedirects) + // Print to stdout for NPD capture (NPD reads stdout, not stderr where klog writes). + fmt.Println(msg) + if code == exitOK { + klog.InfoS("Probe completed", "result", msg, "exitCode", code) + } else { + klog.ErrorS(nil, "Probe completed", "result", msg, "exitCode", code) + } + os.Exit(code) +} + +// run executes the probe and returns an exit code and message. +func run(probeType, httpURL, tcpAddr string, timeout time.Duration, allowNonLocalRedirects bool) (int, string) { + switch probeType { + case "http": + return probeHTTP(httpURL, timeout, allowNonLocalRedirects) + case "tcp": + return probeTCP(tcpAddr, timeout) + default: + return exitUnknown, "unknown or missing --probe-type (must be http or tcp)" + } +} + +// redirectChecker returns a CheckRedirect function for http.Client. 
+// When allowNonLocal is false, redirects to a different host than the +// original request are blocked by returning http.ErrUseLastResponse, +// matching kubelet's default HTTP probe behavior. +func redirectChecker(allowNonLocal bool) func(*http.Request, []*http.Request) error { + return func(req *http.Request, via []*http.Request) error { + if len(via) >= 10 { + return errors.New("stopped after 10 redirects") + } + if !allowNonLocal && len(via) > 0 { + if req.URL.Hostname() != via[0].URL.Hostname() { + klog.InfoS("Blocked non-local redirect", + "from", via[0].URL.String(), + "to", req.URL.String()) + return http.ErrUseLastResponse + } + } + return nil + } +} + +// probeHTTP performs an HTTP GET and checks the response status code. +// Status 200-399 is healthy (matching kubelet HTTP probe semantics). +func probeHTTP(url string, timeout time.Duration, allowNonLocalRedirects bool) (int, string) { + if url == "" { + return exitUnknown, "missing --http-url for http probe" + } + + client := &http.Client{ + Timeout: timeout, + CheckRedirect: redirectChecker(allowNonLocalRedirects), + } + klog.InfoS("Starting HTTP probe", "url", url) + + resp, err := client.Get(url) //nolint:noctx // simple probe binary, context not needed + if err != nil { + return exitNonOK, fmt.Sprintf("http probe failed: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode >= http.StatusOK && resp.StatusCode < http.StatusBadRequest { + return exitOK, fmt.Sprintf("http probe healthy: status %d", resp.StatusCode) + } + return exitNonOK, fmt.Sprintf("http probe unhealthy: status %d", resp.StatusCode) +} + +// probeTCP attempts a TCP connection. Success means healthy. 
+func probeTCP(addr string, timeout time.Duration) (int, string) { + if addr == "" { + return exitUnknown, "missing --tcp-addr for tcp probe" + } + + klog.InfoS("Starting TCP probe", "addr", addr) + + conn, err := net.DialTimeout("tcp", addr, timeout) + if err != nil { + return exitNonOK, fmt.Sprintf("tcp probe failed: %v", err) + } + conn.Close() + return exitOK, fmt.Sprintf("tcp probe healthy: connected to %s", addr) +} diff --git a/examples/npd-prober/main_test.go b/examples/npd-prober/main_test.go new file mode 100644 index 0000000..e31a8d8 --- /dev/null +++ b/examples/npd-prober/main_test.go @@ -0,0 +1,275 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "fmt" + "net" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestProbeHTTP(t *testing.T) { + tests := []struct { + name string + handler http.HandlerFunc + url string // override URL (empty = use test server) + wantCode int + wantContains string + }{ + { + name: "200 OK", + handler: func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) }, + wantCode: exitOK, + wantContains: "healthy", + }, + { + name: "301 redirect (still healthy)", + handler: func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusMovedPermanently) }, + wantCode: exitOK, + wantContains: "healthy", + }, + { + name: "404 not found", + handler: func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusNotFound) }, + wantCode: exitNonOK, + wantContains: "unhealthy", + }, + { + name: "500 internal server error", + handler: func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusInternalServerError) }, + wantCode: exitNonOK, + wantContains: "unhealthy", + }, + { + name: "unreachable server", + url: "http://127.0.0.1:1", // port 1 is unlikely to be listening + wantCode: exitNonOK, + wantContains: "failed", + }, + { + name: "missing URL", + url: "", // explicitly empty + wantCode: exitUnknown, + wantContains: "missing --http-url", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + url := tt.url + if tt.handler != nil && url == "" { + // Disable redirect following so we can test 3xx codes directly. 
+ ts := httptest.NewServer(tt.handler) + defer ts.Close() + url = ts.URL + } + + code, msg := probeHTTP(url, 2*time.Second, false) + if code != tt.wantCode { + t.Errorf("exit code = %d, want %d (msg: %s)", code, tt.wantCode, msg) + } + if !strings.Contains(msg, tt.wantContains) { + t.Errorf("message %q does not contain %q", msg, tt.wantContains) + } + }) + } +} + +func TestProbeTCP(t *testing.T) { + tests := []struct { + name string + setupServer bool // if true, start a TCP listener + addr string + wantCode int + wantContains string + }{ + { + name: "successful connection", + setupServer: true, + wantCode: exitOK, + wantContains: "healthy", + }, + { + name: "connection refused", + addr: "127.0.0.1:1", // port 1 is unlikely to be listening + wantCode: exitNonOK, + wantContains: "failed", + }, + { + name: "missing address", + addr: "", + wantCode: exitUnknown, + wantContains: "missing --tcp-addr", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + addr := tt.addr + if tt.setupServer { + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("failed to start TCP listener: %v", err) + } + defer ln.Close() + addr = ln.Addr().String() + } + + code, msg := probeTCP(addr, 2*time.Second) + if code != tt.wantCode { + t.Errorf("exit code = %d, want %d (msg: %s)", code, tt.wantCode, msg) + } + if !strings.Contains(msg, tt.wantContains) { + t.Errorf("message %q does not contain %q", msg, tt.wantContains) + } + }) + } +} + +func TestRun(t *testing.T) { + // Start an HTTP server for the http probe test. + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer ts.Close() + + // Start a TCP listener for the tcp probe test. 
+ ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("failed to start TCP listener: %v", err) + } + defer ln.Close() + + tests := []struct { + name string + probeType string + httpURL string + tcpAddr string + wantCode int + wantContains string + }{ + { + name: "http probe via run", + probeType: "http", + httpURL: ts.URL, + wantCode: exitOK, + wantContains: "healthy", + }, + { + name: "tcp probe via run", + probeType: "tcp", + tcpAddr: ln.Addr().String(), + wantCode: exitOK, + wantContains: "healthy", + }, + { + name: "invalid probe type", + probeType: "grpc", + wantCode: exitUnknown, + wantContains: "unknown or missing --probe-type", + }, + { + name: "empty probe type", + probeType: "", + wantCode: exitUnknown, + wantContains: "unknown or missing --probe-type", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + code, msg := run(tt.probeType, tt.httpURL, tt.tcpAddr, 2*time.Second, false) + if code != tt.wantCode { + t.Errorf("exit code = %d, want %d (msg: %s)", code, tt.wantCode, msg) + } + if !strings.Contains(msg, tt.wantContains) { + t.Errorf("message %q does not contain %q", msg, tt.wantContains) + } + }) + } +} + +func TestHTTPProbeRedirect(t *testing.T) { + tests := []struct { + name string + allowNonLocalRedirects bool + handler http.HandlerFunc + wantCode int + wantContains string + }{ + { + name: "redirect to same host (local) is followed", + allowNonLocalRedirects: false, + handler: func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/target" { + w.WriteHeader(http.StatusOK) + return + } + // Redirect to the same host (local redirect). + http.Redirect(w, r, "/target", http.StatusFound) + }, + wantCode: exitOK, + wantContains: "healthy: status 200", + }, + { + name: "redirect to non-local host is blocked by default", + allowNonLocalRedirects: false, + handler: func(w http.ResponseWriter, _ *http.Request) { + // Redirect to a different host — should be blocked. 
+ w.Header().Set("Location", "http://198.51.100.1/other") + w.WriteHeader(http.StatusMovedPermanently) + }, + // The 301 response is used as-is; 301 is in [200,400) → healthy. + wantCode: exitOK, + wantContains: "healthy: status 301", + }, + { + name: "redirect to non-local host allowed with flag", + allowNonLocalRedirects: true, + handler: func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/target" { + w.WriteHeader(http.StatusOK) + return + } + // Redirect to the same server but via an absolute URL. + // With allowNonLocalRedirects=true, the client follows it. + http.Redirect(w, r, fmt.Sprintf("http://%s/target", r.Host), http.StatusFound) + }, + wantCode: exitOK, + wantContains: "healthy: status 200", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ts := httptest.NewServer(tt.handler) + defer ts.Close() + + code, msg := probeHTTP(ts.URL, 2*time.Second, tt.allowNonLocalRedirects) + if code != tt.wantCode { + t.Errorf("exit code = %d, want %d (msg: %s)", code, tt.wantCode, msg) + } + if !strings.Contains(msg, tt.wantContains) { + t.Errorf("message %q does not contain %q", msg, tt.wantContains) + } + }) + } +} diff --git a/examples/npd-prober/node-readiness-rule.yaml b/examples/npd-prober/node-readiness-rule.yaml new file mode 100644 index 0000000..852abdb --- /dev/null +++ b/examples/npd-prober/node-readiness-rule.yaml @@ -0,0 +1,17 @@ +apiVersion: readiness.node.x-k8s.io/v1alpha1 +kind: NodeReadinessRule +metadata: + name: service-readiness-rule +spec: + conditions: + - type: "ServiceReadiness" + requiredStatus: "False" + taint: + key: "readiness.k8s.io/ServiceReady" + effect: "NoSchedule" + value: "pending" + enforcementMode: "continuous" + nodeSelector: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/examples/npd-prober/npd-config.json b/examples/npd-prober/npd-config.json new file mode 100644 index 0000000..d58e53a --- /dev/null +++ 
b/examples/npd-prober/npd-config.json @@ -0,0 +1,29 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "30s", + "timeout": "5s", + "max_output_length": 80, + "concurrency": 1, + "enable_message_change_based_condition_update": false + }, + "source": "readiness-prober-custom-plugin-monitor", + "metricsReporting": true, + "conditions": [ + { + "type": "ServiceReadiness", + "reason": "ServiceIsReady", + "message": "service readiness probe is passing" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "ServiceReadiness", + "reason": "ServiceNotReady", + "path": "/home/kubernetes/bin/npd-prober", + "args": ["--probe-type=http", "--http-url=http://localhost:8080/healthz", "--timeout=5s"], + "timeout": "5s" + } + ] +} diff --git a/examples/npd-prober/testing-npd.md b/examples/npd-prober/testing-npd.md new file mode 100644 index 0000000..63c9e15 --- /dev/null +++ b/examples/npd-prober/testing-npd.md @@ -0,0 +1,266 @@ +# Testing NPD Prober on a Kind Cluster + +## How `npd-config.json` maps to `node-readiness-rule.yaml` + +The connection point is the **NodeCondition type** — `ServiceReadiness`. + +**On the Node-Problem-Detector (NPD) side (`npd-config.json`):** +- The `conditions` array declares a condition with `"type": "ServiceReadiness"` — this is the NodeCondition NPD will manage on each node. +- The `rules` array has a permanent rule with `"condition": "ServiceReadiness"` — when the prober exits with `1` (NonOK), NPD sets `ServiceReadiness=True` (problem present) with reason `ServiceNotReady`. When it exits with `0` (OK), NPD sets `ServiceReadiness=False` (no problem) with reason `ServiceIsReady`. + +**On the Node-Readiness-Controller (NRC) side (`node-readiness-rule.yaml`):** +- `spec.conditions[0].type: "ServiceReadiness"` — watches the exact same condition NPD sets. +- `spec.conditions[0].requiredStatus: "False"` — the taint is removed when this condition is `False` (no problem). 
+- `spec.taint` — defines what taint to manage based on that condition state. + +> **Important:** NPD conditions represent **problems**, not health. Exit code 0 (OK) sets the +> condition to `False` (problem absent), while exit code 1 (NonOK) sets it to `True` (problem +> present). This is why `requiredStatus` is `"False"` — the node is ready when the problem +> condition is not active. + +``` +npd-config.json node-readiness-rule.yaml +───────────────── ──────────────────────── +conditions[].type: "ServiceReadiness" ──► conditions[].type: "ServiceReadiness" + conditions[].requiredStatus: "False" + +rules[].reason: "ServiceNotReady" (exit 1 → NPD sets condition True + → NRC sees True ≠ False → taint applied) + +conditions[].reason: "ServiceIsReady" (exit 0 → NPD sets condition False + → NRC sees False = False → taint removed) +``` + +## 1. Build the prober binary (Linux) + +```bash +git clone https://github.com/kubernetes-sigs/node-readiness-controller.git +cd node-readiness-controller +# Build the npd-prober +GOOS=linux GOARCH=amd64 go build -o npd-prober ./examples/npd-prober/ +``` + +## 2. Create a Kind cluster with a worker node + +The `NodeReadinessRule` targets non-control-plane nodes (`node-role.kubernetes.io/control-plane DoesNotExist`), +so we need a worker node. Create a Kind config: + +```bash +cat <<'EOF' > /tmp/kind-npd-prober.yaml +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + - role: worker +EOF + +kind create cluster --name npd-prober-test --config /tmp/kind-npd-prober.yaml +``` + +## 3. Deploy a sample workload to probe + +Create a simple health endpoint pod scheduled on the worker node. 
This application will +acknowledge the probes emitted by the npd-prober on the node: + +```bash +kubectl run healthz-server --image=registry.k8s.io/e2e-test-images/agnhost:2.39 \ + --command -- /agnhost serve-hostname --port 8080 +kubectl expose pod healthz-server --port=8080 +``` + +> **Note:** The pod will land on the worker node by default since control-plane nodes +> have a `NoSchedule` taint. Verify with `kubectl get pod healthz-server -o wide`. + +## 4. Install NPD with the custom plugin + +```bash +# Pull the NPD image and load it into Kind: +docker pull registry.k8s.io/node-problem-detector/node-problem-detector:v1.35.2 +kind load docker-image registry.k8s.io/node-problem-detector/node-problem-detector:v1.35.2 --name npd-prober-test +``` + +Create a ConfigMap with the prober config and mount it + the binary into NPD. Save this as `npd-deploy.yaml`: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: npd-prober-config + namespace: kube-system +data: + readiness-prober.json: | + { + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "5s", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "readiness-prober-custom-plugin-monitor", + "conditions": [ + { + "type": "ServiceReadiness", + "reason": "ServiceIsReady", + "message": "service readiness probe is passing" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "ServiceReadiness", + "reason": "ServiceNotReady", + "path": "/custom-plugins/npd-prober", + "args": ["--probe-type=http", "--http-url=http://healthz-server.default.svc.cluster.local:8080", "--timeout=5s"], + "timeout": "5s" + } + ] + } +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-problem-detector + namespace: kube-system +spec: + selector: + matchLabels: + app: node-problem-detector + template: + metadata: + labels: + app: node-problem-detector + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + serviceAccountName: node-problem-detector + containers: + 
- name: npd + image: registry.k8s.io/node-problem-detector/node-problem-detector:v1.35.2 + command: ["/node-problem-detector"] + args: + - "--logtostderr" + - "--custom-plugin-monitors=/config/readiness-prober.json" + volumeMounts: + - name: config + mountPath: /config + - name: custom-plugins + mountPath: /custom-plugins + securityContext: + privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumes: + - name: config + configMap: + name: npd-prober-config + - name: custom-plugins + hostPath: + path: /opt/npd-prober +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-problem-detector + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-problem-detector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:node-problem-detector +subjects: + - kind: ServiceAccount + name: node-problem-detector + namespace: kube-system +``` + +Copy the prober binary into all Kind nodes, then deploy NPD: + +```bash +# Copy binary into each Kind node +for NODE in $(kind get nodes --name npd-prober-test); do + docker exec "${NODE}" mkdir -p /opt/npd-prober + docker cp npd-prober "${NODE}:/opt/npd-prober/npd-prober" + docker exec "${NODE}" chmod +x /opt/npd-prober/npd-prober +done + +# Deploy NPD +kubectl apply -f npd-deploy.yaml +``` + +## 5. Verify NPD sets the condition + +```bash +# Wait a few seconds, then check: +kubectl get node -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{range .status.conditions[*]} {.type}={.status}{"\n"}{end}{end}' +``` + +You should see `ServiceReadiness=False` (probe healthy, no problem) or `ServiceReadiness=True` (probe failing, problem active) in the list. + +## 6. Install the Node Readiness Controller and apply the rule + +```bash +cd node-readiness-controller +make install +make run + +``` + +On a different terminal apply the NodeReadinessRule config. 
+```bash +kubectl apply -f examples/npd-prober/node-readiness-rule.yaml +``` + +## 7. Verify end-to-end + +```bash +# Check taints — should have no ServiceReady taint if probe is healthy: +kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: {.spec.taints}{"\n"}{end}' + +# Check the rule status: +kubectl get nrr service-readiness-rule -o yaml +``` + +## 8. Simulate failure and recovery + +Misconfigure the service port so the probe can no longer reach the health endpoint. +The pod keeps running — only the service routing is broken: + +```bash +# Point the service at a wrong targetPort (pod listens on 8080, not 9999) +kubectl patch svc healthz-server --type='json' \ + -p='[{"op":"replace","path":"/spec/ports/0/targetPort","value":9999}]' +``` + +After ~10s (NPD invoke interval), the condition should flip to `True` (problem active) and the taint `readiness.k8s.io/ServiceReady:pending:NoSchedule` should appear: + +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints +``` + +Now fix the service port to verify recovery: + +```bash +# Restore the correct targetPort +kubectl patch svc healthz-server --type='json' \ + -p='[{"op":"replace","path":"/spec/ports/0/targetPort","value":8080}]' +``` + +After ~10s, the probe should succeed again, the condition flips to `False`, and the taint is removed: + +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints +``` + +## 9. Cleanup + +```bash +kind delete cluster --name npd-prober-test +```