From fb1e6fcc5ea872e5aa5c31f17bcfae60436d78ae Mon Sep 17 00:00:00 2001 From: Priyanka Saggu Date: Sun, 8 Mar 2026 01:18:18 +0530 Subject: [PATCH 1/3] add NRR sidecar variant with fixes --- .../add-falco-toleration.sh | 32 ++++ .../security-agent-readiness/apply-falco.sh | 65 -------- .../falco-rbac-node-status-rbac.yaml | 4 +- .../security-agent-patcher-sidecar.yaml | 0 .../security-agent-readiness-dryrun-rule.yaml | 0 .../security-agent-readiness-rule.yaml | 2 +- .../security-agent-readiness/setup-falco.sh | 151 ++++++++++++++++++ 7 files changed, 186 insertions(+), 68 deletions(-) create mode 100755 examples/security-agent-readiness/add-falco-toleration.sh delete mode 100644 examples/security-agent-readiness/apply-falco.sh rename examples/security-agent-readiness/{ => nrr-variant}/falco-rbac-node-status-rbac.yaml (92%) rename examples/security-agent-readiness/{ => nrr-variant}/security-agent-patcher-sidecar.yaml (100%) rename examples/security-agent-readiness/{ => nrr-variant}/security-agent-readiness-dryrun-rule.yaml (100%) rename examples/security-agent-readiness/{ => nrr-variant}/security-agent-readiness-rule.yaml (87%) create mode 100755 examples/security-agent-readiness/setup-falco.sh
diff --git a/examples/security-agent-readiness/add-falco-toleration.sh b/examples/security-agent-readiness/add-falco-toleration.sh
new file mode 100755
index 0000000..80b3a5a
--- /dev/null
+++ b/examples/security-agent-readiness/add-falco-toleration.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adds a toleration for the security-agent readiness taint to the Falco
+# DaemonSet so that Falco pods can start on nodes that still carry the taint.
+# Re-running the script is a no-op once a toleration with that key is present.
+
+set -eo pipefail
+
+TAINT_KEY="readiness.k8s.io/security-agent-ready"
+TOLERATION='{"key": "'"$TAINT_KEY"'", "operator": "Exists", "effect": "NoSchedule"}'
+TOLERATIONS_PATH="/spec/template/spec/tolerations"
+
+echo "=== Adding toleration to Falco DaemonSet ==="
+echo "This allows Falco pods to start on nodes with the security-agent-ready taint"
+
+# A JSON Patch "add" to ".../tolerations/-" always appends, so without this
+# check every re-run would add a duplicate entry to the pod template.
+existing_keys=$(kubectl get daemonset falco -n falco \
+  -o jsonpath='{.spec.template.spec.tolerations[*].key}')
+case " $existing_keys " in
+  *" $TAINT_KEY "*)
+    echo "Toleration for $TAINT_KEY already present, nothing to do"
+    exit 0
+    ;;
+esac
+
+# Appending with "/-" requires the array to exist; create it when the pod
+# template has no tolerations at all.
+current=$(kubectl get daemonset falco -n falco \
+  -o jsonpath='{.spec.template.spec.tolerations}')
+if [ -z "$current" ]; then
+  patch='[{"op": "add", "path": "'"$TOLERATIONS_PATH"'", "value": ['"$TOLERATION"']}]'
+else
+  patch='[{"op": "add", "path": "'"$TOLERATIONS_PATH"'/-", "value": '"$TOLERATION"'}]'
+fi
+
+kubectl patch daemonset falco -n falco --type='json' -p="$patch"
diff --git a/examples/security-agent-readiness/apply-falco.sh b/examples/security-agent-readiness/apply-falco.sh deleted file mode 100644 index 770f04b..0000000 --- a/examples/security-agent-readiness/apply-falco.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# Copyright The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -KUBECTL_ARGS="$@" - -YQ_VERSION="v4.48.1" -YQ_PATH="/tmp/yq" - -# Check if yq is installed, if not download it. -if [ ! -f "$YQ_PATH" ]; then - echo "yq not found at $YQ_PATH, downloading..."
- OS=$(uname -s | tr '[:upper:]' '[:lower:]') - ARCH=$(uname -m) - case $ARCH in - x86_64) - ARCH="amd64" - ;; - aarch64|arm64) - ARCH="arm64" - ;; - *) - echo "Unsupported architecture: $ARCH" - exit 1 - ;; - esac - YQ_BINARY="yq_${OS}_${ARCH}" - curl -sL "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/${YQ_BINARY}" -o "$YQ_PATH" - chmod +x "$YQ_PATH" -fi - -# Add the Falco Helm repository -helm repo add falcosecurity https://falcosecurity.github.io/charts -helm repo update - -# Generate the Falco manifest -helm template falco falcosecurity/falco --namespace falco --set tty=true > falco.yaml - -# Add the security-status-patcher sidecar -"$YQ_PATH" e -i \ - 'select(.kind == "DaemonSet" and .metadata.name == "falco") - .spec.template.spec.containers += - [load("hack/test-workloads/security-patcher-sidecar.yaml")]' falco.yaml - -# Apply the manifest twice. The first time, it will create the CRDs and ServiceAccounts. -# The second time, it will create the rest of the resources, which should now be able to find the ServiceAccount. 
-kubectl apply $KUBECTL_ARGS -f falco.yaml || true -kubectl apply $KUBECTL_ARGS -f falco.yaml - -# Apply the RBAC rules -kubectl apply $KUBECTL_ARGS -f ./falco-rbac-node-status-patch-role.yaml diff --git a/examples/security-agent-readiness/falco-rbac-node-status-rbac.yaml b/examples/security-agent-readiness/nrr-variant/falco-rbac-node-status-rbac.yaml similarity index 92% rename from examples/security-agent-readiness/falco-rbac-node-status-rbac.yaml rename to examples/security-agent-readiness/nrr-variant/falco-rbac-node-status-rbac.yaml index a2f1f8f..3ffeed3 100644 --- a/examples/security-agent-readiness/falco-rbac-node-status-rbac.yaml +++ b/examples/security-agent-readiness/nrr-variant/falco-rbac-node-status-rbac.yaml @@ -21,5 +21,5 @@ roleRef: subjects: # Bind to security agent's ServiceAccount - kind: ServiceAccount - name: falco-node - namespace: kube-system \ No newline at end of file + name: falco + namespace: falco \ No newline at end of file diff --git a/examples/security-agent-readiness/security-agent-patcher-sidecar.yaml b/examples/security-agent-readiness/nrr-variant/security-agent-patcher-sidecar.yaml similarity index 100% rename from examples/security-agent-readiness/security-agent-patcher-sidecar.yaml rename to examples/security-agent-readiness/nrr-variant/security-agent-patcher-sidecar.yaml diff --git a/examples/security-agent-readiness/security-agent-readiness-dryrun-rule.yaml b/examples/security-agent-readiness/nrr-variant/security-agent-readiness-dryrun-rule.yaml similarity index 100% rename from examples/security-agent-readiness/security-agent-readiness-dryrun-rule.yaml rename to examples/security-agent-readiness/nrr-variant/security-agent-readiness-dryrun-rule.yaml diff --git a/examples/security-agent-readiness/security-agent-readiness-rule.yaml b/examples/security-agent-readiness/nrr-variant/security-agent-readiness-rule.yaml similarity index 87% rename from examples/security-agent-readiness/security-agent-readiness-rule.yaml rename to 
examples/security-agent-readiness/nrr-variant/security-agent-readiness-rule.yaml index 9b27b69..027a272 100644 --- a/examples/security-agent-readiness/security-agent-readiness-rule.yaml +++ b/examples/security-agent-readiness/nrr-variant/security-agent-readiness-rule.yaml @@ -7,7 +7,7 @@ spec: - type: "falco.org/FalcoReady" requiredStatus: "True" taint: - key: "readiness.k8s.io/falco.org/security-agent-ready" + key: "readiness.k8s.io/security-agent-ready" effect: "NoSchedule" value: "pending" enforcementMode: "continuous"
diff --git a/examples/security-agent-readiness/setup-falco.sh b/examples/security-agent-readiness/setup-falco.sh
new file mode 100755
index 0000000..c4170cb
--- /dev/null
+++ b/examples/security-agent-readiness/setup-falco.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+
+# Copyright The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Installs Falco and exactly one readiness reporter for the
+# security-agent-readiness example.
+#
+# Usage:
+#   USE_NRR=true ./setup-falco.sh [kubectl apply args...]  # reporter sidecar
+#   USE_NPD=true ./setup-falco.sh [kubectl apply args...]  # Node Problem Detector
+#
+# Any positional arguments are forwarded to every `kubectl apply` call.
+
+# -u is deliberately left off: expanding an empty array under `set -u` aborts
+# on bash older than 4.4, and KUBECTL_ARGS is empty when no args are passed.
+set -eo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Treat anything other than the literal "true" as "not selected", so the
+# validation below and the mode branches further down can never disagree.
+USE_NPD="${USE_NPD:-}"
+USE_NRR="${USE_NRR:-}"
+
+if [ "$USE_NPD" != "true" ] && [ "$USE_NRR" != "true" ]; then
+  {
+    echo "Error: Must set either USE_NPD=true or USE_NRR=true"
+    echo ""
+    echo "Usage:"
+    echo "  USE_NPD=true $SCRIPT_DIR/setup-falco.sh  # Deploy with Node Problem Detector (NPD)"
+    echo "  USE_NRR=true $SCRIPT_DIR/setup-falco.sh  # Deploy with Node Readiness Reporter (NRR)"
+  } >&2
+  exit 1
+fi
+
+if [ "$USE_NPD" = "true" ] && [ "$USE_NRR" = "true" ]; then
+  echo "Error: Cannot set both USE_NPD and USE_NRR to true" >&2
+  exit 1
+fi
+
+# Fail early with a clear message instead of halfway through the install.
+for tool in kubectl helm curl; do
+  if ! command -v "$tool" >/dev/null 2>&1; then
+    echo "Error: required tool '$tool' not found in PATH" >&2
+    exit 1
+  fi
+done
+
+# Keep the extra kubectl arguments as an array so arguments containing spaces
+# survive intact and nothing is glob-expanded.
+KUBECTL_ARGS=("$@")
+
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+# Downloads a pinned yq release into TEMP_DIR.
+# Globals: TEMP_DIR (read), YQ_PATH (written; path of the yq binary)
+download_yq_if_needed() {
+  local yq_version="v4.48.1"
+  local os arch
+  YQ_PATH="$TEMP_DIR/yq"
+
+  if [ ! -f "$YQ_PATH" ]; then
+    echo "Downloading yq..."
+    os=$(uname -s | tr '[:upper:]' '[:lower:]')
+    arch=$(uname -m)
+    case "$arch" in
+      x86_64)
+        arch="amd64"
+        ;;
+      aarch64|arm64)
+        arch="arm64"
+        ;;
+      *)
+        echo "Unsupported architecture: $arch" >&2
+        exit 1
+        ;;
+    esac
+    # -f makes curl fail on an HTTP error instead of saving the error page as
+    # the "binary"; -S still prints the reason despite -s.
+    curl -fsSL \
+      "https://github.com/mikefarah/yq/releases/download/${yq_version}/yq_${os}_${arch}" \
+      -o "$YQ_PATH"
+    chmod +x "$YQ_PATH"
+  fi
+}
+
+echo "=== Installing Falco ==="
+if [ "$USE_NPD" = "true" ]; then
+  echo "Mode: Node Problem Detector (NPD) monitoring"
+else
+  echo "Mode: Node Readiness Reporter (NRR) sidecar"
+fi
+
+echo "Creating falco namespace..."
+kubectl create namespace falco --dry-run=client -o yaml \
+  | kubectl apply "${KUBECTL_ARGS[@]}" -f -
+
+echo "Adding Falco Helm repository..."
+helm repo add falcosecurity https://falcosecurity.github.io/charts
+helm repo update
+
+echo "Generating Falco manifests..."
+helm template falco falcosecurity/falco \
+  --namespace falco \
+  --set tty=true \
+  --set falco.webserver.enabled=true \
+  --set falco.webserver.listen_port=8765 > "$TEMP_DIR/falco.yaml"
+
+download_yq_if_needed
+
+if [ "$USE_NPD" = "true" ]; then
+  # For NPD mode, Falco needs hostNetwork to be reachable from NPD
+  # Helm chart doesn't have a value for setting pod-level hostNetwork,
+  # so using yq to patch the falco daemonset
+  echo "Enabling hostNetwork for Falco (required for NPD)..."
+
+  "$YQ_PATH" e -i \
+    'select(.kind == "DaemonSet" and .metadata.name == "falco")
+      .spec.template.spec.hostNetwork = true' "$TEMP_DIR/falco.yaml"
+fi
+
+if [ "$USE_NRR" = "true" ]; then
+  echo "Adding Node Readiness Reporter sidecar..."
+
+  "$YQ_PATH" e -i \
+    'select(.kind == "DaemonSet" and .metadata.name == "falco")
+      .spec.template.spec.containers +=
+      [load("'"$SCRIPT_DIR"'/nrr-variant/security-agent-patcher-sidecar.yaml")]' "$TEMP_DIR/falco.yaml"
+fi
+
+echo "Applying Falco manifests..."
+kubectl apply "${KUBECTL_ARGS[@]}" -f "$TEMP_DIR/falco.yaml"
+
+if [ "$USE_NRR" = "true" ]; then
+  echo "Applying RBAC for Node Readiness Reporter..."
+  kubectl apply "${KUBECTL_ARGS[@]}" -f "$SCRIPT_DIR/nrr-variant/falco-rbac-node-status-rbac.yaml"
+fi
+
+if [ "$USE_NPD" = "true" ]; then
+  echo ""
+  echo "=== Deploying Node Problem Detector (NPD) ==="
+  # No follow-up `kubectl patch` for tolerations: npd-daemonset.yaml already
+  # tolerates every NoSchedule taint, and a JSON Patch "add" to
+  # ".../tolerations/-" would append a duplicate entry on each run.
+  kubectl apply "${KUBECTL_ARGS[@]}" -f "$SCRIPT_DIR/npd-variant/npd-rbac.yaml"
+  kubectl apply "${KUBECTL_ARGS[@]}" -f "$SCRIPT_DIR/npd-variant/npd-falco-config.yaml"
+  kubectl apply "${KUBECTL_ARGS[@]}" -f "$SCRIPT_DIR/npd-variant/npd-daemonset.yaml"
+
+  echo "NPD deployed successfully"
+fi
+
+echo ""
+echo "=== Falco installed successfully ==="
+echo ""
+echo "Next steps:"
+if [ "$USE_NPD" = "true" ]; then
+  echo "1. Apply NodeReadinessRule: kubectl apply -f $SCRIPT_DIR/npd-variant/security-agent-readiness-rule-npd.yaml"
+else
+  echo "1. Apply NodeReadinessRule: kubectl apply -f $SCRIPT_DIR/nrr-variant/security-agent-readiness-rule.yaml"
+fi
+echo "2. Add toleration to Falco: $SCRIPT_DIR/add-falco-toleration.sh"
From 9ee032e85cd120e0b0808e13398751d245b9fdf1 Mon Sep 17 00:00:00 2001 From: Priyanka Saggu Date: Sun, 8 Mar 2026 01:18:32 +0530 Subject: [PATCH 2/3] add NPD (Node Problem Detector) variant for security agent readiness --- .../npd-variant/npd-daemonset.yaml | 68 +++++++++++++++++++ .../npd-variant/npd-falco-config.yaml | 47 +++++++++++++ .../npd-variant/npd-rbac.yaml | 45 ++++++++++++ .../security-agent-readiness-rule-npd.yaml | 17 +++++ 4 files changed, 177 insertions(+) create mode 100644 examples/security-agent-readiness/npd-variant/npd-daemonset.yaml create mode 100644 examples/security-agent-readiness/npd-variant/npd-falco-config.yaml create mode 100644 examples/security-agent-readiness/npd-variant/npd-rbac.yaml create mode 100644 examples/security-agent-readiness/npd-variant/security-agent-readiness-rule-npd.yaml diff --git a/examples/security-agent-readiness/npd-variant/npd-daemonset.yaml b/examples/security-agent-readiness/npd-variant/npd-daemonset.yaml new file mode 100644 index 0000000..72cefed --- /dev/null +++
b/examples/security-agent-readiness/npd-variant/npd-daemonset.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-problem-detector-falco + namespace: falco + labels: + app: node-problem-detector + component: falco-monitor +spec: + selector: + matchLabels: + app: node-problem-detector + component: falco-monitor + template: + metadata: + labels: + app: node-problem-detector + component: falco-monitor + spec: + serviceAccountName: node-problem-detector + hostNetwork: true + tolerations: + - operator: Exists + effect: NoSchedule + - operator: Exists + effect: NoExecute + containers: + - name: node-problem-detector + image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19 + command: + - /node-problem-detector + - --logtostderr + - --config.custom-plugin-monitor=/config/falco-plugin.json + securityContext: + privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: config + mountPath: /config + readOnly: true + - name: plugin + mountPath: /config/plugin + readOnly: true + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 20m + memory: 64Mi + volumes: + - name: config + configMap: + name: npd-falco-config + items: + - key: falco-plugin.json + path: falco-plugin.json + - name: plugin + configMap: + name: npd-falco-config + defaultMode: 0755 + items: + - key: check-falco.sh + path: check-falco.sh diff --git a/examples/security-agent-readiness/npd-variant/npd-falco-config.yaml b/examples/security-agent-readiness/npd-variant/npd-falco-config.yaml new file mode 100644 index 0000000..c786193 --- /dev/null +++ b/examples/security-agent-readiness/npd-variant/npd-falco-config.yaml @@ -0,0 +1,47 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: npd-falco-config + namespace: falco +data: + # NPD uses problem-oriented conditions (like MemoryPressure, DiskPressure). 
+ # falco.org/FalcoNotReady=False means Falco is healthy, falco.org/FalcoNotReady=True means there's an issue. + falco-plugin.json: | + { + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "5s", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "falco-monitor", + "conditions": [ + { + "type": "falco.org/FalcoNotReady", + "reason": "FalcoHealthy", + "message": "Falco security monitoring is functional" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "falco.org/FalcoNotReady", + "reason": "FalcoNotDeployed", + "path": "/config/plugin/check-falco.sh" + } + ] + } + + check-falco.sh: | + #!/bin/bash + # Check if Falco is deployed and healthy + # Exit 0 when healthy (FalcoNotReady=False, i.e., Falco IS ready) + # Exit 1 when NOT healthy/deployed (FalcoNotReady=True, i.e., Falco is NOT ready) + timeout 2 bash -c '</dev/tcp/localhost/8765' 2>/dev/null + if [ $? -eq 0 ]; then + exit 0 # Falco is healthy + else + echo "Falco is not deployed or not responding on port 8765" + exit 1 # Falco has a problem + fi diff --git a/examples/security-agent-readiness/npd-variant/npd-rbac.yaml b/examples/security-agent-readiness/npd-variant/npd-rbac.yaml new file mode 100644 index 0000000..148d949 --- /dev/null +++ b/examples/security-agent-readiness/npd-variant/npd-rbac.yaml @@ -0,0 +1,45 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-problem-detector + namespace: falco +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-problem-detector-falco +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - get +- apiGroups: + - "" + resources: + - nodes/status + verbs: + - patch + - update +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-problem-detector-falco +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-problem-detector-falco
+subjects: +- kind: ServiceAccount + name: node-problem-detector + namespace: falco diff --git a/examples/security-agent-readiness/npd-variant/security-agent-readiness-rule-npd.yaml b/examples/security-agent-readiness/npd-variant/security-agent-readiness-rule-npd.yaml new file mode 100644 index 0000000..7922bcf --- /dev/null +++ b/examples/security-agent-readiness/npd-variant/security-agent-readiness-rule-npd.yaml @@ -0,0 +1,17 @@ +apiVersion: readiness.node.x-k8s.io/v1alpha1 +kind: NodeReadinessRule +metadata: + name: security-agent-readiness-rule-npd +spec: + conditions: + - type: falco.org/FalcoNotReady + requiredStatus: "False" # Remove taint when Falco is NOT NotReady (i.e., Falco IS ready) + taint: + key: "readiness.k8s.io/security-agent-ready" + value: "pending" + effect: "NoSchedule" + enforcementMode: "continuous" + nodeSelector: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist From 29669b6c7aefae3bacc78288f3a1f6cbafd3a9e4 Mon Sep 17 00:00:00 2001 From: Priyanka Saggu Date: Sun, 8 Mar 2026 01:18:40 +0530 Subject: [PATCH 3/3] update docs to include NPD for component status probing --- .../src/examples/security-agent-readiness.md | 215 ++++++++++++++---- .../kind-cluster-config.yaml | 12 + 2 files changed, 180 insertions(+), 47 deletions(-) create mode 100644 examples/security-agent-readiness/kind-cluster-config.yaml diff --git a/docs/book/src/examples/security-agent-readiness.md b/docs/book/src/examples/security-agent-readiness.md index c4a1852..2a9fc59 100644 --- a/docs/book/src/examples/security-agent-readiness.md +++ b/docs/book/src/examples/security-agent-readiness.md @@ -14,7 +14,7 @@ In many Kubernetes clusters, security agents are deployed as DaemonSets. When a ## The Solution We can use the Node Readiness Controller to enforce a security readiness guardrail: -1.
**Taint** the node with a [startup taint](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) `readiness.k8s.io/falco.org/security-agent-ready=pending:NoSchedule` as soon as it joins the cluster. +1. **Taint** the node with a [startup taint](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) `readiness.k8s.io/security-agent-ready=pending:NoSchedule` as soon as it joins the cluster. 2. **Monitor** the security agent’s readiness using a sidecar and expose it as a Node Condition. 3. **Untaint** the node only after the security agent reports that it is ready. @@ -24,13 +24,33 @@ This example uses **Falco** as a representative security agent, but the same pat > **Note**: All manifests referenced in this guide are available in the [`examples/security-agent-readiness`](https://github.com/kubernetes-sigs/node-readiness-controller/tree/main/examples/security-agent-readiness) directory. +### Prerequisites +**1. Node Readiness Controller:** + +Before starting, ensure the Node Readiness Controller is deployed. See the [Installation Guide](../user-guide/installation.md) for details. + +**2. Kubernetes Cluster with Worker Nodes:** + +This example requires at least one worker node with the startup taint. For kind clusters, use the provided configuration: + +```sh +kind create cluster --config examples/security-agent-readiness/kind-cluster-config.yaml +``` + +This creates a cluster with: +- 1 control-plane node +- 1 worker node pre-tainted with `readiness.k8s.io/security-agent-ready=pending:NoSchedule` + +See [`examples/security-agent-readiness/kind-cluster-config.yaml`](../../../../examples/security-agent-readiness/kind-cluster-config.yaml) for details. ### 1. Deploy the Readiness Condition Reporter -To bridge the security agent’s internal health signal to Kubernetes, we deploy a readiness reporter that updates a Node Condition. In this example, the reporter is deployed as a sidecar container in the Falco DaemonSet. 
Components that natively update Node conditions would not require this additional container. +To bridge the security agent's internal health signal to Kubernetes, we need to update a Node Condition. You have two options: -This sidecar periodically checks Falco's local health endpoint (`http://localhost:8765/healthz`) and updates a Node Condition `falco.org/FalcoReady`. +#### Option A: Using Node Readiness Reporter Sidecar + +The reporter is deployed as a sidecar container in the Falco DaemonSet. This sidecar periodically checks Falco's local health endpoint (`http://localhost:8765/healthz`) and updates a Node Condition `falco.org/FalcoReady`. **Patch your Falco DaemonSet:** @@ -59,47 +79,75 @@ This sidecar periodically checks Falco's local health endpoint (`http://localhos memory: "32Mi" ``` -> Note: In this example, the security agent’s health is monitored by a side-car, so the reporter’s lifecycle is the same as the pod lifecycle. If the Falco pod is crashlooping, the sidecar will not run and cannot report readiness. For robust `continuous` readiness reporting, the reporter should be deployed independently of the security agent pod. For example, a separate DaemonSet (similar to Node Problem Detector) can monitor the agent and update Node conditions even if the agent pod crashes. +**Note:** The sidecar's lifecycle is tied to the Falco pod. If Falco crashes, the sidecar stops reporting. For more robust monitoring, see Option B below. + +#### Option B: Using Node Problem Detector (More Robust) -### 2. Grant Permissions (RBAC) +If you already have Node Problem Detector (NPD) deployed or want robust monitoring that continues even if Falco crashes, use NPD with a custom plugin. -The readiness reporter sidecar needs permission to update the Node object's status to publish readiness information. 
+**Deploy NPD with Falco monitoring plugin:** ```yaml -# security-agent-node-status-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: node-status-patch-role -rules: -- apiGroups: [""] - resources: ["nodes"] - verbs: ["get"] -- apiGroups: [""] - resources: ["nodes/status"] - verbs: ["patch", "update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +# npd-falco-config.yaml +apiVersion: v1 +kind: ConfigMap metadata: - name: security-agent-node-status-patch-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: node-status-patch-role -subjects: -# Bind to security agent's ServiceAccount -- kind: ServiceAccount - name: falco - namespace: kube-system + name: npd-falco-config + namespace: falco +data: + # NPD uses problem-oriented conditions (like MemoryPressure, DiskPressure). + # falco.org/FalcoNotReady=False means Falco is healthy, falco.org/FalcoNotReady=True means there's an issue. + falco-plugin.json: | + { + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "5s", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "falco-monitor", + "conditions": [ + { + "type": "falco.org/FalcoNotReady", + "reason": "FalcoHealthy", + "message": "Falco security monitoring is functional" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "falco.org/FalcoNotReady", + "reason": "FalcoNotDeployed", + "path": "/config/plugin/check-falco.sh" + } + ] + } + + check-falco.sh: | + #!/bin/bash + # Check if Falco is deployed and healthy + # Exit 0 when healthy (FalcoNotReady=False, i.e., Falco IS ready) + # Exit 1 when NOT healthy/deployed (FalcoNotReady=True, i.e., Falco is NOT ready) + timeout 2 bash -c '</dev/tcp/localhost/8765' 2>/dev/null + if [ $? -eq 0 ]; then + exit 0 # Falco is healthy + else + echo "Falco is not deployed or not responding on port 8765" + exit 1 # Falco has a problem + fi ``` -### 3.
Create the Node Readiness Rule +Then deploy NPD DaemonSet and RBAC. See complete NPD manifests in [`examples/security-agent-readiness/npd-variant/`](../../../../examples/security-agent-readiness/npd-variant/). -Next, define a NodeReadinessRule that enforces the security readiness requirement. This rule instructs the controller: *"Keep the `readiness.k8s.io/falco.org/security-agent-ready` taint on the node until the `falco.org/FalcoReady` condition becomes True."* +### 2. Create the Node Readiness Rule + +Next, define a NodeReadinessRule that enforces the security readiness requirement. + +**For Option A (Sidecar Reporter):** ```yaml -# security-agent-readiness-rule.yaml +# nrr-variant/security-agent-readiness-rule.yaml apiVersion: readiness.node.x-k8s.io/v1alpha1 kind: NodeReadinessRule metadata: @@ -112,7 +160,7 @@ spec: # Taint managed by this rule taint: - key: "readiness.k8s.io/falco.org/security-agent-ready" + key: "readiness.k8s.io/security-agent-ready" effect: "NoSchedule" value: "pending" @@ -126,30 +174,103 @@ spec: node-role.kubernetes.io/worker: "" ``` +**For Option B (Node Problem Detector):** + +```yaml +# npd-variant/security-agent-readiness-rule-npd.yaml +apiVersion: readiness.node.x-k8s.io/v1alpha1 +kind: NodeReadinessRule +metadata: + name: security-agent-readiness-rule-npd +spec: + # Conditions that must be satisfied before the taint is removed + conditions: + - type: "falco.org/FalcoNotReady" + requiredStatus: "False" # Remove taint when Falco is NOT NotReady (i.e., ready) + + # Taint managed by this rule + taint: + key: "readiness.k8s.io/security-agent-ready" + effect: "NoSchedule" + value: "pending" + + # "bootstrap-only" means: once the security agent is ready, we stop enforcing. + # Use "continuous" mode if you want to re-taint the node if Falco crashes later. 
+ enforcementMode: "continuous" + + # Update to target only the nodes that need to be protected by this guardrail + nodeSelector: + matchLabels: + node-role.kubernetes.io/worker: "" +``` + ## How to Apply -1. **Create the Node Readiness Rule**: +**For Option A (Sidecar Reporter):** + ```sh - cd examples/security-agent-readiness - kubectl apply -f security-agent-readiness-rule.yaml - ``` +# Install Falco with sidecar reporter +USE_NRR=true examples/security-agent-readiness/setup-falco.sh + +# Apply the NodeReadinessRule +kubectl apply -f examples/security-agent-readiness/nrr-variant/security-agent-readiness-rule.yaml + +# Add toleration to Falco so it can start on tainted nodes +examples/security-agent-readiness/add-falco-toleration.sh +``` + +**For Option B (Node Problem Detector):** -2. **Install Falco and Apply the RBAC**: ```sh -chmod +x apply-falco.sh -sh apply-falco.sh +# Install Falco with NPD monitoring +USE_NPD=true examples/security-agent-readiness/setup-falco.sh + +# Apply the NodeReadinessRule for NPD +kubectl apply -f examples/security-agent-readiness/npd-variant/security-agent-readiness-rule-npd.yaml + +# Add toleration to Falco so it can start on tainted nodes +examples/security-agent-readiness/add-falco-toleration.sh ``` ## Verification -To verify that the guardrail is working, add a new node to the cluster. +To verify that the guardrail is working, you need a tainted node. You have two options: + +**Option 1: Manually taint an existing node:** + +```sh +kubectl taint nodes <node-name> readiness.k8s.io/security-agent-ready=pending:NoSchedule +``` + +**Option 2: Configure nodes to register with taints at startup:** + +For kind clusters, use kubeadm config patches. See [kind documentation on kubeadm config patches](https://kind.sigs.k8s.io/docs/user/configuration/#kubeadm-config-patches) for details. + +--- + +Once the node is tainted: 1.
**Check the Node Taints**: -Immediately after the node joins, it should have the taint: -`readiness.k8s.io/falco.org/security-agent-ready=pending:NoSchedule`. + Verify the taint is applied: + ```sh + kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints + ``` + Should show: `readiness.k8s.io/security-agent-ready=pending:NoSchedule`. 2. **Check Node Conditions**: -Observe the node’s conditions. You will initially see `falco.org/FalcoReady` as `False` or missing. Once Falco initializes, the sidecar reporter updates the condition to `True`. + + **For Option A (Sidecar):** + ```sh + kubectl get node <node-name> -o jsonpath='{.status.conditions[?(@.type=="falco.org/FalcoReady")]}' | jq . + ``` + You will initially see `falco.org/FalcoReady` as `False`. Once Falco initializes, it becomes `True`. + + **For Option B (NPD):** + ```sh + kubectl get node <node-name> -o jsonpath='{.status.conditions[?(@.type=="falco.org/FalcoNotReady")]}' | jq . + ``` + You will initially see `falco.org/FalcoNotReady=True` (not ready). Once Falco is healthy, it becomes `falco.org/FalcoNotReady=False` (ready). + 3. **Check Taint Removal**: -As soon as the condition becomes `True`, the Node Readiness Controller removes the taint, allowing workloads to be scheduled on the node. + As soon as the condition reaches the required status, the Node Readiness Controller removes the taint, allowing workloads to be scheduled on the node. diff --git a/examples/security-agent-readiness/kind-cluster-config.yaml b/examples/security-agent-readiness/kind-cluster-config.yaml new file mode 100644 index 0000000..ce65f2c --- /dev/null +++ b/examples/security-agent-readiness/kind-cluster-config.yaml @@ -0,0 +1,12 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: security-agent-demo +nodes: +- role: control-plane +- role: worker + kubeadmConfigPatches: + - | + kind: JoinConfiguration + nodeRegistration: + kubeletExtraArgs: + register-with-taints: "readiness.k8s.io/security-agent-ready=pending:NoSchedule"