Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
319 changes: 211 additions & 108 deletions aks-node-controller/pkg/gen/aksnodeconfig/v1/kubelet_config.pb.go

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions aks-node-controller/proto/aksnodeconfig/v1/kubelet_config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,41 @@ message KubeletConfigFileConfig {
+optional. */
repeated string enforce_node_allocatable = 37;

/* evictionSoft is a map of signal names to quantities that defines soft eviction thresholds.
Each signal listed here must also have a corresponding entry in evictionSoftGracePeriod.
Soft eviction terminates pods gracefully (respecting terminationGracePeriodSeconds, capped by
evictionMaxPodGracePeriod) once the threshold is breached for the configured grace period.
Used by AKS Node Memory Hardening (F2/F5).
+optional. */
map<string, string> eviction_soft = 41;

/* evictionSoftGracePeriod is a map of signal names to durations defining how long the soft
eviction threshold must be breached before triggering eviction. Each entry must correspond
to a signal listed in evictionSoft. Used by AKS Node Memory Hardening (F2/F5).
+optional. */
map<string, string> eviction_soft_grace_period = 42;

/* evictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when
terminating pods in response to a soft eviction threshold being met. Setting this value
caps the pod's terminationGracePeriodSeconds during soft eviction. Used by AKS Node
Memory Hardening (F2/F5).
+optional. */
int32 eviction_max_pod_grace_period = 43;

/* kubeReservedCgroup is the absolute name of the cgroup the kubelet should manage for the
kube-reserved compute resources. When enforce-node-allocatable contains "kube-reserved",
this cgroup must exist before kubelet starts. Example: "/kubelet.slice".
Used by AKS Node Memory Hardening (F2/F5).
+optional. */
string kube_reserved_cgroup = 44;

/* systemReservedCgroup is the absolute name of the cgroup the kubelet should manage for the
system-reserved compute resources. When enforce-node-allocatable contains "system-reserved",
this cgroup must exist before kubelet starts. Example: "/system.slice".
Used by AKS Node Memory Hardening (F2/F5).
+optional. */
string system_reserved_cgroup = 45;

/* A comma separated whitelist of unsafe sysctls or sysctl patterns (ending in *).
Unsafe sysctl groups are kernel.shm*, kernel.msg*, kernel.sem, fs.mqueue.*, and net.*.
These sysctls are namespaced but not allowed by default.
Expand Down
11 changes: 11 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,17 @@ EOF
local tls_bootstrapping_start_time_filepath="/opt/azure/containers/tls-bootstrap-start-time"
date +"%F %T.%3N" > "${tls_bootstrapping_start_time_filepath}"

# Node Memory Hardening (F2/F5): if the RP rendered --kube-reserved-cgroup or
# --system-reserved-cgroup, ensure the corresponding systemd slices exist before
# kubelet starts so its NodeAllocatable enforcement loop can find them. The
# helper is a no-op when neither value is present (back-compat with non-hardened pools).
resolveKubeletReservedCgroups
if [ -n "${KUBE_RESERVED_CGROUP}" ] || [ -n "${SYSTEM_RESERVED_CGROUP}" ]; then
if ! logs_to_events "AKS.CSE.ensureKubelet.ensureKubeletCgroupHierarchy" ensureKubeletCgroupHierarchy; then
exit $ERR_KUBELET_START_FAIL
fi
fi
Comment thread
mxj220 marked this conversation as resolved.

# start kubelet.service without waiting for the main process to start, though check whether it has entered a failed state after enablement
if ! systemctlEnableAndStartNoBlock kubelet 240; then
# append kubelet status to CSE output to ensure we can see it
Expand Down
128 changes: 128 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1452,4 +1452,132 @@ function get_sandbox_image_from_containerd_config() {

echo "$sandbox_image"
}

# ensureKubeletCgroupHierarchy creates the systemd slices used by kubelet for the
# kube-reserved and system-reserved enforcement tiers (Node Memory Hardening F2/F5).
# It MUST be called before kubelet starts so that /kubelet.slice exists and is
# managed by systemd before the first kubelet enforcement pass (/system.slice is
# a built-in systemd slice and always exists).
#
# The function:
# - Asserts cgroupv2 unified hierarchy (cgroupv1 is not supported by this feature
# because mixed/legacy hierarchies cannot reliably enforce per-slice MemoryMax).
# - Drops a /etc/systemd/system/kubelet.slice unit (system.slice ships with systemd).
# - Triggers `systemctl daemon-reload` and `systemctl start kubelet.slice` so the
# cgroup is materialised at /sys/fs/cgroup/kubelet.slice prior to kubelet boot.
#
# Inputs (env, all optional — the function is a no-op if the RP did not opt in):
# KUBE_RESERVED_CGROUP — absolute cgroup name, e.g. "/kubelet.slice"
# SYSTEM_RESERVED_CGROUP — absolute cgroup name, e.g. "/system.slice"

# resolveKubeletReservedCgroups exports KUBE_RESERVED_CGROUP and SYSTEM_RESERVED_CGROUP
# from either the kubelet config-file JSON (when KUBELET_CONFIG_FILE_ENABLED=true) or
# from KUBELET_FLAGS as a fallback. Both vars are unset (empty string) when the RP did
# not opt the pool into Node Memory Hardening, which keeps ensureKubeletCgroupHierarchy
# a no-op for non-hardened pools.
#
# When kubelet config-file mode is enabled, --kube-reserved-cgroup /
# --system-reserved-cgroup are filtered out of KUBELET_FLAGS by the RP
# (TranslatedKubeletConfigFlags) and rendered into kubeletconfig.json instead, so
# we must source the cgroup names from the JSON in that mode.
resolveKubeletReservedCgroups() {
    # Default to empty so non-hardened pools keep ensureKubeletCgroupHierarchy a no-op.
    KUBE_RESERVED_CGROUP=""
    SYSTEM_RESERVED_CGROUP=""
    if [ "${KUBELET_CONFIG_FILE_ENABLED:-}" = "true" ] && [ -n "${KUBELET_CONFIG_FILE_CONTENT:-}" ]; then
        # Decode the base64 payload once and reuse it for both jq lookups
        # (previously the payload was decoded and parsed twice).
        local decoded_kubelet_config
        decoded_kubelet_config=$(echo "${KUBELET_CONFIG_FILE_CONTENT}" | base64 -d)
        KUBE_RESERVED_CGROUP=$(echo "${decoded_kubelet_config}" | jq -r '.kubeReservedCgroup // ""')
        SYSTEM_RESERVED_CGROUP=$(echo "${decoded_kubelet_config}" | jq -r '.systemReservedCgroup // ""')
    else
        # Config-file mode disabled: the cgroup names (if any) are still raw kubelet flags.
        KUBE_RESERVED_CGROUP=$(extract_value_from_kubelet_flags "${KUBELET_FLAGS:-}" "kube-reserved-cgroup")
        SYSTEM_RESERVED_CGROUP=$(extract_value_from_kubelet_flags "${KUBELET_FLAGS:-}" "system-reserved-cgroup")
    fi
    export KUBE_RESERVED_CGROUP SYSTEM_RESERVED_CGROUP
}

ensureKubeletCgroupHierarchy() {
    # No-op for pools the RP did not opt into Node Memory Hardening.
    if [ -z "${KUBE_RESERVED_CGROUP:-}" ] && [ -z "${SYSTEM_RESERVED_CGROUP:-}" ]; then
        return 0
    fi

    # Path overrides exist for ShellSpec coverage; production callers leave them at
    # their defaults.
    local cgroupv2_marker="${CGROUPV2_MARKER_PATH:-/sys/fs/cgroup/cgroup.controllers}"
    local kubelet_slice_unit="${KUBELET_SLICE_UNIT_PATH:-/etc/systemd/system/kubelet.slice}"
    local kubelet_dropin_dir="${KUBELET_SERVICE_DROPIN_DIR:-/etc/systemd/system/kubelet.service.d}"

    # Assert cgroupv2 unified hierarchy. The canonical marker is the presence of
    # /sys/fs/cgroup/cgroup.controllers, which only exists under cgroupv2.
    if [ ! -f "${cgroupv2_marker}" ]; then
        echo "ensureKubeletCgroupHierarchy: cgroupv2 unified hierarchy not detected; node memory hardening cgroup enforcement requires cgroupv2"
        return 1
    fi

    # Validate supported values: only /kubelet.slice (or bare kubelet.slice) is
    # supported for KUBE_RESERVED_CGROUP, and only /system.slice for
    # SYSTEM_RESERVED_CGROUP (a built-in systemd slice). Reject any other value
    # explicitly so kubelet doesn't fail later with an opaque enforcement error.
    case "${KUBE_RESERVED_CGROUP:-}" in
        ""|"/kubelet.slice"|"kubelet.slice") ;;
        *)
            echo "ensureKubeletCgroupHierarchy: unsupported KUBE_RESERVED_CGROUP=${KUBE_RESERVED_CGROUP}; only /kubelet.slice is supported"
            return 1
            ;;
    esac
    case "${SYSTEM_RESERVED_CGROUP:-}" in
        ""|"/system.slice"|"system.slice") ;;
        *)
            echo "ensureKubeletCgroupHierarchy: unsupported SYSTEM_RESERVED_CGROUP=${SYSTEM_RESERVED_CGROUP}; only /system.slice is supported"
            return 1
            ;;
    esac

    # /system.slice is a built-in systemd slice; we only need to create kubelet.slice.
    if [ "${KUBE_RESERVED_CGROUP:-}" = "/kubelet.slice" ] || [ "${KUBE_RESERVED_CGROUP:-}" = "kubelet.slice" ]; then
        # Track whether any unit file was written so daemon-reload runs at most once.
        local units_changed="false"

        if [ ! -f "${kubelet_slice_unit}" ]; then
            mkdir -p "$(dirname "${kubelet_slice_unit}")"
            # [Install] WantedBy=slices.target ensures the slice is pulled in by
            # systemd on every boot (including post-reboot), not only the current
            # provisioning boot. Combined with the kubelet.service drop-in below
            # this guarantees /sys/fs/cgroup/kubelet.slice is materialised before
            # kubelet starts, so NodeAllocatable enforcement does not race.
            tee "${kubelet_slice_unit}" > /dev/null <<'EOF'
[Unit]
Description=Slice for kubelet kube-reserved enforcement (AKS Node Memory Hardening)
Before=slices.target
DefaultDependencies=no

[Slice]

[Install]
WantedBy=slices.target
EOF
            chmod 0644 "${kubelet_slice_unit}"
            units_changed="true"
        fi

        # Drop-in on kubelet.service so systemd starts kubelet.slice first on
        # every boot; this survives reboots without depending on the one-shot
        # `systemctl start` below. Created independently of the slice unit so a
        # partially-completed earlier run (slice written, drop-in missing) is
        # repaired on retry instead of being skipped.
        if [ ! -f "${kubelet_dropin_dir}/10-kubelet-slice.conf" ]; then
            mkdir -p "${kubelet_dropin_dir}"
            tee "${kubelet_dropin_dir}/10-kubelet-slice.conf" > /dev/null <<'EOF'
[Unit]
Wants=kubelet.slice
After=kubelet.slice
EOF
            chmod 0644 "${kubelet_dropin_dir}/10-kubelet-slice.conf"
            units_changed="true"
        fi

        if [ "${units_changed}" = "true" ]; then
            systemctl daemon-reload
        fi

        # Enable unconditionally (enable is idempotent) so the slice is started on
        # subsequent boots even when a prior run wrote the unit but failed before
        # (or during) this step.
        if ! systemctl enable kubelet.slice; then
            echo "ensureKubeletCgroupHierarchy: failed to enable kubelet.slice"
            return 1
        fi

        # Materialise the cgroup tree at /sys/fs/cgroup/kubelet.slice before kubelet starts on this boot.
        if ! systemctl start kubelet.slice; then
            echo "ensureKubeletCgroupHierarchy: failed to start kubelet.slice"
            return 1
        fi
    fi

    return 0
}
#HELPERSEOF
28 changes: 28 additions & 0 deletions pkg/agent/datamodel/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2229,6 +2229,24 @@ type AKSKubeletConfiguration struct {
imagefs.available: "15%"
+optional. */
EvictionHard map[string]string `json:"evictionHard,omitempty"`
/* evictionSoft is a map of signal names to quantities that defines soft eviction thresholds.
For example: {"memory.available": "300Mi"}.
Each signal listed here must also have a corresponding entry in evictionSoftGracePeriod.
Soft eviction terminates pods gracefully (respecting terminationGracePeriodSeconds, capped by
evictionMaxPodGracePeriod) once the threshold is breached for the configured grace period.
+optional. */
EvictionSoft map[string]string `json:"evictionSoft,omitempty"`
/* evictionSoftGracePeriod is a map of signal names to durations defining how long the soft
eviction threshold must be breached before triggering eviction. Example:
{"memory.available": "30s", "nodefs.available": "2m"}.
Each entry must correspond to a signal listed in evictionSoft.
+optional. */
EvictionSoftGracePeriod map[string]string `json:"evictionSoftGracePeriod,omitempty"`
/* evictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when
terminating pods in response to a soft eviction threshold being met. Setting this value
caps the pod's terminationGracePeriodSeconds during soft eviction.
+optional. */
EvictionMaxPodGracePeriod int32 `json:"evictionMaxPodGracePeriod,omitempty"`
/* protectKernelDefaults, if true, causes the Kubelet to error if kernel
flags are not as it expects. Otherwise the Kubelet will attempt to modify
kernel flags to match its expectation.
Expand Down Expand Up @@ -2308,6 +2326,16 @@ type AKSKubeletConfiguration struct {
Default: ["pods"]
+optional. */
EnforceNodeAllocatable []string `json:"enforceNodeAllocatable,omitempty"`
/* kubeReservedCgroup is the absolute name of the cgroup the kubelet should manage
for the kube-reserved compute resources. When enforce-node-allocatable contains
"kube-reserved", this cgroup must exist before kubelet starts. Example: "/kubelet.slice".
+optional. */
KubeReservedCgroup string `json:"kubeReservedCgroup,omitempty"`
/* systemReservedCgroup is the absolute name of the cgroup the kubelet should manage
for the system-reserved compute resources. When enforce-node-allocatable contains
"system-reserved", this cgroup must exist before kubelet starts. Example: "/system.slice".
+optional. */
SystemReservedCgroup string `json:"systemReservedCgroup,omitempty"`
/* A comma separated whitelist of unsafe sysctls or sysctl patterns (ending in *).
Unsafe sysctl groups are kernel.shm*, kernel.msg*, kernel.sem, fs.mqueue.*, and net.*.
These sysctls are namespaced but not allowed by default.
Expand Down
32 changes: 27 additions & 5 deletions pkg/agent/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,18 @@ var TranslatedKubeletConfigFlags = map[string]bool{
"--cluster-domain": true,
"--max-pods": true,
"--eviction-hard": true,
"--eviction-soft": true,
"--eviction-soft-grace-period": true,
"--eviction-max-pod-grace-period": true,
Comment thread
mxj220 marked this conversation as resolved.
"--node-status-update-frequency": true,
"--node-status-report-frequency": true,
"--image-gc-high-threshold": true,
"--image-gc-low-threshold": true,
"--event-qps": true,
"--pod-max-pids": true,
"--enforce-node-allocatable": true,
"--kube-reserved-cgroup": true,
"--system-reserved-cgroup": true,
"--streaming-connection-idle-timeout": true,
"--rotate-certificates": true,
"--rotate-server-certificates": true,
Expand Down Expand Up @@ -500,6 +505,8 @@ func getAKSKubeletConfiguration(kc map[string]string) *datamodel.AKSKubeletConfi
EventRecordQPS: strToInt32Ptr(kc["--event-qps"]),
PodPidsLimit: strToInt64Ptr(kc["--pod-max-pids"]),
EnforceNodeAllocatable: strings.Split(kc["--enforce-node-allocatable"], ","),
KubeReservedCgroup: kc["--kube-reserved-cgroup"],
SystemReservedCgroup: kc["--system-reserved-cgroup"],
StreamingConnectionIdleTimeout: datamodel.Duration(kc["--streaming-connection-idle-timeout"]),
RotateCertificates: strToBool(kc["--rotate-certificates"]),
ServerTLSBootstrap: strToBool(kc["--rotate-server-certificates"]),
Expand Down Expand Up @@ -592,7 +599,22 @@ func GetKubeletConfigFileContent(kc map[string]string, customKc *datamodel.Custo
// EvictionHard.
// default: "memory.available<750Mi,nodefs.available<10%,nodefs.inodesFree<5%".
if eh, ok := kc["--eviction-hard"]; ok && eh != "" {
kubeletConfig.EvictionHard = strKeyValToMap(eh, ",", "<")
kubeletConfig.EvictionHard = strKeyValToMap(eh, "<")
}

// EvictionSoft (e.g. "memory.available<500Mi,nodefs.available<15%,imagefs.available<20%").
if es, ok := kc["--eviction-soft"]; ok && es != "" {
kubeletConfig.EvictionSoft = strKeyValToMap(es, "<")
}

// EvictionSoftGracePeriod (e.g. "memory.available=30s,nodefs.available=2m,imagefs.available=2m").
if esg, ok := kc["--eviction-soft-grace-period"]; ok && esg != "" {
kubeletConfig.EvictionSoftGracePeriod = strKeyValToMap(esg, "=")
}

// EvictionMaxPodGracePeriod (integer seconds, e.g. "60").
if v, ok := kc["--eviction-max-pod-grace-period"]; ok && v != "" {
kubeletConfig.EvictionMaxPodGracePeriod = strToInt32(v)
}

// feature gates.
Expand All @@ -601,8 +623,8 @@ func GetKubeletConfigFileContent(kc map[string]string, customKc *datamodel.Custo

// system reserve and kube reserve.
// looks like "cpu=100m,memory=1638Mi".
kubeletConfig.SystemReserved = strKeyValToMap(kc["--system-reserved"], ",", "=")
kubeletConfig.KubeReserved = strKeyValToMap(kc["--kube-reserved"], ",", "=")
kubeletConfig.SystemReserved = strKeyValToMap(kc["--system-reserved"], "=")
kubeletConfig.KubeReserved = strKeyValToMap(kc["--kube-reserved"], "=")

// Settings from customKubeletConfig, only take if it's set.
setCustomKubeletConfig(customKc, kubeletConfig)
Expand Down Expand Up @@ -653,9 +675,9 @@ func strToInt64Ptr(str string) *int64 {
return &i
}

func strKeyValToMap(str string, strDelim string, pairDelim string) map[string]string {
func strKeyValToMap(str string, pairDelim string) map[string]string {
m := make(map[string]string)
pairs := strings.Split(str, strDelim)
pairs := strings.Split(str, ",")
for _, pairRaw := range pairs {
pair := strings.Split(pairRaw, pairDelim)
if len(pair) == numInPair {
Expand Down
Loading
Loading