diff --git a/modules/python/clusterloader2/clustermesh-scale/config/annotate-namespaces.sh b/modules/python/clusterloader2/clustermesh-scale/config/annotate-namespaces.sh
new file mode 100755
index 0000000000..9c3fb1b5f3
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/annotate-namespaces.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Annotate workload namespaces for ACNS (managed Cilium) opt-in cross-cluster sync.
+#
+# AKS-managed Cilium ships with `clustermesh-default-global-namespace=false`
+# (opt-in mode, per ACNS team confirmation 2026-05-11 from David Vadas /
+# Isaiah Raya), unlike upstream Cilium, which defaults to opt-out. Without
+# the `clustermesh.cilium.io/global: "true"` annotation on the workload
+# namespace, NONE of the namespace's resources (CiliumIdentity,
+# CiliumEndpoint, CiliumEndpointSlice, Services, ServiceExports) sync
+# across the mesh — even if the Service object itself carries
+# `service.cilium.io/global: "true"`. The namespace annotation is
+# load-bearing; once present, Cilium auto-applies the service-level
+# semantics to all services in that namespace.
+#
+# This script is invoked via `Method: Exec` from each scale-test scenario's
+# top-level CL2 config (event-throughput.yaml, pod-churn-*.yaml). It runs
+# AFTER CL2 has created the test namespaces (`<prefix>-1..N`) and BEFORE the
+# workload deploy phase, so cross-cluster sync is enabled from the first
+# resource creation.
+#
+# The pre-staged kubectl binary at /root/perf-tests/clusterloader2/config/kubectl
+# (set up by steps/engine/clusterloader2/clustermesh-scale/execute.yml) is
+# used because the CL2 image does not bundle kubectl.
+#
+# Positional args:
+#   $1 NAMESPACE_COUNT   How many namespaces (matches CL2's `namespace.number`).
+#   $2 NAMESPACE_PREFIX  Namespace prefix (matches CL2's `namespace.prefix`).
+
+set -u
+set -o pipefail
+
+NAMESPACE_COUNT="${1:-0}"
+NAMESPACE_PREFIX="${2:-}"
+
+if [ -z "${NAMESPACE_PREFIX}" ] || [ "${NAMESPACE_COUNT}" -lt 1 ]; then
+    echo "annotate-namespaces ERROR: need positional args (count, prefix); got count='${NAMESPACE_COUNT}' prefix='${NAMESPACE_PREFIX}'"
+    exit 2
+fi
+
+# Prefer PATH kubectl, fall back to the pre-staged binary the pipeline
+# downloads into the bind-mounted config dir. Mirrors pod-churn-killer.sh's
+# fallback path so both scripts behave consistently if the CL2 image
+# eventually starts bundling kubectl.
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    echo "annotate-namespaces: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "annotate-namespaces ERROR: kubectl not in PATH and pre-staged binary missing"
+    exit 127
+fi
+
+ANNOTATION="clustermesh.cilium.io/global=true"
+echo "annotate-namespaces: applying ${ANNOTATION} to ${NAMESPACE_COUNT} namespaces with prefix '${NAMESPACE_PREFIX}'"
+
+FAIL_COUNT=0
+for i in $(seq 1 "${NAMESPACE_COUNT}"); do
+    NS="${NAMESPACE_PREFIX}-${i}"
+    # --overwrite tolerates re-runs (CL2 retries, multi-step configs). The
+    # namespace MUST already exist — CL2 creates managed namespaces before
+    # the first test step runs. If it's missing here, that's a real bug
+    # worth surfacing as an error (don't --ignore-not-found).
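+    # To spot-check the annotation after a run (illustrative one-liner; the
+    # bracket/escaped-dot jsonpath form is what kubectl expects for keys
+    # containing dots):
+    #   kubectl get ns "${NAMESPACE_PREFIX}-1" \
+    #     -o jsonpath="{.metadata.annotations['clustermesh\.cilium\.io/global']}"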
+    if "${KUBECTL}" annotate namespace "${NS}" "${ANNOTATION}" --overwrite >/dev/null 2>&1; then
+        echo "annotate-namespaces: ${NS} annotated"
+    else
+        echo "annotate-namespaces ERROR: failed to annotate ${NS}"
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+    fi
+done
+
+if [ "${FAIL_COUNT}" -gt 0 ]; then
+    echo "annotate-namespaces: ${FAIL_COUNT}/${NAMESPACE_COUNT} namespaces failed annotation"
+    exit 1
+fi
+
+echo "annotate-namespaces: done, ${NAMESPACE_COUNT} namespaces annotated"
+exit 0
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure-killer.sh b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure-killer.sh
new file mode 100755
index 0000000000..363f9bbb54
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure-killer.sh
@@ -0,0 +1,253 @@
+#!/bin/bash
+# Scenario #4 (ClusterMesh APIServer Failure) — kills the clustermesh-apiserver
+# pod on the designated target cluster, then waits for the replacement pod
+# to reach Ready. Records timestamps for post-hoc recovery-time analysis.
+#
+# Per-cluster CL2 execution model: this script runs from inside EVERY
+# cluster's CL2 docker container, but no-ops on non-target clusters. The
+# target is identified by `kubectl config current-context` — `az aks
+# get-credentials` writes context = AKS cluster name (e.g. "clustermesh-1"),
+# which matches what we pass as the target arg.
+#
+# Positional args:
+#   $1 TARGET_CONTEXT            kubectl context name of the target cluster
+#                                (e.g. "clustermesh-1"). Skip if mismatched.
+#   $2 RECOVERY_TIMEOUT_SECONDS  How long to wait for replacement pod Ready.
+#   $3 REPORT_DIR                (optional) Path inside the CL2 container
+#                                where the timing JSON is written. Defaults
+#                                to /root/perf-tests/clusterloader2/results.
+#
+# Output:
+#   Writes $REPORT_DIR/ApiserverFailureTimings_<context>.json (target only).
+#   scale.py collect reads this file and emits an ApiserverFailureRecoveryTiming
+#   row into the aggregated JSONL.
+#
+# Exit codes:
+#   0   — non-target (no-op), target with verified kill + recovery, OR
+#         target recovery timeout (Phase 4b soft-fail: the timing file is
+#         written with `recovered:false` so collect can still surface that
+#         the scenario was attempted but did not recover within budget).
+#   1   — target attempt failed before/at the kill (no pod matched the
+#         label selector, kubectl delete failed).
+#   127 — kubectl unavailable (environment bug; no timing file written).
+
+set -uo pipefail
+
+TARGET_CONTEXT="${1:-clustermesh-1}"
+RECOVERY_TIMEOUT_SECONDS="${2:-120}"
+REPORT_DIR="${3:-/root/perf-tests/clusterloader2/results}"
+
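+# Quick manual inspection of the timing file after a run (illustrative;
+# assumes jq on whatever host you copy the file to — the CL2 image itself
+# may not bundle it):
+#   jq '{recovered, duration_seconds}' \
+#     /root/perf-tests/clusterloader2/results/ApiserverFailureTimings_clustermesh-1.json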
+# Same fallback pattern as pod-churn-killer.sh — prefer PATH kubectl, fall
+# back to the pre-staged binary at the bind-mounted config dir.
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    echo "apiserver-failure-killer: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "apiserver-failure-killer ERROR: kubectl not in PATH and pre-staged binary missing"
+    exit 127
+fi
+
+CURRENT_CONTEXT=$("${KUBECTL}" config current-context 2>/dev/null || echo "unknown")
+echo "apiserver-failure-killer: current=${CURRENT_CONTEXT} target=${TARGET_CONTEXT}"
+
+if [ "${CURRENT_CONTEXT}" != "${TARGET_CONTEXT}" ]; then
+    echo "apiserver-failure-killer: not target cluster, no-op"
+    exit 0
+fi
+
+# ----- Target cluster path -----
+mkdir -p "${REPORT_DIR}"
+TIMING_FILE="${REPORT_DIR}/ApiserverFailureTimings_${CURRENT_CONTEXT}.json"
+
+write_timing() {
+    # Args: t0_epoch t1_epoch_or_zero recovered_flag pod_name pod_uid_old pod_uid_new note
+    local t0="$1" t1="$2" recovered="$3" pod_name="$4" uid_old="$5" uid_new="$6" note="$7"
+    local dur=0
+    if [ "${t1}" -gt 0 ] && [ "${t0}" -gt 0 ]; then
+        dur=$((t1 - t0))
+    fi
+    # One JSON object per run; consumed by scale.py collect.
+    cat > "${TIMING_FILE}" <<EOF
+{
+  "t0_epoch": ${t0},
+  "t1_epoch": ${t1},
+  "duration_seconds": ${dur},
+  "recovered": ${recovered},
+  "pod_name": "${pod_name}",
+  "pod_uid_old": "${uid_old}",
+  "pod_uid_new": "${uid_new}",
+  "note": "${note}"
+}
+EOF
+}
+
+# 1. Snapshot the FULL pre-kill pod set (name, UID, Ready). With HA
+#    replicas>1 (scenario #7), the wait-for-new-pod loop must distinguish
+#    "new replacement pod" from "the OTHER surviving replicas that were
+#    already Ready before the kill" — a single-UID compare matches the
+#    surviving pods immediately and falsely reports recovered=0s.
+#    Rubber-duck critique blocker #2.
+PRE_KILL_PODS=$("${KUBECTL}" -n kube-system get pods \
+    -l k8s-app=clustermesh-apiserver \
+    -o 'jsonpath={range .items[*]}{.metadata.name}={.metadata.uid}={.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
+    2>/dev/null | grep -v '^$')
+
+if [ -z "${PRE_KILL_PODS}" ]; then
+    echo "apiserver-failure-killer ERROR: no clustermesh-apiserver pod matched label selector"
+    PRE_KILL_REPLICAS=0
+    READY_PODS_AT_KILL=0
+    write_timing 0 0 false "" "" "" "no pod matched label selector k8s-app=clustermesh-apiserver"
+    exit 1
+fi
+
+PRE_KILL_REPLICAS=$(echo "${PRE_KILL_PODS}" | wc -l | tr -d ' ')
+READY_PODS_AT_KILL=$(echo "${PRE_KILL_PODS}" | awk -F'=' '$3=="True"{c++} END{print c+0}')
+# Newline-separated list of pre-kill UIDs — used to filter the recovery
+# wait loop's candidate set.
+PRE_KILL_UIDS=$(echo "${PRE_KILL_PODS}" | awk -F'=' '{print $2}')
+
+# Pick the first Ready pod as the kill target (preserves prior behavior for
+# scenario #4). If no Ready pod, fall back to the first pod.
+TARGET_LINE=$(echo "${PRE_KILL_PODS}" | awk -F'=' '$3=="True"{print; exit}')
+if [ -z "${TARGET_LINE}" ]; then
+    TARGET_LINE=$(echo "${PRE_KILL_PODS}" | head -1)
+fi
+POD_NAME="${TARGET_LINE%%=*}"
+_REST="${TARGET_LINE#*=}"
+POD_UID="${_REST%=*}"
+echo "apiserver-failure-killer: pre-kill replicas=${PRE_KILL_REPLICAS} ready=${READY_PODS_AT_KILL}"
+echo "apiserver-failure-killer: target pod ${POD_NAME} uid=${POD_UID}"
+
+# 2. Delete exactly that pod by name (not by label selector — prevents
+#    accidental multi-pod kill on future HA setups).
+T0=$(date +%s)
+echo "apiserver-failure-killer: t0=${T0} deleting pod ${POD_NAME} (hard kill, --grace-period=0 --force)"
+if ! "${KUBECTL}" -n kube-system delete pod "${POD_NAME}" \
+    --grace-period=0 --force >/dev/null 2>&1; then
+    echo "apiserver-failure-killer ERROR: kubectl delete pod ${POD_NAME} failed"
+    write_timing "${T0}" 0 false "${POD_NAME}" "${POD_UID}" "" "kubectl delete failed"
+    exit 1
+fi
+
+# 3. Wait for replacement pod to reach Ready. Per rubber-duck #6:
+#    Ready (not just Running) is what matters — the apiserver may be Running
+#    while still loading certs / unable to serve mesh traffic.
+#
+# Periodic state samples (every 30s) write to a diag log so we can see
+# what kubelet/scheduler/operator were doing during recovery — instead
+# of just "timed out" with no signal.
+DIAG_LOG="${REPORT_DIR}/ApiserverFailureDiag_${CURRENT_CONTEXT}.log"
+: > "${DIAG_LOG}"
+
+dump_state() {
+    local label="$1"
+    {
+        echo "===== ${label} at $(date -u +"%Y-%m-%dT%H:%M:%SZ") (epoch=$(date +%s)) ====="
+        echo "--- pods (k8s-app=clustermesh-apiserver) ---"
+        "${KUBECTL}" -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide 2>&1 || true
+        echo "--- pod UIDs + readiness ---"
+        "${KUBECTL}" -n kube-system get pods -l k8s-app=clustermesh-apiserver \
+            -o 'jsonpath={range .items[*]}{.metadata.name}{" uid="}{.metadata.uid}{" phase="}{.status.phase}{" ready="}{.status.conditions[?(@.type=="Ready")].status}{" reason="}{.status.conditions[?(@.type=="Ready")].reason}{"\n"}{end}' 2>&1 || true
+        # tee'd to BOTH the file AND stdout so the AzDO step log carries the
+        # same diag info as the file. AzDO pipeline artifacts aren't published
+        # for our scenarios — the agent's report dir is torn down with the job
+        # — so without stdout duplication the diag is unreachable.
+    } 2>&1 | tee -a "${DIAG_LOG}"
+}
+
+RECOVERY_DEADLINE=$((T0 + RECOVERY_TIMEOUT_SECONDS))
+NEW_POD_NAME=""
+NEW_POD_UID=""
+NEXT_SAMPLE=$((T0 + 30))
+while [ "$(date +%s)" -lt "${RECOVERY_DEADLINE}" ]; do
+    # Find any clustermesh-apiserver pod whose UID is NEW (not in the pre-kill
+    # UID set) AND whose Ready condition is True.
+    #
+    # BUG-FIX 2026-05-13a: original kubectl jsonpath nested `[?]` filter is
+    # broken — switched to a shell-side filter listing all pods.
+    #
+    # BUG-FIX 2026-05-13b: original filter compared against a SINGLE killed-pod
+    # UID. With HA replicas>1 (scenario #7), the surviving N-1 replicas already
+    # have different UIDs and are Ready, so the filter would match one of them
+    # instantly → false `recovered after 0s`. Rubber-duck critique blocker #2.
+    # Fix: filter against the pre-kill UID set (every pod present at kill time),
+    # so only a genuinely new replacement pod passes.
+    ALL_PODS=$("${KUBECTL}" -n kube-system get pods \
+        -l k8s-app=clustermesh-apiserver \
+        -o 'jsonpath={range .items[*]}{.metadata.name}={.metadata.uid}={.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
+        2>/dev/null | grep -v '^$' | grep '=True$')
+    CANDIDATE=""
+    if [ -n "${ALL_PODS}" ]; then
+        while IFS= read -r _line; do
+            [ -z "${_line}" ] && continue
+            # _line format: name=uid=True
+            _name_uid="${_line%=*}"   # name=uid
+            _uid="${_name_uid#*=}"    # uid
+            _in_set=0
+            for _old_uid in ${PRE_KILL_UIDS}; do
+                if [ "${_uid}" = "${_old_uid}" ]; then
+                    _in_set=1
+                    break
+                fi
+            done
+            if [ "${_in_set}" -eq 0 ]; then
+                CANDIDATE="${_line}"
+                break
+            fi
+        done <<<"${ALL_PODS}"
+    fi
+    if [ -n "${CANDIDATE}" ]; then
+        _name_uid="${CANDIDATE%=*}"
+        NEW_POD_NAME="${_name_uid%%=*}"
+        NEW_POD_UID="${_name_uid#*=}"
+        T1=$(date +%s)
+        break
+    fi
+    _now=$(date +%s)
+    if [ "${_now}" -ge "${NEXT_SAMPLE}" ]; then
+        dump_state "recovery-wait"
+        NEXT_SAMPLE=$((_now + 30))
+    fi
+    sleep 2
+done
+
+if [ -z "${NEW_POD_NAME}" ]; then
+    echo "apiserver-failure-killer ERROR: no new Ready pod within ${RECOVERY_TIMEOUT_SECONDS}s"
+    {
+        echo "===== TIMEOUT DIAG at $(date -u +"%Y-%m-%dT%H:%M:%SZ") ====="
+        echo "--- deployment ---"
+        "${KUBECTL}" -n kube-system get deployment clustermesh-apiserver -o wide 2>&1 || true
+        echo "--- describe ALL clustermesh-apiserver pods ---"
+        for p in $("${KUBECTL}" -n kube-system get pods -l k8s-app=clustermesh-apiserver -o name 2>/dev/null); do
+            echo "--- $p ---"
+            "${KUBECTL}" -n kube-system describe "$p" 2>&1 || true
+        done
+        echo "--- recent kube-system events ---"
+        "${KUBECTL}" -n kube-system get events --sort-by=.lastTimestamp 2>&1 | tail -50 || true
+    } 2>&1 | tee -a "${DIAG_LOG}"
+    echo "apiserver-failure-killer: diag dump written to ${DIAG_LOG}"
+    write_timing "${T0}" 0 false "${POD_NAME}" "${POD_UID}" "" "recovery timeout"
+    # Phase 4b: exit 0 on timeout (NOT 1).
The timing JSON with + # `recovered:false` is the load-bearing signal that the scenario was + # attempted but did not recover within budget — Kusto queries on + # ApiserverFailureRecoveryTiming.recovered will flag this. Exiting 1 + # here would cascade-fail the CL2 step → execute.yml's overall_rc=1 → + # share-infra step exits with SucceededWithIssues at worst, but + # peer-cluster measurements (which DID gather data about the failure + # event) would also be wasted. Soft-fail is correct: rubber-duck + # critique #10 confirmed. + exit 0 +fi + +DUR=$((T1 - T0)) +echo "apiserver-failure-killer: recovered after ${DUR}s; new pod ${NEW_POD_NAME} uid=${NEW_POD_UID}" +write_timing "${T0}" "${T1}" true "${POD_NAME}" "${POD_UID}" "${NEW_POD_UID}" "ok" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure.yaml b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure.yaml new file mode 100644 index 0000000000..f444e6fd4d --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure.yaml @@ -0,0 +1,231 @@ +name: clustermesh-apiserver-failure + +# Scale scenario #4: ClusterMesh APIServer Failure. +# +# Goal (scale testing.txt line 80-91): validate resilience and recovery +# behavior when ONE clustermesh-apiserver pod dies in a meshed cluster. +# Measure detection time (how fast peers notice), recovery time (how fast +# the pod is replaced + serving), backlog drain time (how fast queues +# clear after recovery). +# +# Single-cluster failure pattern: kill the apiserver pod on a designated +# target cluster (default "clustermesh-1"). Other clusters' CL2 invocations +# run the same script but no-op based on `kubectl config current-context` +# comparison. The target cluster's killer records t0/t1 timestamps in a +# JSON file at the report dir; scale.py collect picks it up and surfaces +# the timing as an `ApiserverFailureRecoveryTiming` row in the JSONL. +# +# Per-cluster Prometheus must be running on every cluster DURING the kill +# for peer-side observations to land. With CL2_MAX_CONCURRENT < mesh_size, +# only some clusters' Prom are active simultaneously; at n=2/n=5 this is +# fine (concurrency=4 default >= cluster count), but at n=20 we may need +# to bump max_concurrent or accept partial peer observation. See plan.md +# Phase 4b notes. +# +# Sequence: +# 1. Annotate workload namespaces (CFP-39876 opt-in). +# 2. Start measurements. +# 3. Deploy PodMonitor + workload (200 pods + global services, same +# pattern as event-throughput). +# 4. Initial WaitForControlledPodsRunning gate. +# 5. Warmup sleep — mesh stabilizes. +# 6. Method:Exec → apiserver-failure-killer.sh. On target cluster: +# verifies pod identity, hard-kills it, waits for new Ready pod, +# writes timing JSON. On non-target clusters: no-op. +# 7. Observation sleep — let detection + recovery happen. +# 8. Settle sleep — backlog drain. +# 9. Gather measurements (mirrors start). +# 10. Teardown. 
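+# The CL2_* knobs below normally arrive from the pipeline matrix; CL2 can
+# also read them from an overrides file (illustrative snippet, assuming the
+# stock clusterloader2 --testoverrides mechanism):
+#   CL2_APISERVER_KILL_TARGET_CONTEXT: "clustermesh-3"
+#   CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: 300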
+ +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "60s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "60s"}} +{{$apiserverKillTargetContext := DefaultParam .CL2_APISERVER_KILL_TARGET_CONTEXT "clustermesh-1"}} +{{$apiserverKillRecoveryTimeoutSeconds := DefaultParam .CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS 240}} +{{$apiserverKillObservationSeconds := DefaultParam .CL2_APISERVER_KILL_OBSERVATION_SECONDS 60}} + +{{$workloadGroup := "clustermesh-apiserver-failure"}} +{{$workloadBasename := "apf"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-apf + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- CFP-39876 opt-in: annotate workload namespaces ----- + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-apf" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking apiserver-failure Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-apf-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial apiserver-failure pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-apf-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before kill + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- KILL APISERVER (target cluster only) ----- + - name: Kill apiserver on target cluster + measurements: + - Identifier: ApiserverFailureKiller + Method: Exec + Params: + streamOutput: true + # Generous timeout: covers warmup-budget-exceeded + 
recovery_timeout + # + slow pod schedule. Worst-case ~3min. + timeout: 5m + command: + - bash + - /root/perf-tests/clusterloader2/config/apiserver-failure-killer.sh + - "{{$apiserverKillTargetContext}}" + - "{{$apiserverKillRecoveryTimeoutSeconds}}" + + # ----- Observation window: peers detect failure, then see recovery ----- + - name: Observe during failure + recovery + measurements: + - Identifier: ObservationSleep + Method: Sleep + Params: + duration: {{$apiserverKillObservationSeconds}}s + + # ----- Settle: backlog drain post-recovery ----- + - name: Settle for backlog drain + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml index 439fdc4e71..bbb6327e92 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml @@ -47,6 +47,25 @@ tuningSets: qps: {{$apiServerCallsPerSecond}} steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics are structurally 0. See plan.md + # note #14 + ACNS team confirmation 2026-05-11. + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-et" + # ----- Start measurements ----- - module: path: /modules/measurements/control-plane.yaml diff --git a/modules/python/clusterloader2/clustermesh-scale/config/ha-config-scaler.sh b/modules/python/clusterloader2/clustermesh-scale/config/ha-config-scaler.sh new file mode 100755 index 0000000000..fc91a6fc05 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/ha-config-scaler.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# Scenario #7 (HA Configuration Validation) — scales the clustermesh-apiserver +# Deployment up/down to compare resource overhead, failover behavior, and event +# duplication between single-replica and multi-replica HA configurations. 
+#
+# Unlike apiserver-failure-killer.sh (which targets a single cluster), this
+# script runs on EVERY cluster's CL2 instance and scales each cluster's own
+# clustermesh-apiserver. Mesh-wide HA is the realistic production config; only
+# scaling one cluster would conflate HA-overhead measurements with a
+# single-cluster outlier.
+#
+# Positional args:
+#   $1 ACTION      scale-up | scale-down
+#   $2 REPLICAS    Target replica count (required for scale-up; ignored for
+#                  scale-down, which always restores to 1).
+#   $3 REPORT_DIR  (optional) Path inside the CL2 container where timing JSON
+#                  is written. Defaults to /root/perf-tests/clusterloader2/results.
+#
+# Output:
+#   On scale-up only, writes $REPORT_DIR/HAConfigScalingTimings_<context>.json
+#   with the scale duration, observed spec/ready replicas, and a
+#   ha_replicas_honored flag (true iff spec==REPLICAS AND ready==REPLICAS at
+#   the end of a 30s post-rollout poll window — catches ENO revert).
+#   scale.py collect emits one HAConfigScalingTiming JSONL row per file.
+#
+# Exit codes:
+#   0 — always (soft-fail). Scale-up failures still emit the timing file with
+#       ha_replicas_honored:false so Kusto queries can flag degraded HA runs.
+
+set -uo pipefail
+
+ACTION="${1:?action required: scale-up|scale-down}"
+REPLICAS="${2:-1}"
+REPORT_DIR="${3:-/root/perf-tests/clusterloader2/results}"
+
+# kubectl resolution: PATH first, then pre-staged binary (same pattern as
+# apiserver-failure-killer.sh and pod-churn-killer.sh).
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    echo "ha-config-scaler: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "ha-config-scaler ERROR: kubectl not in PATH and pre-staged binary missing"
+    exit 0
+fi
+
+CURRENT_CONTEXT=$("${KUBECTL}" config current-context 2>/dev/null || echo "unknown")
+mkdir -p "${REPORT_DIR}"
+TIMING_FILE="${REPORT_DIR}/HAConfigScalingTimings_${CURRENT_CONTEXT}.json"
+
+emit_timing() {
+    # Args: action requested_replicas spec_replicas_after ready_replicas_after honored duration_s note
+    local action="$1" requested="$2" spec_after="$3" ready_after="$4"
+    local honored="$5" dur="$6" note="$7"
+    # One JSON object per cluster; consumed by scale.py collect.
+    cat > "${TIMING_FILE}" <<EOF
+{
+  "action": "${action}",
+  "requested_replicas": ${requested},
+  "spec_replicas_after": ${spec_after},
+  "ready_replicas_after": ${ready_after},
+  "ha_replicas_honored": ${honored},
+  "duration_seconds": ${dur},
+  "note": "${note}"
+}
+EOF
+}
+
+get_spec_ready() {
+    local spec ready
+    spec=$("${KUBECTL}" -n kube-system get deployment clustermesh-apiserver \
+        -o jsonpath='{.spec.replicas}' 2>/dev/null || echo 0)
+    ready=$("${KUBECTL}" -n kube-system get deployment clustermesh-apiserver \
+        -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo 0)
+    echo "${spec:-0} ${ready:-0}"
+}
+
+T0=$(date +%s)
+
+case "${ACTION}" in
+  scale-up)
+    echo "ha-config-scaler: scale-up clustermesh-apiserver to ${REPLICAS} replicas on ${CURRENT_CONTEXT}"
+    if ! "${KUBECTL}" -n kube-system scale deployment clustermesh-apiserver \
+        --replicas="${REPLICAS}" >/dev/null 2>&1; then
+      echo "ha-config-scaler WARN: kubectl scale command failed"
+      emit_timing "scale-up" "${REPLICAS}" 0 0 false 0 "kubectl scale failed"
+      exit 0
+    fi
+
+    # Phase 1: wait for spec.replicas==REPLICAS AND status.readyReplicas==REPLICAS.
+    # 240s budget covers initial image pull + ENI attach on AKS-managed Cilium
+    # (we observed 30-60s pod schedule + 60s pull for single-pod recovery; HA
+    # rollouts are sequential per the RollingUpdate strategy).
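+    # (A plain `kubectl rollout status deployment/clustermesh-apiserver
+    # --timeout=240s` would also block until Ready; explicit spec/ready
+    # polling is used instead so a timeout can still record partial progress,
+    # the observed spec/ready counts, in the timing JSON.)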
+    ROLLOUT_DEADLINE=$((T0 + 240))
+    spec=0
+    ready=0
+    while [ "$(date +%s)" -lt "${ROLLOUT_DEADLINE}" ]; do
+      read -r spec ready <<<"$(get_spec_ready)"
+      if [ "${spec}" -eq "${REPLICAS}" ] && [ "${ready}" -eq "${REPLICAS}" ]; then
+        break
+      fi
+      sleep 2
+    done
+
+    if [ "${spec}" -ne "${REPLICAS}" ] || [ "${ready}" -ne "${REPLICAS}" ]; then
+      T1=$(date +%s)
+      DUR=$((T1 - T0))
+      echo "ha-config-scaler WARN: rollout did not reach ${REPLICAS} replicas after ${DUR}s (spec=${spec} ready=${ready})"
+      emit_timing "scale-up" "${REPLICAS}" "${spec}" "${ready}" false "${DUR}" "rollout timeout"
+      exit 0
+    fi
+
+    # Phase 2: ENO-revert detection. AKS-managed Cilium tags the Deployment
+    # with `app.kubernetes.io/actually-managed-by=Eno`; the ENO operator
+    # reconciles to desired state on its own cadence. If it reverts our
+    # scale within 30s of rollout completion, the rest of the scenario will
+    # run on degraded replicas — useful to record but not useful for HA A/B
+    # comparison.
+    REVERT_DEADLINE=$(($(date +%s) + 30))
+    honored=true
+    final_spec=${spec}
+    final_ready=${ready}
+    while [ "$(date +%s)" -lt "${REVERT_DEADLINE}" ]; do
+      read -r final_spec final_ready <<<"$(get_spec_ready)"
+      if [ "${final_spec}" -ne "${REPLICAS}" ]; then
+        honored=false
+        echo "ha-config-scaler WARN: ENO reverted scale within 30s — spec=${final_spec}"
+        break
+      fi
+      sleep 2
+    done
+
+    T1=$(date +%s)
+    DUR=$((T1 - T0))
+    NOTE="ok"
+    [ "${honored}" = "false" ] && NOTE="eno_reverted"
+    emit_timing "scale-up" "${REPLICAS}" "${final_spec}" "${final_ready}" "${honored}" "${DUR}" "${NOTE}"
+    echo "ha-config-scaler: scale-up complete in ${DUR}s, spec=${final_spec} ready=${final_ready} honored=${honored}"
+    ;;
+
+  scale-down)
+    echo "ha-config-scaler: scale-down clustermesh-apiserver to 1 replica on ${CURRENT_CONTEXT} (cleanup)"
+    # Best-effort. Failure here is non-blocking — the cluster is about to be
+    # destroyed anyway. We do NOT overwrite the scale-up timing JSON.
+    "${KUBECTL}" -n kube-system scale deployment clustermesh-apiserver \
+        --replicas=1 >/dev/null 2>&1 || true
+    read -r spec ready <<<"$(get_spec_ready)"
+    echo "ha-config-scaler: scale-down attempted; current spec=${spec} ready=${ready}"
+    ;;
+
+  *)
+    echo "ha-config-scaler ERROR: unknown action '${ACTION}' (expected scale-up|scale-down)"
+    exit 0
+    ;;
+esac
+
+exit 0
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/ha-config.yaml b/modules/python/clusterloader2/clustermesh-scale/config/ha-config.yaml
new file mode 100644
index 0000000000..c0f812a81b
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/ha-config.yaml
@@ -0,0 +1,264 @@
+name: clustermesh-ha-config
+
+# Scale scenario #7: HA Configuration Validation.
+#
+# Goal (scale testing.txt line 115-126): compare identical workloads with
+# clustermesh-apiserver replicas=1 (baseline = scenario #4) vs replicas=N
+# (HA on). Measure resource overhead, failover time, event duplication.
+#
+# Design: this scenario clones scenario #4 (apiserver-failure) and adds two
+# new pre/post steps:
+#   - BEFORE measurements start: scale clustermesh-apiserver Deployment on
+#     EVERY cluster to CL2_HA_CONFIG_REPLICAS (default 3). Mesh-wide HA is
+#     the realistic production config; scaling only the target cluster would
+#     conflate HA-overhead measurements with single-cluster outliers.
+#   - AFTER gather: scale back to 1 replica (cleanup). Best-effort; the
+#     cluster is destroyed shortly after anyway.
+#
+# Cross-scenario A/B in Kusto: query rows where test_type in
+# ("apiserver-failure","ha-config"), join on cluster + measurement.
+#
+# - apiserver-failure-killer.sh is reused for the kill phase. It correctly
+#   handles HA replicas now (pre-kill UID set capture + Ready filter against
+#   that set — see commit "phase 4b: fix apiserver-failure killer
+#   false-success with HA replicas").
+# - ha-config-scaler.sh handles the scale-up/scale-down + ENO-revert
+#   detection (timing JSON tags ha_replicas_honored true|false).
+#
+# Sequence:
+#   1. Annotate workload namespaces (CFP-39876 opt-in).
+#   2. HA SCALE-UP: every cluster scales clustermesh-apiserver to N replicas.
+#   3. Start measurements.
+#   4. Deploy PodMonitor + workload (200 pods + global services).
+#   5. Initial WaitForControlledPodsRunning gate.
+#   6. Warmup sleep.
+#   7. Method:Exec → apiserver-failure-killer.sh. On target cluster: kills
+#      ONE of N pods; survivors should continue serving (HA invariant).
+#      On non-target clusters: no-op.
+#   8. Observation sleep.
+#   9. Settle sleep.
+#  10. Gather measurements.
+#  11. HA SCALE-DOWN: every cluster scales back to 1 (cleanup).
+#  12. Teardown.

{{$namespaces := DefaultParam .CL2_NAMESPACES 5}}
{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}}
{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}}
{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}}
{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}}
{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "60s"}}
{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "60s"}}
{{$apiserverKillTargetContext := DefaultParam .CL2_APISERVER_KILL_TARGET_CONTEXT "clustermesh-1"}}
{{$apiserverKillRecoveryTimeoutSeconds := DefaultParam .CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS 240}}
{{$apiserverKillObservationSeconds := DefaultParam .CL2_APISERVER_KILL_OBSERVATION_SECONDS 60}}
{{$haConfigReplicas := DefaultParam .CL2_HA_CONFIG_REPLICAS 3}}

{{$workloadGroup := "clustermesh-ha-config"}}
{{$workloadBasename := "ha"}}

namespace:
  number: {{$namespaces}}
  prefix: clustermesh-ha
  deleteStaleNamespaces: true
  deleteAutomanagedNamespaces: true
  enableExistingNamespaces: false
  deleteNamespaceTimeout: 20m

tuningSets:
  - name: Sequence
    parallelismLimitedLoad:
      parallelismLimit: 1
  - name: DeploymentCreateQps
    qpsLoad:
      qps: {{$apiServerCallsPerSecond}}

steps:
  # ----- CFP-39876 opt-in: annotate workload namespaces -----
  - name: Annotate workload namespaces for ACNS cross-cluster sync
    measurements:
      - Identifier: AnnotateNamespacesForGlobalSync
        Method: Exec
        Params:
          streamOutput: true
          timeout: 1m
          command:
            - bash
            - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh
            - "{{$namespaces}}"
            - "clustermesh-ha"

  # ----- HA scale-up (BEFORE start measurements so steady-state HA values
  # are captured). Every cluster scales its own clustermesh-apiserver.
  - name: Scale clustermesh-apiserver to HA replicas
    measurements:
      - Identifier: HAConfigScaler-up
        Method: Exec
        Params:
          streamOutput: true
          # Generous timeout: 240s rollout + 30s revert-check + slack.
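+          # (240s + 30s = 270s ≈ 4.5m; 6m leaves ~90s of slack for kubectl
+          # retries and script startup.)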
+ timeout: 6m + command: + - bash + - /root/perf-tests/clusterloader2/config/ha-config-scaler.sh + - scale-up + - "{{$haConfigReplicas}}" + + # ----- Start measurements (with HA replicas already in place) ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking ha-config Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-ha-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial ha-config pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-ha-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before kill + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- KILL one of N apiserver replicas (target cluster only) ----- + - name: Kill apiserver on target cluster (1 of N replicas) + measurements: + - Identifier: ApiserverFailureKiller + Method: Exec + Params: + streamOutput: true + timeout: 5m + command: + - bash + - /root/perf-tests/clusterloader2/config/apiserver-failure-killer.sh + - "{{$apiserverKillTargetContext}}" + - "{{$apiserverKillRecoveryTimeoutSeconds}}" + + # ----- Observation: HA invariant should keep remote-clusters-connected + # at max (cluster_count-1) throughout; scenario #4 baseline dips during + # the kill window. 
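+  # Illustrative PromQL spot-check against a peer cluster's Prometheus
+  # (assumes a 5-cluster mesh, where the healthy gauge value is 4):
+  #   min_over_time(cilium_clustermesh_remote_clusters[10m]) == 4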
+ - name: Observe during failure + recovery (HA invariant test) + measurements: + - Identifier: ObservationSleep + Method: Sleep + Params: + duration: {{$apiserverKillObservationSeconds}}s + + - name: Settle for backlog drain + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements (HA still active) ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: gather + + # ----- HA scale-down (cleanup) ----- + - name: Scale clustermesh-apiserver back to 1 replica + measurements: + - Identifier: HAConfigScaler-down + Method: Exec + Params: + streamOutput: true + timeout: 3m + command: + - bash + - /root/perf-tests/clusterloader2/config/ha-config-scaler.sh + - scale-down + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/isolation-churn.sh b/modules/python/clusterloader2/clustermesh-scale/config/isolation-churn.sh new file mode 100755 index 0000000000..4dbf293386 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/isolation-churn.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Scenario #5 (Multi-Cluster Failure Isolation) — drives heavy pod-churn on +# ONLY the target cluster; peer clusters run a no-op observe path that +# sleeps for the same duration so their CL2 lifecycle (and Prometheus +# scrape window) covers the target's churn period. +# +# Why peer must sleep (not exit immediately): in share-infra mode, each +# scenario runs CL2 in parallel on every cluster. If peer exits the +# Method:Exec at t=0s, peer CL2 advances straight into settle + gather + +# teardown, finishing in ~3min — but target is still churning at t=10min. +# Peer Prometheus is torn down before target's churn finishes. To compare +# "did peers spike while target churned?" the peer Prometheus window must +# overlap target's churn window. Sleeping in this script keeps both +# lifecycles aligned. +# +# Positional args (all forwarded to pod-churn-killer.sh on target): +# $1 TARGET_CONTEXT kubectl context name of the cluster to churn. +# $2 KILL_DURATION_SECONDS Total kill-loop runtime on target (also peer sleep). +# $3 KILL_INTERVAL_SECONDS Seconds between kill rounds on target. +# $4 KILL_BATCH Pods deleted per round on target. +# $5 WORKLOAD_GROUP Label-selector group value for pod selection. +# +# Exit codes: +# 0 — always (target completes normally OR peer no-op observes for the +# configured duration). Soft-fail matches the rest of Phase 4b's +# scenario scripts so a single-cluster issue doesn't abort the run. 
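+#
+# Example invocation (illustrative; the values shown are isolation.yaml's
+# defaults):
+#   isolation-churn.sh clustermesh-1 600 10 5 clustermesh-isolation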
+
+set -uo pipefail
+
+TARGET_CONTEXT="${1:?target context required}"
+KILL_DURATION_SECONDS="${2:-600}"
+KILL_INTERVAL_SECONDS="${3:-10}"
+KILL_BATCH="${4:-5}"
+WORKLOAD_GROUP="${5:-clustermesh-isolation}"
+
+# kubectl resolution: PATH first, then pre-staged binary (same pattern as
+# apiserver-failure-killer.sh and pod-churn-killer.sh).
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    # Prepend the config dir to PATH so the delegated pod-churn-killer.sh
+    # resolves the same kubectl.
+    export PATH="/root/perf-tests/clusterloader2/config:${PATH}"
+    echo "isolation-churn: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "isolation-churn ERROR: kubectl not in PATH and pre-staged binary missing"
+    # Environment bug, not a scenario failure — hard-fail (127) like the
+    # sibling killer scripts rather than soft-failing per the contract above.
+    exit 127
+fi
+
+CURRENT_CONTEXT=$("${KUBECTL}" config current-context 2>/dev/null || echo "unknown")
+echo "isolation-churn: current=${CURRENT_CONTEXT} target=${TARGET_CONTEXT}"
+
+if [ "${CURRENT_CONTEXT}" != "${TARGET_CONTEXT}" ]; then
+    echo "isolation-churn: peer cluster — observing for ${KILL_DURATION_SECONDS}s while target churns"
+    sleep "${KILL_DURATION_SECONDS}"
+    echo "isolation-churn: peer observation window complete"
+    exit 0
+fi
+
+echo "isolation-churn: target cluster — delegating to pod-churn-killer.sh"
+exec bash /root/perf-tests/clusterloader2/config/pod-churn-killer.sh \
+    "${KILL_DURATION_SECONDS}" \
+    "${KILL_INTERVAL_SECONDS}" \
+    "${KILL_BATCH}" \
+    "${WORKLOAD_GROUP}"
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/isolation.yaml b/modules/python/clusterloader2/clustermesh-scale/config/isolation.yaml
new file mode 100644
index 0000000000..d7882415f1
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/isolation.yaml
@@ -0,0 +1,232 @@
+name: clustermesh-isolation
+
+# Scale scenario #5: Multi-Cluster Failure Isolation.
+#
+# Goal (scale testing.txt line 92-102): induce heavy churn in ONE cluster,
+# verify peer clusters remain stable (no cascade in CPU/memory/etc).
+#
+# Topology: every cluster runs the same 200-pod workload + global services.
+# The Method:Exec kill phase routes to the target cluster (default
+# clustermesh-1) only — peer clusters' isolation-churn.sh script sleeps
+# for the same kill duration so their Prometheus scrape window aligns
+# with target's churn window. Without that alignment, peer CL2 would
+# tear down Prometheus before target's churn finishes, destroying the
+# isolation signal.
+#
+# Cross-scenario A/B in Kusto: filter `test_type == "isolation"`, derive
+# `role = iff(cluster == "<target-context>", "target", "peer")`, then
+# compare resource measurements across role. Healthy isolation means
+# peers' CPU/memory/etc are at baseline values during the churn window;
+# cascading failure means peers' resources track target's spikes. (An
+# illustrative query sketch follows at the end of this header.)
+#
+# Sequence:
+#   1. Annotate workload namespaces (CFP-39876 opt-in).
+#   2. Start measurements.
+#   3. Deploy 200-pod workload + global services on every cluster.
+#   4. Initial WaitForControlledPodsRunning gate.
+#   5. Warmup sleep.
+#   6. Method:Exec → isolation-churn.sh. On target: runs pod-churn-killer.sh
+#      kill loop (delete KILL_BATCH random workload pods every
+#      KILL_INTERVAL_SECONDS for KILL_DURATION_SECONDS). On peers: sleeps
+#      for KILL_DURATION_SECONDS to keep CL2/Prom lifecycle aligned.
+#   7. Settle sleep — backlog drain on target, observe-window close on peers.
+#   8. Gather measurements (peers should be flat; target should show spike).
+#   9. Teardown.
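+#
+# Illustrative Kusto sketch of that role split (the table and column names
+# here are assumptions, not the real collect schema):
+#   ScaleResults
+#   | where test_type == "isolation"
+#   | extend role = iff(cluster == "clustermesh-1", "target", "peer")
+#   | summarize avg(todouble(value)) by role, metric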
+ +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "60s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "60s"}} + +# Reuse the same target-context knob as scenario #4 (apiserver-failure): +# both scenarios target the same cluster by convention. Override via the +# matrix var if a different target is needed. +{{$apiserverKillTargetContext := DefaultParam .CL2_APISERVER_KILL_TARGET_CONTEXT "clustermesh-1"}} + +# Reuse the pod-churn kill-loop knobs from scenario #2 (pod-churn-combined): +# semantically identical (kill workload pods at controlled rate). Avoids +# adding new matrix vars for the same parameter shape. +{{$killDurationSeconds := DefaultParam .CL2_KILL_DURATION_SECONDS 600}} +{{$killIntervalSeconds := DefaultParam .CL2_KILL_INTERVAL_SECONDS 10}} +{{$killBatch := DefaultParam .CL2_KILL_BATCH 5}} +# Method:Exec timeout — kill duration + 5min headroom (allows peer's sleep +# to complete + final pod-churn-killer cleanup line). +{{$killExecTimeout := DefaultParam .CL2_KILL_EXEC_TIMEOUT "15m"}} + +{{$workloadGroup := "clustermesh-isolation"}} +{{$workloadBasename := "iso"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-iso + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- CFP-39876 opt-in: annotate workload namespaces ----- + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-iso" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking isolation Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-iso-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for 
initial isolation pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-iso-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before isolation churn + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- ISOLATION CHURN (target-only kill loop; peers sleep-observe) ----- + - name: Drive heavy pod-churn on target cluster only + measurements: + - Identifier: IsolationChurnRunner + Method: Exec + Params: + streamOutput: true + timeout: {{$killExecTimeout}} + command: + - bash + - /root/perf-tests/clusterloader2/config/isolation-churn.sh + - "{{$apiserverKillTargetContext}}" + - "{{$killDurationSeconds}}" + - "{{$killIntervalSeconds}}" + - "{{$killBatch}}" + - "{{$workloadGroup}}" + + # ----- Settle: backlog drain on target, observe-window close on peers ----- + - name: Settle after isolation churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements (peer flat-vs-target spike comparison) ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml index 0e0a3e36bd..b192bd3709 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml @@ -22,6 +22,13 @@ name: clustermesh-event-throughput-workload {{$replicasPerDeployment := .replicasPerDeployment}} {{$tuningSet := .tuningSet}} {{$operationTimeout := .operationTimeout}} +# Optional suffix for measurement Identifiers. Scenario #6 (upper-bound) +# calls this module N times per CL2 run (one per saturation rung) with +# phaseSuffix=Rung0/Rung1/.../RungN-1 so the WaitForControlledPodsRunning +# Identifiers don't collide across rungs. Default "" keeps existing +# single-invocation callers (event-throughput.yaml) byte-for-byte +# identical. +{{$phaseSuffix := DefaultParam .phaseSuffix ""}} # delete = bring object count to 0; create/restart keep configured count. {{$replicasInPhase := $deploymentsPerNamespace}} @@ -34,9 +41,9 @@ steps: # Identifier keeps the create/restart/delete invocations from clobbering # each other's metric state across the three module calls in # event-throughput.yaml. 
- - name: Start tracking event-throughput pods to be {{$actionName}}d + - name: Start tracking event-throughput pods to be {{$actionName}}d{{if $phaseSuffix}} ({{$phaseSuffix}}){{end}} measurements: - - Identifier: WaitForControlledPodsRunning-{{$actionName}} + - Identifier: WaitForControlledPodsRunning-{{$actionName}}{{$phaseSuffix}} Method: WaitForControlledPodsRunning Params: action: start @@ -65,9 +72,9 @@ steps: templateFillMap: Group: clustermesh-event-throughput - - name: Wait for event-throughput pods to be {{$actionName}}d + - name: Wait for event-throughput pods to be {{$actionName}}d{{if $phaseSuffix}} ({{$phaseSuffix}}){{end}} measurements: - - Identifier: WaitForControlledPodsRunning-{{$actionName}} + - Identifier: WaitForControlledPodsRunning-{{$actionName}}{{$phaseSuffix}} Method: WaitForControlledPodsRunning Params: action: gather diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/apiserver-failure.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/apiserver-failure.yaml new file mode 100644 index 0000000000..9bc2234291 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/apiserver-failure.yaml @@ -0,0 +1,97 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Scale scenario #4 (ClusterMesh APIServer Failure) — measurements scoped +# to the failure window. Captures peer-cluster behavior (drop in +# remote_clusters gauge, spike in failure-counter rate, kvstore catch-up +# latency) over the run window. The actual t0/t1 timestamps come from +# apiserver-failure-killer.sh's timing JSON file (collected separately). +# +# PromQL note on time-of-event signals: vanilla Prometheus doesn't expose +# "time at which X first happened" cleanly. Detection time and recovery +# time are computed post-hoc in Kusto by joining these gauge series with +# the killer's t0/t1 timestamps. This module captures the windowed +# aggregates that surface "something disruptive happened" — the explicit +# timing comes from the timing JSON row. + +steps: + - name: {{$action}} ApiServer Failure Measurements + measurements: + # ----------------------------------------------------------------- + # Detection signal: how low did the remote_clusters gauge dip during + # the failure window? Healthy = N-1 (every cluster sees its N-1 peers). + # Target's apiserver dies → peer clusters' gauge drops by 1 briefly → + # gauge recovers when apiserver is back + reconnects. + # ----------------------------------------------------------------- + - Identifier: RemoteClustersConnectedMinDuringFailure{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Clusters Connected Min During Failure {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Min + query: min(min_over_time(cilium_clustermesh_remote_clusters[%v:])) + - name: Perc50 + query: quantile(0.50, min_over_time(cilium_clustermesh_remote_clusters[%v:])) + + # ----------------------------------------------------------------- + # Failure-counter rate burst: cilium_clustermesh_remote_cluster_failures + # is a monotonic counter. During the failure window, the rate spikes + # as peers retry connections to the dead apiserver. Max-over-time of + # the 1m-sliding rate is the "peak failure rate" signal. 
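+      # (Worked example: 30 connection failures landing within a single
+      # minute of the window surface as Max ≈ 30/60 = 0.5 failures/s.)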
+ # ----------------------------------------------------------------- + - Identifier: RemoteClusterFailureRateBurst{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Cluster Failure Rate Burst {{$suffix}} + metricVersion: v1 + unit: failures/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) + + # ----------------------------------------------------------------- + # Kvstore sync error burst: spikes when peers can't reach the dead + # apiserver. Catch-up rate post-recovery indicates backlog drain + # behavior. + # ----------------------------------------------------------------- + - Identifier: KvstoreSyncErrorBurst{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Sync Error Burst {{$suffix}} + metricVersion: v1 + unit: errors/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_sync_errors_total[1m])[%v:])) + - name: Sum + query: sum(max_over_time(rate(cilium_kvstoremesh_kvstore_sync_errors_total[1m])[%v:])) + + # ----------------------------------------------------------------- + # Kvstore operation latency p99 during recovery: peers re-sync state + # after apiserver comes back; the histogram's p99 spike size is the + # "catch-up cost" signal. + # ----------------------------------------------------------------- + - Identifier: KvstoreOperationLatencyP99DuringRecovery{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Op Latency P99 During Recovery {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: false + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml index 18d0a2a85c..7f5c9c6cf3 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml @@ -38,6 +38,17 @@ steps: # Mesh failure counter: cumulative remote-cluster connection failures. # Healthy runs should keep this at 0; we track the max increase observed # over the run to surface flapping links during scale-up. + # + # Observed N=20 baseline (run 66826-8f280609): MaxIncrease = 4–6 on + # EVERY cluster — even green runs. Hypothesis is Fleet pushing peer + # config updates mid-run briefly bounces connections. To distinguish + # "5 failures spread across 5 peers" from "5 failures all against ONE + # bad peer", PerPeerMaxIncrease below preserves the target_cluster + # label and reports the max-failure peer per focal cluster. If the two + # numbers match, failures are concentrated on a single peer (real + # peering issue); if PerPeerMaxIncrease ≈ 1 with MaxIncrease ≈ 5, + # failures are uniformly distributed (Fleet churn, not peering bug). + # See todo remote-cluster-failures-investigation. 
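+      # Illustrative ad-hoc PromQL to name the worst peer directly (relies
+      # on the same target_cluster label described above):
+      #   topk(1, max_over_time(cilium_clustermesh_remote_cluster_failures[30m])
+      #         - min_over_time(cilium_clustermesh_remote_cluster_failures[30m]))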
# --------------------------------------------------------------------- - Identifier: ClusterMeshRemoteClusterFailures{{$suffix}} Method: GenericPrometheusQuery @@ -50,6 +61,16 @@ steps: queries: - name: MaxIncrease query: max(max_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) - min(min_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) + # Max failures observed against any single peer cluster. Reported + # per scrape series (preserving target_cluster label inside the + # subquery), then we take the worst peer with quantile(0.99,...). + - name: PerPeerMaxIncrease + query: quantile(0.99, max_over_time(cilium_clustermesh_remote_cluster_failures[%v:]) - min_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) + # Median peer's failure count — if this is also ≈ MaxIncrease, every + # peer is failing roughly equally; if it's near 0, failures are + # heavily concentrated on a few outlier peers. + - name: PerPeerMedianIncrease + query: quantile(0.50, max_over_time(cilium_clustermesh_remote_cluster_failures[%v:]) - min_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) # --------------------------------------------------------------------- # Cross-cluster event throughput — the headline metric for scale scenario @@ -65,67 +86,120 @@ steps: unit: events/s enableViolations: false queries: + # Subquery step explicitly set to 30s (matches Prometheus scrape + # interval) so brief workload-create bursts aren't smoothed away by + # the default 1m subquery step. - name: Perc99 - query: quantile(0.99, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + query: quantile(0.99, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:30s])) - name: Perc90 - query: quantile(0.90, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + query: quantile(0.90, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:30s])) - name: Perc50 - query: quantile(0.50, avg_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + query: quantile(0.50, avg_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:30s])) + # Cumulative event count over the run window. Range vector `[%v]` + # (NOT subquery `[%v:]`) — `increase()` with a subquery uses the + # subquery step to sample the counter, which at default 1m step + # misses brief bursts (events all fall between samples → first and + # last subquery samples both show post-burst peak count → delta=0). + # Range vector reads at Prometheus's actual scrape resolution. + - name: TotalIncrease + query: sum(increase(cilium_kvstoremesh_kvstore_events_queue_seconds_count[%v])) # --------------------------------------------------------------------- - # Per-type event rate breakdown (spec line 131: "Event rate (per - # type)"). The kvstoremesh kvstore-events histogram carries a - # `scope` label tagging which kvstore key family the event touched. - # We split into the three families spec line 5 calls out: endpoints, - # services, identities. 
Cilium 1.18 uses these scope values: - # identities/v1 — security identities - # services/v1 — global Service objects - # ip/v1 — endpoint IP-to-identity mappings (endpoints) - # nodes/v1 — node tunnel / IPAM advertisements - # serviceexports/v1 — MCS-API ServiceExport objects - # lease — leader election - # cilium/.heartbeat — kvstore liveness heartbeat - # cilium/syncedcanaries — initial-sync barrier markers - # --------------------------------------------------------------------- - - Identifier: ClusterMeshKvstoreEventsRateIdentities{{$suffix}} + # Per-type cross-cluster events (spec line 5: "How many cross-cluster + # events (endpoints, services, identities) can be processed per cluster + # and per mesh"). Reports the cumulative count of kvstore events + # observed by THIS cluster's kvstoremesh during the test, broken down + # by scope label. + # + # Ground-truth scope values (verified via runtime probe on AKS-managed + # Cilium): + # ip/v1 — endpoint (pod IP-to-identity) propagation events + # services/v1 — global Service objects (incl. their backends) + # identities/v1 — security identity additions/removals + # nodes/v1 — node tunnel / IPAM advertisements + # serviceexports/v1 — MCS-API ServiceExport (rare in our workload) + # cilium/.hear*, cilium/synce*, cilium/.init*, lease — meta scopes + # (heartbeat / synced canaries / init lock / leader election) + # + # Why instant `sum()` instead of `increase()` or `rate()`: + # `cilium_kvstoremesh_kvstore_events_queue_seconds_count` is a + # counter labelled by scope. In Prometheus convention a labelled + # counter only EXISTS as a series once the labelled event has + # occurred at least once. The per-scope events of interest + # (services/v1, identities/v1, ip/v1, nodes/v1) only fire during + # the workload-create burst at test start. Before the burst: + # no series, no scrapes, no baseline. After the burst: counter + # appears at the post-burst plateau value (e.g. 80) and stays + # flat for the rest of the test. `increase(metric[%v])` over a + # series whose first sample IS the plateau cannot compute a delta + # to a non-existent pre-burst sample, so it returns 0. + # + # We tried two workarounds (commit history) before settling on + # instant `sum()`: + # - Tightening the subquery step from default 1m to 30s: didn't + # help — still no pre-burst sample. + # - Adding a 90s pre-workload settle step (commit 380d34c): didn't + # help — Prometheus had time to discover the PodMonitor target, + # but the per-scope SERIES still didn't exist until the burst. + # + # Since each test run uses freshly-provisioned clusters (counter + # starts at 0), CurrentValue at gather time IS the cumulative count + # of events observed during this test. That directly answers spec + # line 5's "How many events" wording. + # + # The aggregate `ClusterMeshKvstoreEventsRate` query above DOES + # work because the heartbeat scope (`cilium/.hear*`) increments + # every ~5s from cluster bring-up onward — so Prometheus has many + # pre-burst samples for the aggregate vector to compute rate over. + # + # For per-scope rate signal (events/sec), Cilium would need to + # pre-emit zero-valued counters for known scopes at startup, which + # it doesn't do today (would require an upstream PR). 
+ # --------------------------------------------------------------------- + - Identifier: ClusterMeshKvstoreEventsTotalIdentities{{$suffix}} Method: GenericPrometheusQuery Params: action: {{$action}} - metricName: ClusterMesh Kvstore Events Rate Identities {{$suffix}} + metricName: ClusterMesh Kvstore Events Total Identities {{$suffix}} metricVersion: v1 - unit: events/s + unit: events enableViolations: false queries: - - name: Perc99 - query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:])) - - name: Perc50 - query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:])) - - Identifier: ClusterMeshKvstoreEventsRateServices{{$suffix}} + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}) + - Identifier: ClusterMeshKvstoreEventsTotalServices{{$suffix}} Method: GenericPrometheusQuery Params: action: {{$action}} - metricName: ClusterMesh Kvstore Events Rate Services {{$suffix}} + metricName: ClusterMesh Kvstore Events Total Services {{$suffix}} metricVersion: v1 - unit: events/s + unit: events enableViolations: false queries: - - name: Perc99 - query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:])) - - name: Perc50 - query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:])) - - Identifier: ClusterMeshKvstoreEventsRateEndpoints{{$suffix}} + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}) + - Identifier: ClusterMeshKvstoreEventsTotalEndpoints{{$suffix}} Method: GenericPrometheusQuery Params: action: {{$action}} - metricName: ClusterMesh Kvstore Events Rate Endpoints {{$suffix}} + metricName: ClusterMesh Kvstore Events Total Endpoints {{$suffix}} metricVersion: v1 - unit: events/s + unit: events enableViolations: false queries: - - name: Perc99 - query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:])) - - name: Perc50 - query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:])) + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}) + - Identifier: ClusterMeshKvstoreEventsTotalNodes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Events Total Nodes {{$suffix}} + metricVersion: v1 + unit: events + enableViolations: false + queries: + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="nodes/v1"}) # --------------------------------------------------------------------- # Cross-cluster propagation latency proxy: p99 of kvstore operation @@ -190,3 +264,61 @@ steps: query: quantile(0.99, max_over_time(cilium_identity[%v:])) - name: Perc50 query: quantile(0.50, avg_over_time(cilium_identity[%v:])) + + # --------------------------------------------------------------------- + # Scenario #7 (HA Configuration) — clustermesh-apiserver pod resource + # overhead. With replicas=1 (baseline scenarios #1-#6) the Total metrics + # equal the single-pod values; with replicas=N (scenario #7 / ha-config) + # they reflect the cumulative cost of N replicas. 
Direct A/B in Kusto: + # compare `test_type in ("apiserver-failure","ha-config")` rows. + # + # Scoped to label `pod=~"clustermesh-apiserver-.*"` which matches every + # pod under the Deployment (ReplicaSet hash + suffix). Source is cAdvisor + # (kubelet metrics), which the CL2 prometheus stack scrapes by default. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshApiserverPodCPU{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh APIServer Pod CPU {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: false + queries: + - name: TotalMax + query: max_over_time(sum(rate(container_cpu_usage_seconds_total{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}[1m]))[%v:]) + - name: TotalAvg + query: avg_over_time(sum(rate(container_cpu_usage_seconds_total{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}[1m]))[%v:]) + - name: PerPodMax + query: max_over_time(max(sum by (pod) (rate(container_cpu_usage_seconds_total{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}[1m])))[%v:]) + + - Identifier: ClusterMeshApiserverPodMemory{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh APIServer Pod Memory {{$suffix}} + metricVersion: v1 + unit: bytes + enableViolations: false + queries: + - name: TotalMax + query: max_over_time(sum(container_memory_working_set_bytes{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"})[%v:]) + - name: TotalAvg + query: avg_over_time(sum(container_memory_working_set_bytes{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"})[%v:]) + - name: PerPodMax + query: max_over_time(max(sum by (pod) (container_memory_working_set_bytes{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}))[%v:]) + + - Identifier: ClusterMeshApiserverPodRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh APIServer Pod Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Total + query: max_over_time(sum(kube_pod_container_status_restarts_total{pod=~"clustermesh-apiserver-.*"})[%v:]) + - name: PerPodMax + query: max_over_time(max(sum by (pod) (kube_pod_container_status_restarts_total{pod=~"clustermesh-apiserver-.*"}))[%v:]) + diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml index 47504cbf89..d74b9992d6 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml @@ -54,6 +54,51 @@ steps: query: quantile(0.90, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) - name: Perc50 query: quantile(0.50, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + # --------------------------------------------------------------------- + # Per-pod normalized apiserver CPU. The two ApiserverAvg/MaxCPUUsage + # measurements above use the team-wide shared PromQL pattern (copied + # across large_cluster / network-scale / slo / network-load) which + # implicitly aggregates across whatever series match + # `endpoint="apiserver"` — so the resulting "cores" value is actually + # a Prometheus rate aggregate, not literal cores per pod. 
+ # + # This duplicate measurement adds explicit `sum by(pod)` grouping so + # we get a per-pod value (i.e. genuine cores) AND `quantile(0.99)` + # then picks the most-loaded pod. If the underlying scrape doesn't + # carry a `pod` label, sum-by collapses to one series and the + # measurement still yields a usable cross-cluster number. + # + # Kept SEPARATE from the shared-pattern measurements so dashboards + # comparing across scenarios still see the same column names from + # the originals; we just gain an honest per-pod column on top. + # See todo apiserver-cpu-promql-fix. + # --------------------------------------------------------------------- + - Identifier: ApiserverAvgCPUPerPod{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Avg CPU Per Pod {{$suffix}} + metricVersion: v1 + unit: cores + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, sum by(pod) (avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) + - name: Perc50 + query: quantile(0.50, sum by(pod) (avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) + - Identifier: ApiserverMaxCPUPerPod{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max CPU Per Pod {{$suffix}} + metricVersion: v1 + unit: cores + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, sum by(pod) (max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) + - name: Perc50 + query: quantile(0.50, sum by(pod) (max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) - Identifier: ApiserverAvgMemUsage{{$suffix}} Method: GenericPrometheusQuery Params: diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/node-churn.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/node-churn.yaml new file mode 100644 index 0000000000..369982624c --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/node-churn.yaml @@ -0,0 +1,185 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Scale scenario #3 (Node Churn / IP Churn) — measurements layered on top +# of clustermesh-metrics.yaml + cilium.yaml. These queries surface the +# spec-required signals (scale testing.txt:78-79): +# +# * IP update propagation — kvstore event rates broken out by scope so +# node/IP scope events are visible separately from identity/service +# scope. Under node-churn, node-scope events should burst when nodes +# drain/replace; identity-scope events should stay flat (identity is +# label-keyed, not IP-keyed). +# * Temporary inconsistency windows — node Ready transitions, pod +# eviction rate, remote-cluster endpoint cardinality on peers +# (whether peers observe the target's IP churn fully). +# +# Rubber-duck design review #5 + #6: cilium_identity_count is a weak +# signal under node-churn (identities don't churn when only IPs change). +# Dropped in favor of kvstore-scope rates + remote endpoint cardinality. + +steps: + - name: {{$action}} Node Churn Measurements + measurements: + + # ----------------------------------------------------------------- + # NODE READY TRANSITIONS. changes() over a counter-like series of + # node-condition states counts the number of Ready/NotReady flips + # during the window. Healthy scale-cycle: 2N transitions per cycle + # (N nodes drain + N nodes ready). Replace: ≥ K (drained + new). 
+ # Spec line 79 "Temporary inconsistency windows": this is the + # local-cluster view of how long nodes stayed un-Ready. + # ----------------------------------------------------------------- + - Identifier: NodeReadyTransitions{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Ready Transitions {{$suffix}} + metricVersion: v1 + unit: count + enableViolations: false + queries: + - name: ReadyTransitionsTotal + query: sum(changes(kube_node_status_condition{condition="Ready",status="true"}[%v:])) + - name: NotReadyTransitionsTotal + query: sum(changes(kube_node_status_condition{condition="Ready",status="false"}[%v:])) + + # ----------------------------------------------------------------- + # NODE CARDINALITY OVER TIME — gauge for node-info series counts the + # nodes visible to kube-state-metrics. min/max over the window flag + # the scaling delta (e.g., max=25 vs min=20 → +5 scale-up observed). + # NodeCount must trend back to OriginalCount by gather time (the + # finalizer guarantees it on target; peers see only their own static + # pool unaffected by target's churn). + # ----------------------------------------------------------------- + - Identifier: NodeCardinality{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Cardinality {{$suffix}} + metricVersion: v1 + unit: count + enableViolations: false + queries: + - name: Min + query: min_over_time(count(kube_node_info)[%v:]) + - name: Max + query: max_over_time(count(kube_node_info)[%v:]) + - name: Last + query: count(kube_node_info) + + # ----------------------------------------------------------------- + # POD EVICTION / RESCHEDULE RATE. Pods on a drained or deleted node + # get NodeLost (kubelet evicts) or Evicted (kube-controller forcibly + # rescheduled). Rate over the window: target should spike during + # ops; peers stay near 0 (no node churn there). + # ----------------------------------------------------------------- + - Identifier: PodEvictionRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Pod Eviction Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: NodeLostMax + query: max(max_over_time(rate(kube_pod_status_reason{reason="NodeLost"}[1m])[%v:])) + - name: EvictedMax + query: max(max_over_time(rate(kube_pod_status_reason{reason="Evicted"}[1m])[%v:])) + + # ----------------------------------------------------------------- + # KVSTORE EVENT RATES BY SCOPE — the headline propagation signal. + # cilium_kvstoremesh_kvstore_events_queue_seconds_count carries a + # `scope` label (verified runtime-probed in Phase 2: nodes/v1, ip/v1, + # identities/v1, endpoints/v1, services/v1). + # + # Under node-churn the EXPECTED splits are: + # nodes/v1 → burst (each scale/replace op churns N node entries) + # ip/v1 → burst (each new VM gets a new IP entry) + # identities/v1→ near-zero (workload pods keep same labels) + # endpoints/v1 → burst (pods reschedule with new pod IPs) + # services/v1 → near-zero (service definitions stable) + # + # Cross-scenario Kusto query: filter by scope, compare target vs peer + # rate. Peer rates indicate "did target's node churn propagate to + # peers' kvstore" — the spec "IP update propagation" signal. 
+ # ----------------------------------------------------------------- + - Identifier: KvstoreNodeScopeEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Kvstore Node Scope Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="nodes/v1"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="nodes/v1"}[1m])[%v:])) + + - Identifier: KvstoreIpScopeEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Kvstore IP Scope Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m])[%v:])) + + - Identifier: KvstoreEndpointsScopeEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Kvstore Endpoints Scope Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="endpoints/v1"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="endpoints/v1"}[1m])[%v:])) + + # ----------------------------------------------------------------- + # REMOTE-CLUSTER ENDPOINT CARDINALITY. cilium_clustermesh_remote_cluster_* + # tracks per-peer state from THIS cluster's perspective. On peers + # during target's node-churn: + # - remote_cluster_nodes_total → fluctuates (target's node count + # changes) → min/max delta proves propagation reached peer + # - remote_cluster_endpoints_total → fluctuates (pod rescheduling + # during target's node churn) + # + # Spec "IP update propagation" — if the peer-side delta is zero + # while target's local kvstore events show burst, propagation is + # broken or stale. + # ----------------------------------------------------------------- + - Identifier: RemoteClusterNodesCardinality{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Remote Cluster Nodes Cardinality {{$suffix}} + metricVersion: v1 + unit: count + enableViolations: false + queries: + - name: Min + query: min(min_over_time(cilium_clustermesh_remote_cluster_nodes[%v:])) + - name: Max + query: max(max_over_time(cilium_clustermesh_remote_cluster_nodes[%v:])) + - name: Last + query: max(cilium_clustermesh_remote_cluster_nodes) + + # ----------------------------------------------------------------- + # NewNodesAppearedInWindow REMOVED 2026-05-14: build 67114 showed + # CL2's %v substitution produces a duration literal ("2309s") which + # PromQL rejects in scalar `<` comparison. The signal is redundant + # with NodeCardinality (Max - Min) above + the authoritative pre/post + # InternalIP set delta in NodeChurnTimings_*.json. 
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/pod-churn-stress.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/pod-churn-stress.yaml new file mode 100644 index 0000000000..8159fd6681 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/pod-churn-stress.yaml @@ -0,0 +1,122 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Pod-Churn Stress Test (scale scenario #2) — slope-over-time / sustained-rate +# measurements layered on top of clustermesh-metrics.yaml. These queries +# surface the "growth over time" signals that point-in-time percentiles +# can hide: +# +# * Memory drift: positive nonzero value over a 10-minute churn window +# suggests a leak or unbounded queue. Compared head-to-head with a +# no-churn baseline run. +# * Sustained event-queue rate: max-over-time of a 1m-sliding rate. If +# this stays elevated while drift is positive, kvstore is falling +# behind the churn. +# * Remote-cluster failure rate: how fast does this monotonic counter +# accumulate under sustained churn? rate() is the counter-safe +# primitive (deriv() mishandles counter resets per the Prometheus +# docs; the rubber-duck design review caught this). + +steps: + - name: {{$action}} Pod Churn Stress Measurements + measurements: + # ----------------------------------------------------------------- + # Cilium-agent memory drift — leak detection. Two flavors: + # MaxPodDeriv: worst single agent series. Flags an outlier node. + # SumDeriv: total per-cluster memory growth across all agents. + # This is the "per-cluster footprint" number — what + # the scaling-curve dashboard uses. + # deriv() returns bytes/sec; we present as MB/s for readability. + # cilium_process_resident_memory_bytes is a gauge, so deriv() is + # well-defined (handles negative slopes correctly). + # ----------------------------------------------------------------- + - Identifier: CiliumAgentMemoryDrift{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Agent Memory Drift {{$suffix}} + metricVersion: v1 + unit: MB/s + enableViolations: false + queries: + - name: MaxPodDeriv + query: max(deriv(cilium_process_resident_memory_bytes[%v:])) / 1024 / 1024 + - name: SumDeriv + query: sum(deriv(cilium_process_resident_memory_bytes[%v:])) / 1024 / 1024 + - name: Perc50PodDeriv + query: quantile(0.50, deriv(cilium_process_resident_memory_bytes[%v:])) / 1024 / 1024 + + # ----------------------------------------------------------------- + # clustermesh-apiserver memory drift — same idea, different process. + # Uses cAdvisor's container_memory_working_set_bytes (no cilium-side + # gauge for the apiserver pod exists). Filters per the design review: + # namespace=kube-system pins to the AKS-managed Cilium deployment + # (avoid duplicate scrapes from a future + # customer-installed Cilium in another ns). + # container!="" drops cAdvisor's per-pod aggregate row + # (empty container label). + # container!="POD" drops the pause container's own series. 
+ # ----------------------------------------------------------------- + - Identifier: ClustermeshApiserverMemoryDrift{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Apiserver Memory Drift {{$suffix}} + metricVersion: v1 + unit: MB/s + enableViolations: false + queries: + - name: MaxContainerDeriv + query: max(deriv(container_memory_working_set_bytes{namespace="kube-system",pod=~"clustermesh-apiserver-.*",container!="",container!="POD"}[%v:])) / 1024 / 1024 + - name: SumDeriv + query: sum(deriv(container_memory_working_set_bytes{namespace="kube-system",pod=~"clustermesh-apiserver-.*",container!="",container!="POD"}[%v:])) / 1024 / 1024 + + # ----------------------------------------------------------------- + # Sustained kvstore event-queue rate. The headline saturation signal + # for sustained churn — if this stays high across the run while + # MemoryDrift is positive, the system is queueing faster than it's + # draining. + # + # cilium_kvstoremesh_kvstore_events_queue_seconds_count is a counter + # (cumulative count of queued events) — must use rate(), not deriv(). + # max_over_time of a 1m-sliding rate gives "worst sustained burst" — + # spike-tolerant unlike a point sample. + # ----------------------------------------------------------------- + - Identifier: SustainedKvstoreEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Sustained Kvstore Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + + # ----------------------------------------------------------------- + # Remote-cluster failure rate. cilium_clustermesh_remote_cluster_failures + # is a monotonic counter — accumulated reconnect failures from this + # cluster's perspective. Under sustained churn the spec line 65 + # "missed or delayed updates" signal is whether this rate climbs + # above the baseline of ~4-6/run observed on green N=20 runs (see + # plan.md "Decisions deliberately deferred" item 6). + # + # rate() handles counter resets correctly; deriv() does not. 
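The reset-handling difference is worth making concrete. deriv() fits a least-squares slope rather than summing deltas, but a naive delta sum captures why a counter reset poisons any slope-style estimator while rate()-style reset detection does not. A self-contained bash illustration on synthetic samples (30s apart, counter resets at the fifth sample):

    #!/bin/bash
    SAMPLES=(0 2 4 6 0 2 4)
    PREV=""; RAW_DELTA=0; RESET_AWARE_DELTA=0
    for V in "${SAMPLES[@]}"; do
      if [ -n "$PREV" ]; then
        D=$((V - PREV))
        RAW_DELTA=$((RAW_DELTA + D))   # what a slope-style estimator sees
        # rate()-style handling: a drop means the counter restarted from 0,
        # so the increase across the gap is the new value itself.
        [ "$D" -lt 0 ] && D=$V
        RESET_AWARE_DELTA=$((RESET_AWARE_DELTA + D))
      fi
      PREV=$V
    done
    echo "naive delta: ${RAW_DELTA} (the reset swallowed 6 failures)"
    echo "reset-aware delta: ${RESET_AWARE_DELTA} (matches the 10 actual failures)"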
+ # ----------------------------------------------------------------- + - Identifier: RemoteClusterFailureRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Cluster Failure Rate {{$suffix}} + metricVersion: v1 + unit: failures/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrole.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrole.yaml new file mode 100644 index 0000000000..df3c40e1a4 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrole.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +rules: + # Minimum verbs needed by the killer script: list to enumerate workload pods + # across namespaces, delete to terminate them, get is required by some + # kubectl operations for richer error reporting. + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "delete"] diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrolebinding.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrolebinding.yaml new file mode 100644 index 0000000000..7f36cc58b7 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrolebinding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{.RoleName}} +subjects: + - kind: ServiceAccount + name: {{.SAName}} + namespace: {{.SANamespace}} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-job.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-job.yaml new file mode 100644 index 0000000000..4984f6f72d --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-job.yaml @@ -0,0 +1,107 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + # Never restart on failure — if the killer crashes we want loud junit + # failure, not silent retry that backoffs past the measurement window. + # backoffLimit:0 plus restartPolicy:Never together ensure exactly one + # attempt. + backoffLimit: 0 + # Job has its own deadline as a defense-in-depth bound: even if the + # in-script `while` loop never terminates for some reason, the Job + # controller kills the pod at killDuration + 60s buffer. + activeDeadlineSeconds: {{.ActiveDeadlineSeconds}} + template: + metadata: + labels: + group: {{.Group}} + app: {{.Name}} + spec: + serviceAccountName: {{.SAName}} + restartPolicy: Never + # Short grace period: the killer's signal handler exits immediately; + # nothing in the script needs to flush state. + terminationGracePeriodSeconds: 5 + containers: + - name: killer + image: {{.Image}} + # bitnami/kubectl ships kubectl + bash + coreutils (shuf, xargs, + # cut, date) which the kill loop depends on. 
Verified by inspection
+          # of telescope-upstream/modules/kustomize/fio/.../ds.yaml usage.
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              set -o pipefail
+              # Graceful shutdown: SIGTERM from the Job controller (delete or
+              # activeDeadlineSeconds) lands here. We exit 0 so the Job is
+              # marked Succeeded — the rubber-duck critique called out that
+              # an in-flight 143 exit would mark the Job Failed and trigger
+              # junit error.
+              trap 'echo "killer: received SIGTERM, exiting"; exit 0' TERM INT
+
+              KILL_DURATION_SECONDS="${KILL_DURATION_SECONDS:-600}"
+              KILL_INTERVAL_SECONDS="${KILL_INTERVAL_SECONDS:-10}"
+              KILL_BATCH="${KILL_BATCH:-5}"
+              LABEL_SELECTOR="${LABEL_SELECTOR:-group=clustermesh-pod-churn-kill}"
+
+              echo "killer: starting (duration=${KILL_DURATION_SECONDS}s interval=${KILL_INTERVAL_SECONDS}s batch=${KILL_BATCH} selector=${LABEL_SELECTOR})"
+
+              END_EPOCH=$(( $(date +%s) + KILL_DURATION_SECONDS ))
+              ROUND=0
+              KILLED_TOTAL=0
+              while [ "$(date +%s)" -lt "$END_EPOCH" ]; do
+                ROUND=$((ROUND + 1))
+                # List candidate pods cluster-wide matching the label
+                # selector; the jsonpath template emits one
+                # "namespace/name" pair per line.
+                # Random selection: shuf | head -n. head simply returns
+                # fewer than $KILL_BATCH when the candidate pool is small
+                # (mid-cycle, when the ReplicaSet has not yet replaced
+                # previous kills).
+                mapfile -t TARGETS < <(
+                  kubectl get pods -A -l "$LABEL_SELECTOR" \
+                    -o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' \
+                    | shuf | head -n "$KILL_BATCH"
+                )
+
+                if [ "${#TARGETS[@]}" -eq 0 ]; then
+                  echo "killer: round=${ROUND} no candidates matched selector ${LABEL_SELECTOR}"
+                else
+                  for nsname in "${TARGETS[@]}"; do
+                    ns="${nsname%%/*}"
+                    name="${nsname##*/}"
+                    # --grace-period=0 + --force: immediate delete, no graceful
+                    # shutdown wait. Realistic "node failure"-style event for
+                    # the pod-event propagation path.
+ if kubectl delete pod -n "$ns" "$name" \ + --grace-period=0 --force --ignore-not-found \ + > /dev/null 2>&1; then + KILLED_TOTAL=$((KILLED_TOTAL + 1)) + fi + done + echo "killer: round=${ROUND} killed=${#TARGETS[@]} cumulative=${KILLED_TOTAL}" + fi + + sleep "$KILL_INTERVAL_SECONDS" + done + + echo "killer: done duration=${KILL_DURATION_SECONDS}s rounds=${ROUND} cumulative=${KILLED_TOTAL}" + exit 0 + env: + - name: KILL_DURATION_SECONDS + value: "{{.KillDurationSeconds}}" + - name: KILL_INTERVAL_SECONDS + value: "{{.KillIntervalSeconds}}" + - name: KILL_BATCH + value: "{{.KillBatch}}" + - name: LABEL_SELECTOR + value: "{{.WorkloadLabelSelector}}" + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 500m + memory: 256Mi diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-sa.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-sa.yaml new file mode 100644 index 0000000000..d56aed2810 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-sa.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{.Name}} + labels: + group: {{.Group}} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-workload.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-workload.yaml new file mode 100644 index 0000000000..a9229e51f2 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-workload.yaml @@ -0,0 +1,52 @@ +name: clustermesh-pod-churn-workload + +# Workload module shared by both pod-churn scenarios (#2 from scale testing.txt): +# - pod-churn-scale.yaml: deterministic scale-cycle (replicas N → 0 → N → ...). +# - pod-churn-kill.yaml: in-cluster random pod deletion via a killer Job. +# +# Per the rubber-duck critique on the Phase 4a design: we KEEP the Deployment +# and Service object count constant (replicasPerNamespace = deploymentsPerNamespace +# every invocation) and ONLY vary `.spec.replicas` on the underlying Deployment +# via templateFillMap.Replicas. Setting replicasPerNamespace=0 here would DELETE +# the Deployment+Service pair, which churns service-propagation events in +# addition to pod events and changes the scenario semantics. The teardown +# scenario explicitly opts into deletion via actionName=delete (which is what +# CL2's `phases` with replicasPerNamespace=0 in the caller produces). + +{{$actionName := .actionName}} # apply | delete +{{$replicas := DefaultParam .replicas 0}} +{{$namespaces := .namespaces}} +{{$deploymentsPerNamespace := .deploymentsPerNamespace}} +{{$tuningSet := .tuningSet}} +{{$group := DefaultParam .group "clustermesh-pod-churn"}} +{{$basename := DefaultParam .basename "pc"}} + +# delete = drop objects entirely (teardown only). +# apply = keep object count constant, set Deployment .spec.replicas to $replicas. +{{$objectsPerNamespace := $deploymentsPerNamespace}} +{{if eq $actionName "delete"}}{{$objectsPerNamespace = 0}}{{end}} + +steps: + - name: {{$actionName}} pod-churn workload (replicas={{$replicas}}) + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$objectsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: {{$basename}} + objectTemplatePath: /modules/event-throughput-deployment.yaml + templateFillMap: + # Pod count per Deployment is what cycles between $replicasPerDeployment + # and 0 during the scale-cycle scenario. 
The Deployment object itself + # is reapplied (PATCHed) by CL2 every invocation — ReplicaSet generation + # stays stable across replica changes because .spec.template is not + # being modified (no rolling restart). + Replicas: {{$replicas}} + Group: {{$group}} + RestartGeneration: 0 + - basename: {{$basename}} + objectTemplatePath: /modules/event-throughput-service.yaml + templateFillMap: + Group: {{$group}} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churn-combined.yaml b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-combined.yaml new file mode 100644 index 0000000000..e5649e4c73 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-combined.yaml @@ -0,0 +1,221 @@ +name: clustermesh-node-churn-combined + +# Scale scenario #3 (Node Churn / IP Churn) — combined flavor. +# +# Both spec stimuli (scale + replace) driven serially by the SAME +# host-side node-churner.sh invocation (mode=node-churn-combined), +# against the same provisioned clusters. Used for share-infra runs to +# maximize signal per expensive n=20 provision lifecycle. +# +# Sequence on the host (executed by node-churner.sh): +# 1. Wait for ready-sentinels from all clusters. +# 2. Run scale phase ($NODE_CHURN_CYCLES cycles of ±$NODE_CHURN_DELTA). +# 3. Settle $NODE_CHURN_SETTLE_SECONDS. +# 4. Run replace phase (drain + VMSS delete K instances, wait refill). +# 5. EXIT trap restores pool to original_node_count. +# +# CL2-side behavior is identical to node-churn-scale.yaml / +# node-churn-replace.yaml — workload deploy + ready-sentinel + sleep + +# gather — but with a longer sleep window equal to scale + replace +# phase walltimes summed plus settle margin. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} + +# Default 3300s = 55min: 30min scale phase + 25min replace phase + margin. 
+{{$combinedDurationSeconds := DefaultParam .CL2_NODE_CHURN_COMBINED_DURATION_SECONDS 3300}} + +{{$group := "clustermesh-node-churn-combined"}} +{{$basename := "ncc"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ncc + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ncc" + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + - name: Start tracking node-churn-combined Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Wait for initial node-churn-combined pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before node-churn-combined stimulus window + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + - name: Signal CL2 ready to host-side node-churner + measurements: + - Identifier: NodeChurnReadySentinel + Method: Exec + Params: + streamOutput: true + timeout: 30s + command: + - bash + - /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh + - /root/perf-tests/clusterloader2/config/sentinels + + - name: Observe node-churn-combined stimulus window + measurements: + - Identifier: NodeChurnObservationSleep + Method: Sleep + Params: + duration: {{$combinedDurationSeconds}}s + + - name: Wait for post-node-churn-combined pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined-final + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - name: Final wait for pods to converge after node-churn-combined + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined-final + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after 
node-churn-combined + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: gather + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churn-replace.yaml b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-replace.yaml new file mode 100644 index 0000000000..58a27c2cd5 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-replace.yaml @@ -0,0 +1,228 @@ +name: clustermesh-node-churn-replace + +# Scale scenario #3 (Node Churn / IP Churn) — node-replacement flavor. +# +# Spec mapping (scale testing.txt:68-79): +# * "Node replacement (new IPs)" / "Force node recreation" → this file. +# * "Node scale-up/scale-down" / "Add/remove nodes continuously" → node-churn-scale.yaml. +# +# Stimulus mechanism: host-side node-churner.sh DRAINS K nodes (via kubectl) +# then DELETES their VMSS instances (via `az vmss delete-instances`). AKS +# nodepool desired-count stays fixed (auto_scaling_enabled=false) so VMSS +# auto-replaces deleted instances with brand-new VMs that get brand-new +# private IPs. Result: K nodes effectively replaced with new identity + +# new IPs, same total count. Pre/post InternalIP snapshots in the timing +# JSON let Kusto verify the IP set actually churned. +# +# Why VMSS delete-instances rather than `az aks nodepool upgrade --node-image-only`: +# rubber-duck design review #2 — the upgrade short-circuits as a no-op +# when the node image is already current, producing zero IP churn signal. +# VMSS instance delete is mechanism-pure: deleted = gone, replacement = +# new VM with new private IP, every time. +# +# CL2-side behavior is symmetric with node-churn-scale: every cluster +# deploys workload, signals ready-sentinel, sleeps for +# CL2_NODE_CHURN_REPLACE_DURATION_SECONDS, gathers. See node-churn-scale.yaml +# for the per-step rationale. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} + +# Default 1500s = 25min covers VMSS delete-and-refill for K=10 instances +# in parallel: each drain ≤ 5min + parallel VMSS provisioning ≤ 15min. 
+{{$replaceDurationSeconds := DefaultParam .CL2_NODE_CHURN_REPLACE_DURATION_SECONDS 1500}} + +{{$group := "clustermesh-node-churn-replace"}} +{{$basename := "ncr"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ncr + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ncr" + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + - name: Start tracking node-churn-replace Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Wait for initial node-churn-replace pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before node-churn-replace stimulus window + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + - name: Signal CL2 ready to host-side node-churner + measurements: + - Identifier: NodeChurnReadySentinel + Method: Exec + Params: + streamOutput: true + timeout: 30s + command: + - bash + - /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh + - /root/perf-tests/clusterloader2/config/sentinels + + - name: Observe node-churn-replace stimulus window + measurements: + - Identifier: NodeChurnObservationSleep + Method: Sleep + Params: + duration: {{$replaceDurationSeconds}}s + + - name: Wait for post-node-churn-replace pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace-final + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - name: Final wait for pods to converge after node-churn-replace + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace-final + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after 
node-churn-replace + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: gather + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churn-scale.yaml b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-scale.yaml new file mode 100644 index 0000000000..62ae135801 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-scale.yaml @@ -0,0 +1,248 @@ +name: clustermesh-node-churn-scale + +# Scale scenario #3 (Node Churn / IP Churn) — scale-cycle flavor. +# +# Spec mapping (scale testing.txt:68-79): +# * "Node scale-up/scale-down" / "Add/remove nodes continuously" → this file. +# * "Node replacement (new IPs)" / "Force node recreation" → node-churn-replace.yaml. +# +# CRITICAL: the actual node-scaling stimulus is driven OUTSIDE CL2 by +# node-churner.sh (launched from steps/engine/clusterloader2/clustermesh-scale/execute.yml +# as a background subshell on the AzDO agent). Reason: the CL2 docker image +# (ghcr.io/azure/clusterloader2) has no `az` CLI and we don't control its +# build. Every cluster's CL2 just deploys a baseline pod workload, registers +# measurements, writes a ready-sentinel, then SLEEPS for +# CL2_NODE_CHURN_SCALE_DURATION_SECONDS — long enough for the churner to do +# its work + a settle window. After the sleep, gather + teardown. +# +# Per-cluster ready-sentinel: +# The "Signal ready to host churner" step writes +# /root/perf-tests/clusterloader2/config/sentinels/ready- via +# Method:Exec. The host-side node-churner.sh polls this dir for +# $cluster_count sentinels before firing its first nodepool op. Without +# this barrier, the churner could fire before peers' Prometheus is +# scraping — losing the propagation signal (rubber-duck design review #1). + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} + +# Sleep window — must be ≥ host-side churner's expected wall time. +# Default 1800s = 30min covers 3 cycles × 2 ops × ~4min = 24min churner + +# settle margin. Per-tier overrides via matrix var +# node_churn_scale_duration_seconds (auto-exported). 
+{{$scaleDurationSeconds := DefaultParam .CL2_NODE_CHURN_SCALE_DURATION_SECONDS 1800}} + +{{$group := "clustermesh-node-churn-scale"}} +{{$basename := "ncs"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ncs + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # See pod-churn-scale.yaml header for full context. Without this, + # cross-cluster identity/endpoint propagation is structurally 0. + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ncs" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy: pause pods spread across nodes so node churn ----- + # ----- naturally evicts a representative sample. topologySpread comes ----- + # ----- from pod-churn-workload.yaml's default Deployment shape (NOT a ----- + # ----- new module) — rubber-duck #8 noted distribution risk but the ----- + # ----- reused workload template already has it. ----- + - name: Start tracking node-churn-scale Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Wait for initial node-churn-scale pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before node-churn stimulus window + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- Signal ready to host-side node-churner.sh ----- + # bind-mounted config dir = /root/perf-tests/clusterloader2/config in the + # CL2 container == $CL2_CONFIG_DIR on the host. The sentinels/ subdir is + # pre-created by execute.yml; we write one file per cluster named after + # the kubectl context. node-churner.sh polls for $cluster_count files + # before its first nodepool op. 
+ - name: Signal CL2 ready to host-side node-churner + measurements: + - Identifier: NodeChurnReadySentinel + Method: Exec + Params: + streamOutput: true + timeout: 30s + command: + - bash + - /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh + - /root/perf-tests/clusterloader2/config/sentinels + + # ----- Sleep window — host-side node-churner.sh churns nodes on target ----- + # ----- cluster during this period; peers observe via measurements. ----- + - name: Observe node-churn stimulus window + measurements: + - Identifier: NodeChurnObservationSleep + Method: Sleep + Params: + duration: {{$scaleDurationSeconds}}s + + # ----- Final convergence ----- + - name: Wait for post-node-churn pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale-final + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - name: Final wait for pods to converge after node-churn + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale-final + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after node-churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh b/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh new file mode 100755 index 0000000000..3c00b0d96a --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh @@ -0,0 +1,1012 @@ +#!/bin/bash +# Scale scenario #3 (Node Churn / IP Churn) — drives node-level perturbation +# on the target cluster while CL2 measures across all clusters. +# +# Why this runs OUTSIDE CL2 (from execute.yml, NOT Method:Exec): +# The CL2 docker image (ghcr.io/azure/clusterloader2) has no `az` CLI and +# we don't control its build. `az` is a Python wheel with hundreds of MB +# of dependencies; pre-staging it the way we pre-stage the single-binary +# `kubectl` isn't feasible. So this script runs on the AzDO agent in a +# background subshell launched from execute.yml, in PARALLEL with the +# CL2 fanout (execute-parallel). CL2 on every cluster deploys baseline +# workload + measurements and sleeps for the scenario's duration window; +# the host-side churner drives the actual node ops; they meet again when +# execute.yml `wait`s for the churner PID after execute-parallel returns. 
+#
+# Spec mapping (scale testing.txt:68-79):
+#   * "Node scale-up/scale-down" + "Add/remove nodes continuously" → SCALE
+#     scenario: cycle target's `default` pool count ±$DELTA for $CYCLES.
+#   * "Node replacement (new IPs)" + "Force node recreation" → REPLACE
+#     scenario: drain K nodes; `az vmss delete-instances` drops VMSS capacity
+#     by K; then explicitly `az aks nodepool scale --node-count $ORIGINAL`
+#     to refill (AKS doesn't auto-refill after delete-instances — build 67133
+#     lesson). VMSS picks the next available instance IDs and provisions
+#     brand-new VMs with brand-new private IPs.
+#   * "Observe: IP update propagation, Temporary inconsistency windows" →
+#     pre/post node InternalIP snapshots, per-op duration, observed node
+#     count post-op. Peer-side propagation is captured by the parallel
+#     CL2 measurements (cilium / clustermesh-metrics / node-churn.yaml).
+#
+# Sentinel-based readiness barrier (rubber-duck design review blocker #1):
+#   Per-cluster CL2 writes $SENTINEL_DIR/ready-<context> as the FIRST
+#   measurement step. The churner waits up to NODE_CHURN_READY_TIMEOUT_SECONDS
+#   for ALL $CLUSTER_COUNT sentinels before the first nodepool op, so peers
+#   are confirmed observing before stimulus begins. If quorum isn't reached,
+#   the churner aborts WITH cleanup (restores the pool to its original count)
+#   and emits scenario_valid=false so Kusto queries can drop the run.
+#
+# Trap-based finalizer (rubber-duck blocker #4):
+#   An EXIT trap unconditionally restores the target pool to the original
+#   node count and waits for Succeeded + Ready, capped at
+#   NODE_CHURN_FINALIZER_TIMEOUT_SECONDS. If the finalizer can't restore, it
+#   emits cleanup_failed=true and execute.yml breaks out of the share-infra
+#   loop (no further scenarios run on a half-scaled cluster).
+#
+# Positional args (passed by execute.yml):
+#   $1  SCENARIO                      node-churn-{scale,replace,combined}
+#   $2  TARGET_CLUSTER_NAME           AKS cluster name (== kubectl context)
+#   $3  TARGET_RESOURCE_GROUP         AKS RG (same RG as `az aks show`)
+#   $4  TARGET_NODEPOOL               workload pool name (always `default`)
+#   $5  REPORT_DIR                    absolute path; timing JSON lands here
+#   $6  SENTINEL_DIR                  absolute path; CL2 writes sentinels here
+#   $7  CLUSTER_COUNT                 expected number of ready sentinels
+#   $8  NODE_CHURN_CYCLES             SCALE: cycles of (up+down)
+#   $9  NODE_CHURN_DELTA              SCALE: ±N per half-cycle
+#   $10 NODE_CHURN_SETTLE_SECONDS     sleep between ops
+#   $11 NODE_REPLACE_BATCH_SIZE       REPLACE: # of VMSS instances to delete
+#   $12 NODE_CHURN_READY_TIMEOUT_SECONDS  ready-sentinel poll timeout
+#   $13 EXPECTED_DURATION_SECONDS     CL2's matching sleep window
+#   $14 TARGET_KUBECONFIG             absolute path to the target's kubeconfig
+#                                     (from $HOME/.kube/mesh-<n>.config; passed
+#                                     explicitly so we don't have to derive the
+#                                     role from target_cluster_name)
+#
+# Exit codes:
+#   0 — always (soft-fail). The timing JSON's scenario_valid / cleanup_failed /
+#       per-op succeeded flags are the load-bearing signals. Exiting non-zero
+#       would cascade-fail the CL2 step → AzDO marks the step failed → collect
+#       still runs (because execute.yml's share-infra loop also soft-fails)
+#       but the AzDO UI gets noisier than the actual data quality.
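+#
+# Example invocation (illustrative values only; argument order matches the
+# positional contract above):
+#   ./node-churner.sh node-churn-scale clustermesh-1 my-rg default \
+#     /data/results /data/cl2-config/sentinels 3 3 5 60 10 300 1500 \
+#     "$HOME/.kube/mesh-1.config"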
+
+set -uo pipefail
+
+SCENARIO="${1:?scenario required: node-churn-scale|node-churn-replace|node-churn-combined}"
+TARGET_CLUSTER_NAME="${2:?target cluster name required}"
+TARGET_RESOURCE_GROUP="${3:?target resource group required}"
+TARGET_NODEPOOL="${4:-default}"
+REPORT_DIR="${5:?report dir required}"
+SENTINEL_DIR="${6:?sentinel dir required}"
+CLUSTER_COUNT="${7:?cluster count required}"
+NODE_CHURN_CYCLES="${8:-3}"
+NODE_CHURN_DELTA="${9:-5}"
+NODE_CHURN_SETTLE_SECONDS="${10:-60}"
+NODE_REPLACE_BATCH_SIZE="${11:-10}"
+NODE_CHURN_READY_TIMEOUT_SECONDS="${12:-300}"
+EXPECTED_DURATION_SECONDS="${13:-1500}"
+TARGET_KUBECONFIG="${14:-}"
+
+# Internal bounds (not exposed via positional args — fine-tuned per scenario
+# class, not per matrix entry).
+NODE_CHURN_OP_TIMEOUT_SECONDS=900        # per `az aks nodepool scale` op
+NODE_CHURN_FINALIZER_TIMEOUT_SECONDS=900 # cleanup pool restore
+NODE_REPLACE_DRAIN_TIMEOUT_SECONDS=300   # per node drain
+NODE_REPLACE_WAIT_TIMEOUT_SECONDS=1500   # for kubelet Ready after refill
+                                         # (build 67133: bumped 1200→1500 —
+                                         # refill provisioning + bootstrap can
+                                         # take 12-15 min on a fresh VM)
+
+mkdir -p "$REPORT_DIR" "$SENTINEL_DIR"
+TIMING_FILE="${REPORT_DIR}/NodeChurnTimings_${TARGET_CLUSTER_NAME}.json"
+
+log() {
+  echo "node-churner: $*"
+}
+
+err() {
+  echo "node-churner ERROR: $*" >&2
+}
+
+# Resolve kubectl — prefer PATH; fall back to the pre-staged binary that
+# execute.yml puts at $CL2_CONFIG_DIR/kubectl for Method:Exec scripts. The
+# host AzDO agent should already have kubectl, but we don't want a brittle
+# dependency on agent image version. SENTINEL_DIR is $CL2_CONFIG_DIR/sentinels
+# by execute.yml's convention, so its parent is $CL2_CONFIG_DIR.
+if command -v kubectl >/dev/null 2>&1; then
+  KUBECTL=kubectl
+elif [ -x "${SENTINEL_DIR%/sentinels*}/kubectl" ]; then
+  KUBECTL="${SENTINEL_DIR%/sentinels*}/kubectl"
+  log "using pre-staged kubectl at ${KUBECTL}"
+else
+  err "kubectl not in PATH and no pre-staged binary found at ${SENTINEL_DIR%/sentinels*}/kubectl"
+  KUBECTL=""
+fi
+
+if ! command -v az >/dev/null 2>&1; then
+  err "az CLI not in PATH on AzDO agent — cannot run node-churn scenario; aborting"
+  # Minimal abort record — a field subset of write_timing_file's output.
+  cat > "$TIMING_FILE" <<EOF
+{"scenario":"${SCENARIO}","target_cluster_name":"${TARGET_CLUSTER_NAME}","scenario_valid":false,"cleanup_failed":false,"abort_reason":"az CLI not in PATH on agent","ops":[]}
+EOF
+  exit 0
+fi
+
+if ! command -v jq >/dev/null 2>&1; then
+  err "jq not in PATH on AzDO agent — required for timing JSON construction; aborting"
+  # We can't use jq for the partial JSON, but an inline heredoc doesn't
+  # depend on jq.
+  cat > "$TIMING_FILE" <<EOF
+{"scenario":"${SCENARIO}","target_cluster_name":"${TARGET_CLUSTER_NAME}","scenario_valid":false,"cleanup_failed":false,"abort_reason":"jq not in PATH on agent","ops":[]}
+EOF
+  exit 0
+fi
+
+DEBUG_LOG="${REPORT_DIR}/NodeChurnDebug_${TARGET_CLUSTER_NAME}.log"
+: > "$DEBUG_LOG"
+
+# State vars referenced by debug_dump — initialized early so any abort
+# path (before main scenario dispatch) can call debug_dump safely under
+# `set -u`. They're re-initialized to their authoritative values later
+# when the scenario actually runs.
+STARTED_EPOCH=$(date +%s) +READY_QUORUM_REACHED=false +SCENARIO_VALID=true +CLEANUP_FAILED=false +TRUNCATED=false +CIRCUIT_BROKEN=false +OPS_JSON='[]' +ORIGINAL_NODE_COUNT=0 +NODE_RESOURCE_GROUP="" +TARGET_VMSS="" + +debug_dump() { + local _label="$1" + { + echo "" + echo "================================================================" + echo "=== ${_label} at $(date -u +"%Y-%m-%dT%H:%M:%SZ") (epoch=$(date +%s))" + echo "================================================================" + echo "-- runtime params --" + echo "scenario=${SCENARIO} target_cluster_name=${TARGET_CLUSTER_NAME} target_rg=${TARGET_RESOURCE_GROUP}" + echo "target_nodepool=${TARGET_NODEPOOL} target_vmss=${TARGET_VMSS:-unset} NRG=${NODE_RESOURCE_GROUP:-unset}" + echo "original_node_count=${ORIGINAL_NODE_COUNT:-unset} cluster_count_quorum=${CLUSTER_COUNT}" + echo "ready_quorum_reached=${READY_QUORUM_REACHED} scenario_valid=${SCENARIO_VALID} circuit_broken=${CIRCUIT_BROKEN} cleanup_failed=${CLEANUP_FAILED} truncated=${TRUNCATED}" + echo "TARGET_KUBECONFIG=${TARGET_KUBECONFIG:-unset} KUBECTL=${KUBECTL:-unset}" + echo "" + echo "-- sentinel dir listing (${SENTINEL_DIR}) --" + ls -la "$SENTINEL_DIR" 2>&1 || echo "(ls failed)" + echo "" + echo "-- az aks nodepool show (target) --" + az aks nodepool show \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --query '{count:count, provisioningState:provisioningState, powerState:powerState, vmSize:vmSize}' \ + -o json 2>&1 || echo "(az aks nodepool show failed)" + echo "" + if [ -n "${TARGET_VMSS:-}" ] && [ -n "${NODE_RESOURCE_GROUP:-}" ]; then + echo "-- az vmss show (target VMSS sku.capacity) --" + az vmss show --resource-group "$NODE_RESOURCE_GROUP" --name "$TARGET_VMSS" \ + --query '{capacity:sku.capacity, provisioningState:provisioningState}' \ + -o json 2>&1 || echo "(az vmss show failed)" + echo "" + echo "-- az vmss list-instances (count + ids) --" + az vmss list-instances --resource-group "$NODE_RESOURCE_GROUP" --name "$TARGET_VMSS" \ + --query 'length([])' -o tsv 2>&1 || echo "(az vmss list-instances failed)" + fi + echo "" + if [ -n "${KUBECTL:-}" ] && [ -n "${TARGET_KUBECONFIG:-}" ] && [ -f "$TARGET_KUBECONFIG" ]; then + echo "-- kubectl get nodes (target cluster) --" + KUBECONFIG="$TARGET_KUBECONFIG" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -l "agentpool=${TARGET_NODEPOOL}" -o wide 2>&1 | head -30 || echo "(kubectl get nodes failed)" + echo "" + echo "-- target node internal IPs --" + KUBECONFIG="$TARGET_KUBECONFIG" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -l "agentpool=${TARGET_NODEPOOL}" \ + -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}' 2>&1 || true + else + echo "-- kubectl skipped (no KUBECTL or kubeconfig) --" + fi + echo "" + echo "-- ops recorded so far --" + echo "$OPS_JSON" | jq -r '.[] | "op#\(.op_index) \(.op_type) succeeded=\(.succeeded) duration=\(.duration_seconds)s observed_nodes=\(.observed_node_count) error=\"\(.error)\""' 2>&1 || echo "$OPS_JSON" + echo "================================================================" + echo "" + } | tee -a "$DEBUG_LOG" +} + +# write_aborted_timing — emit a minimal timing JSON for any early-exit +# code path (az missing, jq missing, can't resolve nodepool / VMSS, etc.) +# so collect.py picks up evidence that the scenario was attempted. 
+write_aborted_timing() {
+  local _msg="$1"
+  local _now
+  _now=$(date +%s)
+  # Minimal abort record — a field subset of write_timing_file's output.
+  cat > "$TIMING_FILE" <<EOF
+{"scenario":"${SCENARIO}","target_cluster_name":"${TARGET_CLUSTER_NAME}","target_nodepool":"${TARGET_NODEPOOL}","scenario_valid":false,"cleanup_failed":false,"abort_reason":"${_msg}","started_epoch":${STARTED_EPOCH},"ended_epoch":${_now},"ops":[]}
+EOF
+}
+
+# Resolve the pool's current node count up front — it's the restore target
+# the finalizer scales back to, so every later op depends on it.
+ORIGINAL_NODE_COUNT=$(az aks nodepool show \
+  --cluster-name "$TARGET_CLUSTER_NAME" \
+  --resource-group "$TARGET_RESOURCE_GROUP" \
+  --name "$TARGET_NODEPOOL" \
+  --query count -o tsv 2>/dev/null || echo "")
+if [ -z "$ORIGINAL_NODE_COUNT" ] || ! [[ "$ORIGINAL_NODE_COUNT" =~ ^[0-9]+$ ]]; then
+  err "could not resolve original node count for ${TARGET_CLUSTER_NAME}/${TARGET_NODEPOOL}; aborting"
+  write_aborted_timing "could not resolve original node count for ${TARGET_CLUSTER_NAME}/${TARGET_NODEPOOL}"
+  exit 0
+fi
+log "original node count = ${ORIGINAL_NODE_COUNT}"
+
+# AKS puts the VMSS in the node resource group ("MC_<rg>_<cluster>_<region>").
+NODE_RESOURCE_GROUP=$(az aks show \
+  --resource-group "$TARGET_RESOURCE_GROUP" \
+  --name "$TARGET_CLUSTER_NAME" \
+  --query nodeResourceGroup -o tsv 2>/dev/null || echo "")
+if [ -z "$NODE_RESOURCE_GROUP" ]; then
+  err "could not resolve nodeResourceGroup for ${TARGET_CLUSTER_NAME}; aborting"
+  write_aborted_timing "could not resolve nodeResourceGroup for ${TARGET_CLUSTER_NAME}"
+  exit 0
+fi
+
+# Discover the VMSS backing this nodepool. AKS tags the VMSS with
+# aks-managed-poolName=<pool>. Exactly one match expected.
+TARGET_VMSS=$(az vmss list \
+  --resource-group "$NODE_RESOURCE_GROUP" \
+  --query "[?tags.\"aks-managed-poolName\"=='${TARGET_NODEPOOL}'].name | [0]" \
+  -o tsv 2>/dev/null || echo "")
+if [ -z "$TARGET_VMSS" ]; then
+  err "could not resolve VMSS for pool ${TARGET_NODEPOOL} in ${NODE_RESOURCE_GROUP}; aborting"
+  write_aborted_timing "could not resolve VMSS for pool ${TARGET_NODEPOOL} in ${NODE_RESOURCE_GROUP}"
+  exit 0
+fi
+log "target VMSS=${TARGET_VMSS} in NRG=${NODE_RESOURCE_GROUP}"
+
+# -----------------------------------------------------------------------------
+# Timing-JSON accumulator. We keep state in shell vars + an ops jq array, and
+# rewrite the timing file at every milestone so a crashed/SIGKILL'd run still
+# leaves a partial-state file behind.
+#
+# Note: STARTED_EPOCH / *_FAILED / *_VALID / OPS_JSON are already initialized
+# above (right after DEBUG_LOG) so debug_dump is callable from any early-exit
+# path. Don't re-initialize here.
+# ----------------------------------------------------------------------------- + +write_timing_file() { + local _ended _dur + _ended=$(date +%s) + _dur=$(( _ended - STARTED_EPOCH )) + jq -n \ + --arg scenario "$SCENARIO" \ + --arg target_context "$TARGET_CLUSTER_NAME" \ + --arg target_cluster_name "$TARGET_CLUSTER_NAME" \ + --arg target_resource_group "$TARGET_RESOURCE_GROUP" \ + --arg target_nodepool "$TARGET_NODEPOOL" \ + --arg target_node_resource_group "$NODE_RESOURCE_GROUP" \ + --arg target_vmss "$TARGET_VMSS" \ + --argjson original_node_count "$ORIGINAL_NODE_COUNT" \ + --argjson ready_quorum_reached "$READY_QUORUM_REACHED" \ + --argjson scenario_valid "$SCENARIO_VALID" \ + --argjson cleanup_failed "$CLEANUP_FAILED" \ + --argjson truncated "$TRUNCATED" \ + --argjson started_epoch "$STARTED_EPOCH" \ + --argjson ended_epoch "$_ended" \ + --argjson duration_seconds "$_dur" \ + --argjson ops "$OPS_JSON" \ + '{scenario:$scenario, target_context:$target_context, + target_cluster_name:$target_cluster_name, + target_resource_group:$target_resource_group, + target_nodepool:$target_nodepool, + target_node_resource_group:$target_node_resource_group, + target_vmss:$target_vmss, + original_node_count:$original_node_count, + ready_quorum_reached:$ready_quorum_reached, + scenario_valid:$scenario_valid, + cleanup_failed:$cleanup_failed, + truncated:$truncated, + started_epoch:$started_epoch, + ended_epoch:$ended_epoch, + duration_seconds:$duration_seconds, + ops:$ops}' > "${TIMING_FILE}.tmp" && mv "${TIMING_FILE}.tmp" "$TIMING_FILE" +} + +# Append one op record to OPS_JSON. Args: +# $1 op_index, $2 op_type, $3 start_epoch, $4 end_epoch, +# $5 succeeded (true|false), $6 observed_node_count, +# $7 pre_state_json — JSON object {"ips":[...], "names":[...]} ('{}' = empty) +# $8 post_state_json — JSON object {"ips":[...], "names":[...]} ('{}' = empty) +# $9 error_message (empty string OK) +# +# Build 67155 lesson: pre_ip_set/post_ip_set alone is a FLAWED replacement +# signal because Azure VNet allocator immediately reuses freed private IPs +# (we deleted vmss-instance 19 at 10.1.0.19; the replacement got 10.1.0.19 +# again). Authoritative signal is NODE NAME delta (VMSS instance IDs are +# monotonic — vmss00000j → vmss00000k — not reused). jq below computes +# BOTH new_ip_count and new_node_count; downstream queries should prefer +# new_node_count for "did replacement actually happen". +record_op() { + local _idx="$1" _type="$2" _t0="$3" _t1="$4" _ok="$5" _ncount="$6" + local _pre="$7" _post="$8" _err="${9:-}" + local _dur=$(( _t1 - _t0 )) + OPS_JSON=$(jq -c \ + --argjson idx "$_idx" \ + --arg type "$_type" \ + --argjson t0 "$_t0" \ + --argjson t1 "$_t1" \ + --argjson dur "$_dur" \ + --argjson ok "$_ok" \ + --argjson ncount "$_ncount" \ + --argjson pre "$_pre" \ + --argjson post "$_post" \ + --arg err "$_err" \ + '. + [{ + op_index:$idx, op_type:$type, start_epoch:$t0, end_epoch:$t1, + duration_seconds:$dur, succeeded:$ok, observed_node_count:$ncount, + pre_ip_set: ($pre.ips // []), + post_ip_set: ($post.ips // []), + pre_node_names: ($pre.names // []), + post_node_names: ($post.names // []), + new_ip_count: ([($post.ips // [])[] | select(. as $p | (($pre.ips // []) | index($p)) | not)] | length), + new_node_count: ([($post.names // [])[] | select(. as $p | (($pre.names // []) | index($p)) | not)] | length), + error:$err + }]' \ + <<< "$OPS_JSON") + write_timing_file +} + +# Wait for VMSS provisioningState=Succeeded with timeout. Returns 0 on success, +# 1 on timeout. Polls every 10s. 
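+# (Despite the name, it polls the nodepool's provisioningState via
+# `az aks nodepool show`, i.e. the AKS-side view of the underlying VMSS op.)
+# Callers use it both as a pre-op gate and as circuit-breaker input; the
+# pattern repeated below is roughly:
+#   if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then
+#     CIRCUIT_BROKEN=true; SCENARIO_VALID=false   # skip remaining ops
+#   fi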
+wait_vmss_succeeded() { + local _timeout="${1:-$NODE_CHURN_OP_TIMEOUT_SECONDS}" + local _deadline=$(( $(date +%s) + _timeout )) + while [ "$(date +%s)" -lt "$_deadline" ]; do + local _state + _state=$(az aks nodepool show \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --query provisioningState -o tsv 2>/dev/null || echo "Unknown") + if [ "$_state" = "Succeeded" ]; then + return 0 + fi + sleep 10 + done + return 1 +} + +# Resolve target kubeconfig — TARGET_KUBECONFIG (positional arg 14) is +# the authoritative path passed by execute.yml from clusters_with_kubeconfig. +# Fallbacks (legacy / robustness) below. +resolve_target_kubeconfig() { + local _kc="$TARGET_KUBECONFIG" + if [ -n "$_kc" ] && [ -f "$_kc" ]; then + echo "$_kc"; return + fi + _kc="$HOME/.kube/mesh-${TARGET_CLUSTER_NAME#clustermesh-}.config" + if [ -f "$_kc" ]; then + echo "$_kc"; return + fi + _kc="$HOME/.kube/config" + if [ -f "$_kc" ]; then + echo "$_kc"; return + fi + echo "" +} + +# Run `kubectl get nodes -o json` against the target cluster, capturing +# BOTH stdout and stderr. Logs stderr to DEBUG_LOG so we can postmortem +# failure modes (auth errors, network, label-selector drift) — build +# 67126 lost this visibility because the old kubectl invocations had +# `2>/dev/null`. +# +# Returns 0 on success and prints the JSON to stdout; returns 1 on +# kubectl failure and prints nothing. +target_kubectl_get_nodes_json() { + local _kc _out _rc + _kc=$(resolve_target_kubeconfig) + if [ -z "$_kc" ] || [ -z "$KUBECTL" ]; then + { + echo "===== kubectl get nodes: NO kubeconfig/kubectl ($(date -u +%FT%TZ)) =====" + echo "TARGET_KUBECONFIG=${TARGET_KUBECONFIG:-unset}" + echo "resolved=${_kc:-empty} KUBECTL=${KUBECTL:-empty}" + } >> "$DEBUG_LOG" + return 1 + fi + _out=$(KUBECONFIG="$_kc" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -o json 2>>"$DEBUG_LOG") + _rc=$? + if [ "$_rc" -ne 0 ] || [ -z "$_out" ]; then + { + echo "===== kubectl get nodes FAILED rc=${_rc} at $(date -u +%FT%TZ) =====" + echo "kubeconfig=${_kc} context=${TARGET_CLUSTER_NAME}" + echo "(stderr appended above by 2>>)" + } >> "$DEBUG_LOG" + return 1 + fi + echo "$_out" + return 0 +} + +# Filter nodes by TARGET_VMSS providerID — robust against AKS agentpool +# label key drift (newer AKS clusters prefer kubernetes.azure.com/agentpool +# over the legacy `agentpool` key). VMSS name is unique within the cluster +# and exact-match; also implicitly excludes prompool VMSS. +# +# Emits "node_name vmss_instance_id" lines on stdout, one per matched node. +target_nodes_in_target_vmss() { + local _json + _json=$(target_kubectl_get_nodes_json) || return 1 + echo "$_json" | jq -r --arg vmss "$TARGET_VMSS" ' + .items[] + | select(.spec.providerID + | contains("/virtualMachineScaleSets/" + $vmss + "/virtualMachines/")) + | "\(.metadata.name) " + (.spec.providerID | split("/virtualMachines/")[1]) + ' 2>>"$DEBUG_LOG" +} + +# Observe current node count on target cluster from K8s side. Returns "" on +# kubectl failure — caller treats as "unknown observed count". +observe_node_count() { + local _lines + _lines=$(target_nodes_in_target_vmss) || { echo ""; return; } + echo "$_lines" | grep -c . | tr -d ' ' +} + +# Snapshot current Internal IPs AND node names for nodes in TARGET_VMSS. +# Returns a JSON object {"ips":[...], "names":[...]} on stdout. +# +# Build 67155 lesson: capture BOTH ips and names. 
IPs alone are unreliable +# as a replacement signal because Azure VNet allocator immediately reuses +# freed IPs. VMSS instance IDs (embedded in node names) are monotonic → +# names are the authoritative replacement signal. +# +# On kubectl failure, returns '{"ips":[],"names":[]}' (jq logic later +# handles empty arrays correctly: new_*_count == count of "post" entries). +snapshot_node_state() { + local _json + _json=$(target_kubectl_get_nodes_json) || { echo '{"ips":[],"names":[]}'; return; } + echo "$_json" | jq -c --arg vmss "$TARGET_VMSS" ' + [ .items[] + | select(.spec.providerID + | contains("/virtualMachineScaleSets/" + $vmss + "/virtualMachines/")) + ] as $matched + | { + ips: [$matched[] | .status.addresses[] | select(.type=="InternalIP") | .address], + names: [$matched[] | .metadata.name] + }' 2>>"$DEBUG_LOG" || echo '{"ips":[],"names":[]}' +} + +# Legacy compatibility shim — some call sites only need the IP set. +# New code should prefer snapshot_node_state. +snapshot_node_ips() { + snapshot_node_state | jq -c '.ips' 2>>"$DEBUG_LOG" || echo "[]" +} + +# ----------------------------------------------------------------------------- +# Finalizer — runs on EVERY exit path (trap). Idempotent. +# ----------------------------------------------------------------------------- +finalizer() { + local _exit_rc=$? + log "finalizer: starting (exit_rc=${_exit_rc}); restoring pool to original_node_count=${ORIGINAL_NODE_COUNT}" + local _current + _current=$(az aks nodepool show \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --query count -o tsv 2>/dev/null || echo "$ORIGINAL_NODE_COUNT") + if [ "$_current" = "$ORIGINAL_NODE_COUNT" ]; then + log "finalizer: pool already at original_node_count; checking provisioningState" + if wait_vmss_succeeded "$NODE_CHURN_FINALIZER_TIMEOUT_SECONDS"; then + log "finalizer: pool already restored and Succeeded" + write_timing_file + return 0 + fi + log "finalizer: pool count matches but provisioningState != Succeeded; will explicitly scale to nudge reconcile" + fi + # Build 67170 lesson: prior scale ops may have failed mid-scenario while + # AKS was still Updating. Wait for Succeeded before issuing the explicit + # scale-back-to-original — otherwise this scale fails with the SAME + # OperationNotAllowed error and cleanup_failed=true cascades incorrectly. + if ! wait_vmss_succeeded "$NODE_CHURN_FINALIZER_TIMEOUT_SECONDS"; then + err "finalizer: provisioningState never reached Succeeded within ${NODE_CHURN_FINALIZER_TIMEOUT_SECONDS}s; cannot proceed with restore" + CLEANUP_FAILED=true + debug_dump "FINALIZER cleanup_failed (waited for Succeeded; never got there)" + write_timing_file + return 1 + fi + # Stderr captured to debug log (build 67170 lesson: the prior >/dev/null + # 2>&1 swallowed the real error message; we ended up guessing). + if ! 
az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$ORIGINAL_NODE_COUNT" \ + --no-wait --only-show-errors 2>/tmp/node-churner-finalizer.err; then + local _finalizer_err + _finalizer_err=$(tr '\n' ' ' < /tmp/node-churner-finalizer.err | head -c 500) + err "finalizer: az aks nodepool scale to ${ORIGINAL_NODE_COUNT} failed: ${_finalizer_err}" + echo "===== finalizer az error =====" >> "$DEBUG_LOG" + cat /tmp/node-churner-finalizer.err >> "$DEBUG_LOG" + echo "===== end finalizer az error =====" >> "$DEBUG_LOG" + CLEANUP_FAILED=true + debug_dump "FINALIZER cleanup_failed (az aks nodepool scale to original failed)" + write_timing_file + return 1 + fi + if ! wait_vmss_succeeded "$NODE_CHURN_FINALIZER_TIMEOUT_SECONDS"; then + err "finalizer: pool did NOT reach Succeeded within ${NODE_CHURN_FINALIZER_TIMEOUT_SECONDS}s" + CLEANUP_FAILED=true + debug_dump "FINALIZER cleanup_failed (provisioningState != Succeeded)" + write_timing_file + return 1 + fi + log "finalizer: pool restored to ${ORIGINAL_NODE_COUNT}, Succeeded" + write_timing_file + return 0 +} +trap finalizer EXIT + +# Initial state — write the file so even an early abort leaves a row. +write_timing_file + +# ----------------------------------------------------------------------------- +# Ready-sentinel barrier +# ----------------------------------------------------------------------------- +log "ready-barrier: waiting for ${CLUSTER_COUNT} CL2 sentinel(s) in ${SENTINEL_DIR}" +BARRIER_DEADLINE=$(( $(date +%s) + NODE_CHURN_READY_TIMEOUT_SECONDS )) +while [ "$(date +%s)" -lt "$BARRIER_DEADLINE" ]; do + _count=$(find "$SENTINEL_DIR" -maxdepth 1 -name 'ready-*' -type f 2>/dev/null | wc -l | tr -d ' ') + if [ "$_count" -ge "$CLUSTER_COUNT" ]; then + log "ready-barrier: quorum reached (${_count}/${CLUSTER_COUNT})" + READY_QUORUM_REACHED=true + write_timing_file + break + fi + sleep 5 +done +if [ "$READY_QUORUM_REACHED" != true ]; then + err "ready-barrier: quorum NOT reached after ${NODE_CHURN_READY_TIMEOUT_SECONDS}s (saw ${_count:-0}/${CLUSTER_COUNT}); aborting scenario" + SCENARIO_VALID=false + debug_dump "READY-BARRIER ABORT (saw ${_count:-0}/${CLUSTER_COUNT})" + write_timing_file + exit 0 +fi + +# ----------------------------------------------------------------------------- +# Scenario dispatch +# ----------------------------------------------------------------------------- +OP_INDEX=0 +WALL_DEADLINE=$(( STARTED_EPOCH + EXPECTED_DURATION_SECONDS )) + +run_scale_phase() { + log "scale phase: ${NODE_CHURN_CYCLES} cycles × (up by ${NODE_CHURN_DELTA}, down by ${NODE_CHURN_DELTA})" + local _cur="$ORIGINAL_NODE_COUNT" + for _c in $(seq 1 "$NODE_CHURN_CYCLES"); do + # Circuit breaker — stop if a previous op tripped it. + if [ "$CIRCUIT_BROKEN" = true ]; then + log "scale phase: circuit broken; skipping remaining cycles" + break + fi + # ---- scale UP ---- + local _target=$(( _cur + NODE_CHURN_DELTA )) + OP_INDEX=$(( OP_INDEX + 1 )) + log "cycle ${_c}/${NODE_CHURN_CYCLES} op#${OP_INDEX} scale_up: ${_cur} → ${_target}" + # Build 67170 lesson: `az aks nodepool scale` returns sync to the CLI + # but the underlying managed-cluster RP operation continues async. + # Issuing the next nodepool scale while provisioningState=Updating + # triggers OperationNotAllowed. Always wait for Succeeded first. + if ! 
wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "scale phase: provisioningState != Succeeded before scale_up op#${OP_INDEX}; aborting cycle" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before scale_up op#${OP_INDEX}" + break + fi + local _pre_state + _pre_state=$(snapshot_node_state) + local _t0=$(date +%s) + local _err="" + local _ok=true + if ! az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$_target" \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + # OperationNotAllowed / throttling — structural error, trip circuit breaker. + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "scale phase: structural Azure RP error on scale_up; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on scale_up op#${OP_INDEX} (Azure RP structural error)" + fi + fi + local _t1=$(date +%s) + local _ncount + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + local _post_state + _post_state=$(snapshot_node_state) + record_op "$OP_INDEX" "scale_up" "$_t0" "$_t1" "$_ok" "$_ncount" "$_pre_state" "$_post_state" "$_err" + [ "$_ok" = true ] && _cur="$_target" + sleep "$NODE_CHURN_SETTLE_SECONDS" + + if [ "$CIRCUIT_BROKEN" = true ]; then + break + fi + # ---- scale DOWN ---- + _target=$(( _cur - NODE_CHURN_DELTA )) + if [ "$_target" -lt 1 ]; then _target=1; fi + OP_INDEX=$(( OP_INDEX + 1 )) + log "cycle ${_c}/${NODE_CHURN_CYCLES} op#${OP_INDEX} scale_down: ${_cur} → ${_target}" + if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "scale phase: provisioningState != Succeeded before scale_down op#${OP_INDEX}; aborting cycle" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before scale_down op#${OP_INDEX}" + break + fi + _pre_state=$(snapshot_node_state) + _t0=$(date +%s) + _err="" + _ok=true + if ! az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$_target" \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "scale phase: structural Azure RP error on scale_down; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on scale_down op#${OP_INDEX} (Azure RP structural error)" + fi + fi + _t1=$(date +%s) + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + _post_state=$(snapshot_node_state) + record_op "$OP_INDEX" "scale_down" "$_t0" "$_t1" "$_ok" "$_ncount" "$_pre_state" "$_post_state" "$_err" + [ "$_ok" = true ] && _cur="$_target" + sleep "$NODE_CHURN_SETTLE_SECONDS" + done + log "scale phase: complete (ended at cycle current_count=${_cur})" +} + +run_replace_phase() { + log "replace phase: drain + delete ${NODE_REPLACE_BATCH_SIZE} VMSS instance(s); AKS auto-refills" + if [ -z "$KUBECTL" ]; then + err "replace phase: kubectl unavailable; skipping (cannot drain)" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (KUBECTL unset)" + return + fi + + # ---- 1. 
Pre-snapshot state (IPs + node names) + pick K nodes ---- + # Both ips AND names are recorded so post-run analysis can use whichever + # signal is appropriate. Build 67155 showed IPs are unreliable (Azure + # reuses freed private IPs); node names (VMSS instance suffix) are the + # authoritative replacement marker. + local _pre_state + _pre_state=$(snapshot_node_state) + local _kubeconfig + _kubeconfig=$(resolve_target_kubeconfig) + if [ -z "$_kubeconfig" ]; then + err "replace phase: could not resolve a usable kubeconfig path; aborting" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (no usable kubeconfig)" + return + fi + + # Pick K target VMSS instance ids via the VMSS-providerID filter + # (label-key independent, build 67126 lesson). + local _node_iid_lines + _node_iid_lines=$(target_nodes_in_target_vmss) + if [ -z "$_node_iid_lines" ]; then + err "replace phase: 0 nodes match VMSS=${TARGET_VMSS}; aborting" + # Dump raw kubectl output so postmortem can see WHY (label drift, + # providerID format change, auth blip). + { + echo "===== REPLACE-PHASE no-nodes diagnostic =====" + echo "expected VMSS=${TARGET_VMSS}" + echo "kubeconfig=${_kubeconfig}" + echo "-- kubectl get nodes -o wide (raw, no label filter) --" + KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -o wide 2>&1 | head -50 || true + echo "-- kubectl get nodes -o jsonpath providerID dump --" + KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.spec.providerID}{"\n"}{end}' 2>&1 \ + | head -50 || true + } >> "$DEBUG_LOG" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (0 nodes match VMSS=${TARGET_VMSS})" + return + fi + + # Shuffle and take first K. + local _selected + if command -v shuf >/dev/null 2>&1; then + _selected=$(echo "$_node_iid_lines" | shuf | head -n "$NODE_REPLACE_BATCH_SIZE") + else + _selected=$(echo "$_node_iid_lines" \ + | awk 'BEGIN{srand()} {print rand()" "$0}' \ + | sort -k1,1n | head -n "$NODE_REPLACE_BATCH_SIZE" | cut -d" " -f2-) + fi + local _selected_count + _selected_count=$(echo "$_selected" | wc -l | tr -d ' ') + log "replace phase: selected ${_selected_count} nodes for replacement" + echo "$_selected" | awk '{print " - "$1" (vmss-instance "$2")"}' + + # ---- 2. Drain selected nodes (one Op record per drain) ---- + local _instance_ids_csv="" + while IFS= read -r _line; do + [ -z "$_line" ] && continue + local _node_name="${_line%% *}" + local _instance_id="${_line##* }" + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_drain: ${_node_name} (vmss-instance ${_instance_id})" + local _t0=$(date +%s) + local _err="" + local _ok=true + # Cordon first (idempotent + cheap), then drain. timeout caps per-node + # so a stuck PDB doesn't block the whole batch. + KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + cordon "$_node_name" >/dev/null 2>&1 || true + if ! KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + drain "$_node_name" --ignore-daemonsets --delete-emptydir-data --force \ + --grace-period=30 \ + --timeout="${NODE_REPLACE_DRAIN_TIMEOUT_SECONDS}s" 2>/tmp/node-churner-drain.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-drain.err | head -c 500) + _ok=false + # Drain failure isn't fatal — AKS will still drain the node when we + # delete the VMSS instance underneath. Record and continue. 
+ log "replace phase: drain ${_node_name} returned non-zero; continuing (VMSS delete will force)" + fi + local _t1=$(date +%s) + record_op "$OP_INDEX" "replace_drain" "$_t0" "$_t1" "$_ok" 0 '{}' '{}' "$_err" + if [ -n "$_instance_ids_csv" ]; then + _instance_ids_csv="${_instance_ids_csv} ${_instance_id}" + else + _instance_ids_csv="${_instance_id}" + fi + done <<< "$_selected" + + if [ "$CIRCUIT_BROKEN" = true ]; then + log "replace phase: circuit broken before VMSS delete" + return + fi + if [ -z "$_instance_ids_csv" ]; then + err "replace phase: no instance IDs collected; aborting" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (no instance ids after drain loop)" + return + fi + + # ---- 3. Delete selected VMSS instances in a single batched call ---- + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_delete: deleting VMSS instances [${_instance_ids_csv}]" + # Wait for AKS to settle before issuing the next RP op (build 67170 race fix). + if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "replace phase: provisioningState != Succeeded before replace_delete; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before replace_delete op#${OP_INDEX}" + return + fi + local _t0=$(date +%s) + local _err="" + local _ok=true + # shellcheck disable=SC2086 # word splitting intentional for instance ids + if ! az vmss delete-instances \ + --resource-group "$NODE_RESOURCE_GROUP" \ + --name "$TARGET_VMSS" \ + --instance-ids ${_instance_ids_csv} \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "replace phase: structural Azure RP error on vmss delete-instances; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on replace_delete op#${OP_INDEX} (Azure RP structural error)" + fi + fi + local _t1=$(date +%s) + local _ncount + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + record_op "$OP_INDEX" "replace_delete" "$_t0" "$_t1" "$_ok" "$_ncount" '{}' '{}' "$_err" + + if [ "$CIRCUIT_BROKEN" = true ]; then return; fi + + # ---- 4. Explicit refill via AKS nodepool scale ---- + # Build 67133 lesson: `az vmss delete-instances` drops VMSS capacity by K, + # and AKS observes the drop (nodepool count goes from N to N-K) but does + # NOT auto-refill back to N. The finalizer's `az aks nodepool scale + # --node-count $ORIGINAL` succeeded → so the explicit re-scale IS the + # correct primitive. Run it here as a dedicated op so the timing JSON + # records the refill latency separately from the kubelet-Ready wait. + # + # AKS-side refill picks up the next available VMSS instance ID and + # provisions a brand-new VM with a brand-new InternalIP — exactly the + # IP-churn signal the spec asks for. + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_refill: az aks nodepool scale → ${ORIGINAL_NODE_COUNT} (re-add ${NODE_REPLACE_BATCH_SIZE} replacement(s))" + if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "replace phase: provisioningState != Succeeded before replace_refill; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before replace_refill op#${OP_INDEX}" + return + fi + _t0=$(date +%s) + _err="" + _ok=true + if ! 
az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$ORIGINAL_NODE_COUNT" \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "replace phase: structural Azure RP error on replace_refill; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on replace_refill op#${OP_INDEX} (Azure RP structural error)" + fi + fi + _t1=$(date +%s) + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + record_op "$OP_INDEX" "replace_refill" "$_t0" "$_t1" "$_ok" "$_ncount" '{}' '{}' "$_err" + + if [ "$CIRCUIT_BROKEN" = true ]; then return; fi + + # ---- 5. Wait for K8s Ready node count to return to ORIGINAL ---- + # AKS nodepool scale returns when Azure provisioning is complete, but + # kubelet on the new VM still needs to register + reach Ready. Poll + # kubectl until Ready count == ORIGINAL (not just VMSS provisioningState). + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_wait: waiting for ${ORIGINAL_NODE_COUNT} Ready nodes in pool" + _t0=$(date +%s) + _err="" + _ok=false + local _wait_deadline=$(( _t0 + NODE_REPLACE_WAIT_TIMEOUT_SECONDS )) + local _ready_count=0 + while [ "$(date +%s)" -lt "$_wait_deadline" ]; do + # Count Ready nodes whose providerID is in our target VMSS (label- + # selector-agnostic; build 67126 regression fix). + local _ready_json + _ready_json=$(target_kubectl_get_nodes_json 2>/dev/null) + if [ -n "$_ready_json" ]; then + _ready_count=$(echo "$_ready_json" | jq -r --arg vmss "$TARGET_VMSS" ' + [ .items[] + | select(.spec.providerID | contains("/virtualMachineScaleSets/" + $vmss + "/virtualMachines/")) + | .status.conditions[] + | select(.type=="Ready" and .status=="True") ] | length' 2>/dev/null || echo 0) + else + _ready_count=0 + fi + if [ "$_ready_count" -ge "$ORIGINAL_NODE_COUNT" ]; then + _ok=true + break + fi + sleep 10 + done + _t1=$(date +%s) + local _post_state + _post_state=$(snapshot_node_state) + if [ "$_ok" != true ]; then + _err="replace_wait: timeout after ${NODE_REPLACE_WAIT_TIMEOUT_SECONDS}s; ready=${_ready_count}/${ORIGINAL_NODE_COUNT}" + err "$_err" + SCENARIO_VALID=false + debug_dump "REPLACE_WAIT timeout (ready=${_ready_count}/${ORIGINAL_NODE_COUNT})" + fi + record_op "$OP_INDEX" "replace_wait" "$_t0" "$_t1" "$_ok" "$_ready_count" "$_pre_state" "$_post_state" "$_err" + # Pull new_node_count from the just-recorded op for the summary log line. 
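+  # (record_op just appended that op to OPS_JSON; jq '.[-1]' reads it back.)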
+ local _new_node_count _new_ip_count + _new_node_count=$(echo "$OPS_JSON" | jq -r '.[-1].new_node_count') + _new_ip_count=$(echo "$OPS_JSON" | jq -r '.[-1].new_ip_count') + log "replace phase: complete (new_node_count=${_new_node_count} [authoritative], new_ip_count=${_new_ip_count} [informational; Azure may reuse freed IPs])" +} + +case "$SCENARIO" in + node-churn-scale) + run_scale_phase + ;; + node-churn-replace) + run_replace_phase + ;; + node-churn-combined) + run_scale_phase + if [ "$CIRCUIT_BROKEN" != true ]; then + log "transitioning from scale phase to replace phase" + sleep "$NODE_CHURN_SETTLE_SECONDS" + run_replace_phase + else + log "scale phase circuit-broken; skipping replace phase" + fi + ;; + *) + err "unknown scenario '${SCENARIO}'; expected node-churn-{scale,replace,combined}" + SCENARIO_VALID=false + ;; +esac + +# Truncation check: did we run past CL2's sleep window? +if [ "$(date +%s)" -gt "$WALL_DEADLINE" ]; then + log "WARN: churner ran past CL2 sleep window (${EXPECTED_DURATION_SECONDS}s); peer measurements may be truncated" + TRUNCATED=true +fi + +write_timing_file +log "scenario complete; finalizer will run via EXIT trap" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-combined.yaml b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-combined.yaml new file mode 100644 index 0000000000..7b4a1f8ea1 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-combined.yaml @@ -0,0 +1,330 @@ +name: clustermesh-pod-churn-combined + +# Combined Phase 4a config — single CL2 invocation runs scale-cycle then +# kill against the SAME workload deployment. Goal: extract maximum signal +# per (expensive) n20 provision/destroy lifecycle by exercising both +# stressor flavors of Scenario #2 back-to-back. +# +# Sequence: +# 1. Start measurements (control-plane, cilium, clustermesh-{metrics, +# throughput}, etcd-metrics, pod-churn-stress). +# 2. Deploy PodMonitor. +# 3. Create workload at full replicas + WaitForControlledPodsRunning gate. +# 4. PHASE A — Scale-cycle stress (deterministic): +# $churnCycles iterations of (scale-down 0 → sleep down → scale-up N +# → sleep up). No per-cycle wait; let it churn freely. +# 5. Intermediate WaitForControlledPodsRunning gather + brief settle. +# 6. PHASE B — Kill stress (stochastic): Method: Exec runs +# pod-churn-killer.sh inside the CL2 docker container, deleting +# $killBatch random workload pods every $killIntervalSeconds for +# $killDurationSeconds. ReplicaSet re-creates them, driving the +# failure-driven event path. If kubectl is unavailable in the CL2 +# image (Method: Exec dependency), this measurement returns 127 and +# CL2 marks it failed but the surrounding settle/gather/teardown +# steps still run, preserving Phase A scale-cycle data. +# 7. Final WaitForControlledPodsRunning gather + settle. +# 8. Gather measurements (all modules above). +# 9. Teardown (workload + PodMonitor). +# +# Knob values come from the same CL2_* overrides scale.py writes for the +# split scale/kill scenarios, so the existing matrix-var plumbing in +# steps/engine/clusterloader2/clustermesh-scale/execute.yml works without +# modification. 
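+#
+# Illustrative override shape (hypothetical values; the keys are the
+# DefaultParam lookups declared below):
+#   CL2_CHURN_CYCLES: 5
+#   CL2_CHURN_UP_DURATION: "60s"
+#   CL2_KILL_DURATION_SECONDS: 600
+#   CL2_KILL_INTERVAL_SECONDS: 10
+#   CL2_KILL_BATCH: 5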
+ +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} +{{$churnCycles := DefaultParam .CL2_CHURN_CYCLES 5}} +{{$churnUpDuration := DefaultParam .CL2_CHURN_UP_DURATION "60s"}} +{{$churnDownDuration := DefaultParam .CL2_CHURN_DOWN_DURATION "60s"}} +{{$killDurationSeconds := DefaultParam .CL2_KILL_DURATION_SECONDS 600}} +{{$killIntervalSeconds := DefaultParam .CL2_KILL_INTERVAL_SECONDS 10}} +{{$killBatch := DefaultParam .CL2_KILL_BATCH 5}} +# Method: Exec timeout — must exceed kill duration with margin so the +# loop's deadline check fires before this hard cap. Set to 1.5x kill +# duration as defense-in-depth. +{{$killExecTimeout := DefaultParam .CL2_KILL_EXEC_TIMEOUT "15m"}} + +{{$workloadGroup := "clustermesh-pod-churn-combined"}} +{{$workloadBasename := "pcc"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-pcc + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics (e.g. cilium_clustermesh_global_services) + # are structurally 0 regardless of pod churn. See plan.md note #14 + ACNS + # team confirmation 2026-05-11 (David Vadas / Isaiah Raya). Runs FIRST so + # the annotation is in place before any CiliumIdentity / Endpoint forms. 
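+  #
+  # Per namespace, the script's operation is roughly equivalent to:
+  #   kubectl annotate namespace clustermesh-pcc-1 \
+  #     clustermesh.cilium.io/global=true --overwrite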
+ - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-pcc" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking pod-churn-combined Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-combined-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial pod-churn-combined pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-combined-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before phase A + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- PHASE A: scale-cycle stress ----- + - name: Start tracking pod-churn scale-cycle phase + measurements: + - Identifier: WaitForControlledPodsRunning-phase-a + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + {{range $i := Loop $churnCycles}} + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Phase A cycle {{$i}} — down hold + measurements: + - Identifier: PhaseADownSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnDownDuration}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Phase A cycle {{$i}} — up hold + measurements: + - Identifier: PhaseAUpSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnUpDuration}} + {{end}} + + - name: Wait for post-scale-cycle pods to be Running + measurements: + - Identifier: 
WaitForControlledPodsRunning-phase-a + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Brief settle between Phase A and Phase B + measurements: + - Identifier: InterPhaseSleep + Method: Sleep + Params: + duration: 30s + + # ----- PHASE B: kill stress via Method: Exec ----- + # Method: Exec runs the killer script inside the CL2 docker container. + # The container has /root/.kube/config (the per-cluster kubeconfig) + # mounted by run_cl2_command. The script uses kubectl from $PATH in + # the CL2 image; if missing it exits 127, this measurement is marked + # failed, but subsequent steps (settle, gather, teardown) still run. + - name: Phase B pod-churn kill loop + measurements: + - Identifier: PodChurnKillLoop + Method: Exec + Params: + streamOutput: true + timeout: {{$killExecTimeout}} + command: + - bash + - /root/perf-tests/clusterloader2/config/pod-churn-killer.sh + - "{{$killDurationSeconds}}" + - "{{$killIntervalSeconds}}" + - "{{$killBatch}}" + - "{{$workloadGroup}}" + + # ----- Final convergence ----- + - name: Start tracking post-kill convergence + measurements: + - Identifier: WaitForControlledPodsRunning-post-combined + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - name: Wait for post-kill pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-post-combined + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after combined churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-kill.yaml b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-kill.yaml new file mode 100644 index 0000000000..7055652793 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-kill.yaml @@ -0,0 +1,308 @@ +name: clustermesh-pod-churn-kill + +# Scale scenario #2 (Pod Churn Stress Test) — random pod kill variant. +# +# Spec (scale testing.txt line 64): "Kill pods at random intervals." +# +# This complements pod-churn-scale.yaml: instead of cycling Deployment .spec.replicas +# (deterministic, controller-driven churn), we deploy an in-cluster killer Job +# that picks $killBatch random pods every $killInterval and force-deletes them. 
+# The ReplicaSet immediately re-creates them, exercising the failure-driven +# event path. Both halves of scenario #2 produce overlapping but +# distinguishable mesh signals: scale-cycle is steady-state, predictable; +# kill is bursty, ReplicaSet-driven. +# +# Killer Job runs for ${killDuration}s then exits 0 cleanly. The Job's +# activeDeadlineSeconds is set to killDuration + 60s buffer as a defense-in-depth +# bound. WaitForFinishedJobs gathers the completion signal — no explicit +# delete-and-wait dance. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} +{{$killDuration := DefaultParam .CL2_KILL_DURATION "10m"}} +{{$killIntervalSeconds := DefaultParam .CL2_KILL_INTERVAL_SECONDS 10}} +{{$killBatch := DefaultParam .CL2_KILL_BATCH 5}} +{{$killDurationSeconds := DefaultParam .CL2_KILL_DURATION_SECONDS 600}} +{{$jobDeadlineSeconds := DefaultParam .CL2_KILL_JOB_DEADLINE_SECONDS 660}} +# Hard-coded — repeated below for the killer's --label-selector and the +# workload's group label. Keep these in sync. +{{$workloadGroup := "clustermesh-pod-churn-kill"}} +{{$killerGroup := "clustermesh-pod-churn-killer"}} +{{$workloadBasename := "pck"}} +# bitnami/kubectl image already trusted in this repo (modules/kustomize/fio/.../ds.yaml). +# Ships bash + shuf + xargs + cut + kubectl which the killer script depends on. +{{$killerImage := DefaultParam .CL2_KILLER_IMAGE "telescope.azurecr.io/bitnami/kubectl:v1.33.2"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-pck + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics are structurally 0. See plan.md + # note #14 + ACNS team confirmation 2026-05-11. 
+ - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-pck" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking pod-churn-kill Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-kill + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial pod-churn-kill pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-kill + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before kill + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- Killer deploy ----- + # Distinct basenames per kind so the binding's RoleName/SAName references + # are unambiguous and don't depend on CL2's cross-kind name-collision + # behavior. All four objects share namespace `default` (universal), + # replicasPerNamespace: 1. + - name: Register WaitForFinishedJobs for killer + measurements: + - Identifier: WaitForFinishedJobs-killer + Method: WaitForFinishedJobs + Params: + action: start + labelSelector: group={{$killerGroup}} + # Killer's activeDeadlineSeconds bounds the Job's lifetime; + # this WaitForFinishedJobs timeout has to exceed that with margin + # so the gather doesn't time out while the killer is still inside + # its grace period. 
+ timeout: {{$operationTimeout}} + + - name: Deploy pod-churn killer + phases: + - namespaceList: ["default"] + replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: pck-sa + objectTemplatePath: /modules/pod-churn-killer-sa.yaml + templateFillMap: + Group: {{$killerGroup}} + - basename: pck-cr + objectTemplatePath: /modules/pod-churn-killer-clusterrole.yaml + templateFillMap: + Group: {{$killerGroup}} + - basename: pck-crb + objectTemplatePath: /modules/pod-churn-killer-clusterrolebinding.yaml + templateFillMap: + Group: {{$killerGroup}} + RoleName: pck-cr-1 + SAName: pck-sa-1 + SANamespace: default + - basename: pck-job + objectTemplatePath: /modules/pod-churn-killer-job.yaml + templateFillMap: + Group: {{$killerGroup}} + SAName: pck-sa-1 + Image: {{$killerImage}} + ActiveDeadlineSeconds: {{$jobDeadlineSeconds}} + KillDurationSeconds: {{$killDurationSeconds}} + KillIntervalSeconds: {{$killIntervalSeconds}} + KillBatch: {{$killBatch}} + WorkloadLabelSelector: group={{$workloadGroup}} + + # ----- Wait for the killer to finish its own time-bounded run ----- + # WaitForFinishedJobs blocks until the killer pod's status is Succeeded + # (clean exit 0 on deadline) or Failed (image pull error / RBAC denial / + # script crash). Either way, control returns here and we proceed to + # final reconciliation. We don't explicitly delete the Job — the + # Sleep + WaitForFinishedJobs is the gate. + - name: Wait for killer Job to complete + measurements: + - Identifier: WaitForFinishedJobs-killer + Method: WaitForFinishedJobs + Params: + action: gather + + # ----- Re-register a fresh watcher for the post-kill convergence so the + # final gather only reflects pod reconciliation after the killer stopped. ----- + - name: Start tracking post-kill convergence + measurements: + - Identifier: WaitForControlledPodsRunning-post-kill + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - name: Wait for post-kill pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-post-kill + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after kill + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown: workload + killer (SA/CR/CRB/Job objects). 
----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Tear down killer resources + phases: + - namespaceList: ["default"] + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: pck-sa + objectTemplatePath: /modules/pod-churn-killer-sa.yaml + - basename: pck-cr + objectTemplatePath: /modules/pod-churn-killer-clusterrole.yaml + - basename: pck-crb + objectTemplatePath: /modules/pod-churn-killer-clusterrolebinding.yaml + - basename: pck-job + objectTemplatePath: /modules/pod-churn-killer-job.yaml + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-killer.sh b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-killer.sh new file mode 100755 index 0000000000..2268f8e126 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-killer.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# Pod-churn killer loop — runs from inside the CL2 docker container +# (invoked via Method: Exec from pod-churn-combined.yaml). +# +# Why this lives here instead of as an in-cluster Job: the in-cluster Job +# approach requires pulling a kubectl image (e.g. bitnami/kubectl) onto +# every AKS cluster, which needs AcrPull or a public-registry-friendly +# CSSC-compliant image — neither is currently configured in the +# clustermesh-scale tfvars. The CL2 container already has the kubeconfig +# mounted at /root/.kube/config and (per Telescope's +# job_controller/config/ray/config.yaml precedent) supports `Method: Exec` +# with `bash`. We run kubectl from here against the same kubeconfig CL2 +# uses — no extra image pull, no extra RBAC. Plan 4a runs this against +# one cluster per per-cluster CL2 instance (execute-parallel handles +# fan-out). +# +# Positional args (passed via Method: Exec command list): +# $1 KILL_DURATION_SECONDS Total runtime in seconds. +# $2 KILL_INTERVAL_SECONDS Seconds between successive kill rounds. +# $3 KILL_BATCH Pods deleted per round. +# $4 WORKLOAD_GROUP Label-selector group value. +# +# Exits 0 on successful completion of the time-bounded loop. Exits 127 +# if kubectl is unavailable in this CL2 image (Method: Exec marks the +# measurement failed; the surrounding combined.yaml still completes the +# settle + gather steps so scale-phase data is preserved). + +set -u +set -o pipefail + +KILL_DURATION_SECONDS="${1:-600}" +KILL_INTERVAL_SECONDS="${2:-10}" +KILL_BATCH="${3:-5}" +WORKLOAD_GROUP="${4:-clustermesh-pod-churn}" +LABEL_SELECTOR="group=${WORKLOAD_GROUP}" + +if ! command -v kubectl >/dev/null 2>&1; then + # Fallback: the pipeline's execute.yml pre-stages kubectl into the + # cl2_config_dir (which is bind-mounted at /root/perf-tests/clusterloader2/config + # by run_cl2_command). If neither PATH kubectl nor the pre-staged binary + # is available, fail with a clear diagnostic. 
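+    # (Resolution order sketch, assuming the bind-mount layout described
+    # in the header: a PATH hit wins outright; otherwise the pre-staged
+    # binary's directory is prepended to PATH so every later bare
+    # `kubectl` call in this script resolves to it; otherwise exit 127.)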
+ PREBAKED_KUBECTL=/root/perf-tests/clusterloader2/config/kubectl + if [ -x "${PREBAKED_KUBECTL}" ]; then + KUBECTL_BIN_DIR="$(dirname "${PREBAKED_KUBECTL}")" + export PATH="${KUBECTL_BIN_DIR}:${PATH}" + echo "killer: using pre-staged kubectl at ${PREBAKED_KUBECTL}" + else + echo "killer ERROR: kubectl not in PATH inside CL2 container; "\ + "pre-staged binary at ${PREBAKED_KUBECTL} is also missing — "\ + "verify execute.yml pre-stage step ran successfully" + echo "killer ERROR: PATH=$PATH" + exit 127 + fi +fi + +KUBECTL_CLIENT_INFO="$(kubectl version --client=true --output=yaml 2>&1 | head -3 || true)" +echo "killer: kubectl client info:" +echo "${KUBECTL_CLIENT_INFO}" +echo "killer: starting (duration=${KILL_DURATION_SECONDS}s interval=${KILL_INTERVAL_SECONDS}s batch=${KILL_BATCH} selector=${LABEL_SELECTOR})" + +# shuf is GNU coreutils; not guaranteed in every image base. Fall back to +# awk-with-srand when missing — awk is part of POSIX and always available. +HAS_SHUF=0 +if command -v shuf >/dev/null 2>&1; then + HAS_SHUF=1 +fi + +random_pick() { + # Reads "ns/name" lines on stdin, prints up to $1 random lines. + local n="$1" + if [ "${HAS_SHUF}" -eq 1 ]; then + shuf | head -n "$n" + else + awk -v n="$n" 'BEGIN{srand()} {print rand()" "$0}' | sort -k1,1n | head -n "$n" | cut -d" " -f2- + fi +} + +END_EPOCH=$(( $(date +%s) + KILL_DURATION_SECONDS )) +ROUND=0 +KILLED_TOTAL=0 + +while [ "$(date +%s)" -lt "${END_EPOCH}" ]; do + ROUND=$((ROUND + 1)) + + CANDIDATES="$(kubectl get pods -A -l "${LABEL_SELECTOR}" \ + -o 'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' 2>/dev/null || true)" + + if [ -z "${CANDIDATES}" ]; then + echo "killer: round=${ROUND} no candidates matched selector ${LABEL_SELECTOR}" + else + TARGETS="$(printf '%s\n' "${CANDIDATES}" | random_pick "${KILL_BATCH}")" + ROUND_KILLED=0 + while IFS= read -r nsname; do + [ -z "${nsname}" ] && continue + ns="${nsname%%/*}" + name="${nsname##*/}" + # --grace-period=0 + --force: immediate evict, no graceful shutdown + # wait. Simulates a "node failure"-style event for the pod-event + # propagation path. --ignore-not-found tolerates the inherent race + # where ReplicaSet has not yet replaced previous round's kills. + if kubectl delete pod -n "${ns}" "${name}" \ + --grace-period=0 --force --ignore-not-found \ + > /dev/null 2>&1; then + ROUND_KILLED=$((ROUND_KILLED + 1)) + fi + done <<< "${TARGETS}" + KILLED_TOTAL=$((KILLED_TOTAL + ROUND_KILLED)) + echo "killer: round=${ROUND} killed=${ROUND_KILLED} cumulative=${KILLED_TOTAL}" + fi + + # Don't sleep past the deadline. + NOW="$(date +%s)" + REMAINING=$(( END_EPOCH - NOW )) + if [ "${REMAINING}" -le 0 ]; then + break + fi + SLEEP="${KILL_INTERVAL_SECONDS}" + if [ "${REMAINING}" -lt "${SLEEP}" ]; then + SLEEP="${REMAINING}" + fi + sleep "${SLEEP}" +done + +echo "killer: done duration=${KILL_DURATION_SECONDS}s rounds=${ROUND} cumulative=${KILLED_TOTAL}" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-scale.yaml b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-scale.yaml new file mode 100644 index 0000000000..de791616b8 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-scale.yaml @@ -0,0 +1,284 @@ +name: clustermesh-pod-churn-scale + +# Scale scenario #2 (Pod Churn Stress Test) — deterministic scale-cycle variant. +# +# Spec (scale testing.txt line 55-67): "Validate stability under high pod churn. +# Repeatedly scale deployments up/down. 
Track propagation latency, missed or +# delayed updates, CPU/memory growth over time." +# +# This scenario cycles each Deployment's .spec.replicas between $replicasPerDeployment +# and 0 for $churnCycles iterations, holding each end-state for $churnUpDuration / +# $churnDownDuration respectively. The cycle drives a steady-state stream of pod +# create/delete events without churning Deployment or Service objects (those stay +# present across all cycles), isolating the pod-event signal. +# +# Sequence: +# 1. Start measurements (control-plane, cilium, clustermesh-metrics, +# clustermesh-throughput, etcd-metrics, pod-churn-stress). +# 2. Deploy PodMonitor (clustermesh.yaml). +# 3. Initial workload apply at full replicas + WaitForControlledPodsRunning gate +# (proves the workload settled before churn begins). +# 4. Churn loop ($churnCycles iterations): +# a. Scale-down to replicas=0 (no wait — let it churn freely). +# b. Sleep $churnDownDuration. +# c. Scale-up to replicas=$replicasPerDeployment. +# d. Sleep $churnUpDuration. +# 5. Final scale-up (idempotent — guarantees known terminal state) + final +# WaitForControlledPodsRunning.gather for convergence. +# 6. Settle sleep ($holdDuration) — lets kvstore queues drain and slope queries +# observe the post-churn settle. +# 7. Gather measurements (mirror start order). +# 8. Teardown (delete workload + PodMonitor). + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} +{{$churnCycles := DefaultParam .CL2_CHURN_CYCLES 5}} +{{$churnUpDuration := DefaultParam .CL2_CHURN_UP_DURATION "60s"}} +{{$churnDownDuration := DefaultParam .CL2_CHURN_DOWN_DURATION "60s"}} + +{{$group := "clustermesh-pod-churn-scale"}} +{{$basename := "pcs"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-pcs + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics are structurally 0. See plan.md + # note #14 + ACNS team confirmation 2026-05-11. 
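+  # Per-namespace effect of the step below (sketch — prefix and annotation
+  # key as used in this file; --overwrite keeps re-runs idempotent):
+  #   kubectl annotate namespace clustermesh-pcs-1 \
+  #     clustermesh.cilium.io/global=true --overwrite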
+  - name: Annotate workload namespaces for ACNS cross-cluster sync
+    measurements:
+      - Identifier: AnnotateNamespacesForGlobalSync
+        Method: Exec
+        Params:
+          streamOutput: true
+          timeout: 1m
+          command:
+            - bash
+            - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh
+            - "{{$namespaces}}"
+            - "clustermesh-pcs"
+
+  # ----- Start measurements -----
+  - module:
+      path: /modules/measurements/control-plane.yaml
+      params:
+        action: start
+        group: {{$group}}
+
+  - module:
+      path: /modules/measurements/cilium.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/clustermesh-metrics.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/clustermesh-throughput.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/etcd-metrics.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/pod-churn-stress.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/clustermesh.yaml
+      params:
+        actionName: create
+        tuningSet: DeploymentCreateQps
+
+  # ----- Initial workload create + settle -----
+  # Two watchers, each registered exactly once: this one gates the initial
+  # create (gathered as soon as the full-replica apply settles), and a
+  # second, registered just before the churn loop, is gathered once after
+  # the loop ends. No per-cycle WaitForControlledPodsRunning — per-cycle
+  # waits would block each cycle until pods settled, defeating the
+  # "rapid churn" intent of scenario #2.
+  - name: Start tracking pod-churn-scale Deployments
+    measurements:
+      - Identifier: WaitForControlledPodsRunning-pod-churn-scale
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: start
+          apiVersion: apps/v1
+          kind: Deployment
+          checkIfPodsAreUpdated: true
+          labelSelector: group = {{$group}}
+          operationTimeout: {{$operationTimeout}}
+
+  - module:
+      path: /modules/pod-churn-workload.yaml
+      params:
+        actionName: apply
+        replicas: {{$replicasPerDeployment}}
+        namespaces: {{$namespaces}}
+        deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+        tuningSet: DeploymentCreateQps
+        group: {{$group}}
+        basename: {{$basename}}
+
+  - name: Wait for initial pod-churn pods to be Running
+    measurements:
+      - Identifier: WaitForControlledPodsRunning-pod-churn-scale
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: gather
+
+  # ----- Warmup before churn -----
+  - name: Warmup before churn
+    measurements:
+      - Identifier: WarmupSleep
+        Method: Sleep
+        Params:
+          duration: {{$warmupDuration}}
+
+  # ----- Re-register a fresh watcher for the churn window so the final gather
+  #       only reflects the churn loop's outcome, not the initial create. -----
+  - name: Start tracking pod-churn loop
+    measurements:
+      - Identifier: WaitForControlledPodsRunning-pod-churn-loop
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: start
+          apiVersion: apps/v1
+          kind: Deployment
+          checkIfPodsAreUpdated: true
+          labelSelector: group = {{$group}}
+          operationTimeout: {{$operationTimeout}}
+
+  # ----- Churn loop -----
+  # CL2's `Loop $N` template func yields 0..N-1; we emit $churnCycles pairs of
+  # scale-down → sleep → scale-up → sleep. No per-cycle WaitForControlledPodsRunning:
+  # we WANT the system in flux during this window so the measurements observe
+  # sustained churn rather than per-cycle settle-and-spike.
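+  # Worked window, assuming this file's defaults (churnCycles=5,
+  # churnUpDuration=churnDownDuration=60s): the loop below emits 5
+  # down/up pairs ≈ 5 × (60s + 60s) = 600s of sustained flux, plus the
+  # apply time of each scale call at DeploymentCreateQps.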
+ {{range $i := Loop $churnCycles}} + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Pod-churn cycle {{$i}} — down hold + measurements: + - Identifier: ChurnCycleDownSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnDownDuration}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Pod-churn cycle {{$i}} — up hold + measurements: + - Identifier: ChurnCycleUpSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnUpDuration}} + {{end}} + + # ----- Final convergence: end the churn window at a known terminal state. ----- + - name: Wait for post-churn pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-loop + Method: WaitForControlledPodsRunning + Params: + action: gather + + # ----- Settle: let kvstore queues drain post-churn ----- + - name: Settle after churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown: drop Deployments + Services. ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/upper-bound.yaml b/modules/python/clusterloader2/clustermesh-scale/config/upper-bound.yaml new file mode 100644 index 0000000000..3d7fa9e4d5 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/upper-bound.yaml @@ -0,0 +1,329 @@ +name: clustermesh-upper-bound + +# Scale scenario #6: Upper Bound / Saturation Testing. +# +# Goal (scale testing.txt line 103-114): Find system limits safely. +# - Increasing clusters → covered by the matrix (n2/n5/n10/n20 +# entries each run this same CL2 config). +# - Increasing events per → covered IN-RUN by ramping through N +# cluster "rungs" of progressively heavier load. +# - Record failure modes, → scale.py collect's saturation classifier +# not just thresholds tags each rung with the dominant signal +# ({clean, latency_spike, queue_unbounded, +# cpu_exhaust, mesh_failure_burst, +# etcd_tail}). See _emit_saturation_profile_rows. 
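+#   (Scale of the output, a sketch assuming the nN matrix entries denote
+#   cluster counts: with the default 5-rung ramp, an n5 entry yields
+#   5 clusters × 5 rungs = 25 per-rung verdict rows per run.)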
+#
+# Per-rung structure (single CL2 invocation per cluster runs the full
+# ramp; bounded sweep, not adaptive stress-to-fail — see the rubber-duck
+# review notes in plan.md's Scenario #6 section):
+#
+# For rung r in 0..N-1:
+#   1. Start measurements with suffix=Rung<r> (per-rung time window via
+#      CL2's %v placeholder; suffix namespaces the emitted JSONs so the
+#      Python collector can read them per-rung).
+#   2. Restart-burst the workload at TuningSet qps = qps_list[r], doing
+#      restarts_list[r] consecutive restart cycles. Each restart bumps a
+#      Deployment pod-template annotation, which triggers a rolling
+#      recreate of every replica → forces a flurry of endpoint/identity
+#      events through clustermesh-apiserver.
+#   3. Sleep rung_duration so the measurement window covers the burst
+#      AND the steady-state right after. CL2's gather queries (action:
+#      gather) substitute %v with the wall time since the matching
+#      action: start — so a longer rung_duration captures more of the
+#      post-burst tail.
+#   4. Gather measurements with suffix=Rung<r>.
+#   5. Sleep settle_duration before the next rung. The settle window is
+#      sized so kvstore queues from rung r drain before rung r+1 starts.
+#
+# After all rungs, delete the workload + PodMonitor.
+#
+# IMPORTANT design notes (don't change without re-reading rubber-duck
+# critique notes in plan.md):
+# - Single CL2 invocation per cluster, NOT N separate invocations. Keeps
+#   one Prometheus time-axis consistent across rungs; cross-rung
+#   comparison is cleaner; avoids 5× the workload-create-teardown cost.
+# - QPS alone doesn't drive kvstore events 1:1 — each rung also bumps
+#   `restartsPerRung` so cumulative events scale with rung index even
+#   when QPS saturates CL2's Deployment-apply rate. Both dials are
+#   driven by the matrix vars.
+# - The classifier verdict is computed at collect time from the per-rung
+#   measurement JSONs, NOT inside CL2. Raw signal values + thresholds +
+#   classifier_version are emitted alongside verdicts so dashboards can
+#   recompute verdicts post-hoc if thresholds need calibration.
+# - NOT share-infra-eligible in v1 — a tripped rung can leave queue/memory
+#   residue that would contaminate following scenarios. Standalone matrix
+#   entries only until baseline data justifies share-infra positioning.
+# - CL2's template engine has its OWN func map (see kubernetes/perf-tests
+#   clusterloader2/pkg/config/template_functions.go); sprig is NOT
+#   available. Use StringSplit, Loop, AddInt, MultiplyInt, SubtractInt,
+#   index, len. atoi is implicit — arithmetic funcs accept string args
+#   and parse via toFloat64.
+
+{{$namespaces := DefaultParam .CL2_NAMESPACES 5}}
+{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}}
+{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}}
+{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}}
+
+# Saturation knobs. SaturationQpsList is a comma-separated list of QPS
+# values, one per rung. SaturationRestartsList is the per-rung restart
+# count (length must match SaturationQpsList) — driven separately so
+# dashboards can distinguish "QPS axis" from "workload-amplitude axis".
+# Each rung lasts SaturationRungDurationSeconds + SaturationSettleSeconds.
+#
+# Defaults match scale.py's defaults so a forgotten matrix var falls
+# through to a 5-rung sweep at 100/500/1500/4000/10000 QPS with
+# 2/4/8/15/25 restarts per rung (5 rungs × (240s hold + 90s settle)
+# ≈ 28 min CL2 wall time per cluster).
Bumped 2026-05-15 after build +# 67224 showed all signals at 1-15% of thresholds at the prior 4-rung +# 20/40/80/160 sweep — actual saturation knee lies higher. +{{$saturationQpsListStr := DefaultParam .CL2_SATURATION_QPS_LIST "100,500,1500,4000,10000"}} +{{$saturationRestartsListStr := DefaultParam .CL2_SATURATION_RESTARTS_LIST "2,4,8,15,25"}} +{{$saturationRungDurationSeconds := DefaultParam .CL2_SATURATION_RUNG_DURATION_SECONDS 240}} +{{$saturationSettleSeconds := DefaultParam .CL2_SATURATION_SETTLE_SECONDS 90}} + +# Parse comma-separated strings into Go []string slices. StringSplit is +# CL2's built-in. The arithmetic funcs (AddInt, MultiplyInt, etc.) accept +# string args and parse them via toFloat64, so we can pass slice elements +# directly without an atoi step. +{{$qpsList := StringSplit $saturationQpsListStr}} +{{$restartsList := StringSplit $saturationRestartsListStr}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ub + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + # Initial workload-create QPS is fixed at the first rung's QPS — every + # cluster brings the baseline workload up at the gentle rung-0 rate so + # the create-flurry doesn't itself trip saturation before the ramp + # starts. Saturation rungs use their own per-rung TuningSets defined + # below. + - name: WorkloadCreateQps + qpsLoad: + qps: {{index $qpsList 0}} + # One TuningSet per rung. CL2 template ranges over $qpsList and emits + # Rung0Qps, Rung1Qps, ... TuningSets that the workload module references + # by name via the matching $tuningSet param below. + {{range $i, $qps := $qpsList}} + - name: Rung{{$i}}Qps + qpsLoad: + qps: {{$qps}} + {{end}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # Identical to event-throughput.yaml — required for cross-cluster sync + # to fire at all. See plan.md note #14. + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ub" + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: WorkloadCreateQps + + # ----- Baseline workload create ----- + # Done OUTSIDE the rung loop so the create cost (which depends on + # cluster cold-start, image pulls, scheduling) isn't conflated with + # rung-0's restart-burst signal. After create, every rung exercises + # the same population of Deployments via restart bursts. + - module: + path: /modules/event-throughput-workload.yaml + params: + actionName: create + generation: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: WorkloadCreateQps + operationTimeout: {{$operationTimeout}} + phaseSuffix: Create + + # 30s pre-rung settle: lets the create-flurry's residual kvstore traffic + # drain before rung 0 starts measuring. Without this, rung 0's baseline + # carries spillover from the create burst and looks artificially loaded. 
+ - name: Pre-rung settle (drain create-flurry) + measurements: + - Identifier: PreRungSettle + Method: Sleep + Params: + duration: 30s + + # ----- Saturation rung loop ----- + # Each rung: start measurements with Rung suffix → restart-burst the + # workload restartsList[i] times at qpsList[i] QPS → sleep rung duration + # so the gather window captures both burst and tail → gather measurements + # → settle before next rung. + # + # Restart generations are offset per rung by 1000*(rung+1) so the + # pod-template annotation values are strictly monotonic across rungs + # (avoids a rollout being skipped because the same generation was used + # in a prior rung). + {{range $i, $qps := $qpsList}} + + # ===== Rung {{$i}} (qps={{$qps}}, restarts={{index $restartsList $i}}) ===== + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: clustermesh-upper-bound-rung{{$i}} + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + suffix: Rung{{$i}} + + # Rung {{$i}} workload: restart-burst the population N times. Each + # restart bumps the pod-template annotation to a unique generation so + # the rolling-recreate fires. Generation = 1000*(rung+1) + r so cross- + # rung values never collide. + {{range $r := Loop (index $restartsList $i)}} + - module: + path: /modules/event-throughput-workload.yaml + params: + actionName: restart + generation: {{AddInt (MultiplyInt 1000 (AddInt $i 1)) $r}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: Rung{{$i}}Qps + operationTimeout: {{$operationTimeout}} + phaseSuffix: Rung{{$i}}Restart{{$r}} + {{end}} + + # Rung-{{$i}} hold: keep the measurement window open after the burst so + # the gather queries capture peak + tail. CL2's %v in queries resolves + # to the wall time since the matching `start`, so this Sleep determines + # the measurement window width for rung {{$i}}. + - name: Rung {{$i}} hold (qps={{$qps}}, restarts={{index $restartsList $i}}) + measurements: + - Identifier: SaturationRung{{$i}}Hold + Method: Sleep + Params: + duration: {{$saturationRungDurationSeconds}}s + + # Gather rung-{{$i}} measurements. The suffix=Rung{{$i}} param threads + # through every GenericPrometheusQuery's Identifier and metricName so + # the emitted JSONs are uniquely named per rung. scale.py collect reads + # them back by matching the Rung suffix. 
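+  # (Example emitted filename for this rung — metric and test name from
+  # this config, timestamp illustrative:
+  #   "GenericPrometheusQuery ClusterMesh Kvstore Sync Queue Size Rung{{$i}}_clustermesh-upper-bound_<timestamp>.json")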
+  - module:
+      path: /modules/measurements/control-plane.yaml
+      params:
+        action: gather
+        group: clustermesh-upper-bound-rung{{$i}}
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/cilium.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/clustermesh-metrics.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/clustermesh-throughput.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/etcd-metrics.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/pod-churn-stress.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  # Inter-rung settle: drain kvstore queues from rung {{$i}} before the
+  # next rung starts. Without this, the next rung's baseline carries
+  # rung-{{$i}}'s spillover. The default 90s is enough at low rungs; at
+  # the highest rungs the spillover may exceed the settle window and the
+  # next rung's verdict will be biased "worse" — that's fine, it captures
+  # cumulative system stress correctly.
+  - name: Rung {{$i}} settle
+    measurements:
+      - Identifier: SaturationRung{{$i}}Settle
+        Method: Sleep
+        Params:
+          duration: {{$saturationSettleSeconds}}s
+
+  {{end}}
+  # ----- end of rung loop -----
+
+  # ----- Workload + PodMonitor teardown -----
+  # Use a generation strictly greater than any rung's max generation
+  # (1000 * (max_rung+1) + max_restart_in_that_rung) so the delete-time
+  # pod-template doesn't accidentally match a prior rung's template
+  # and skip the rolling cleanup. With the defaults (5 rungs, 25 restarts
+  # in the last rung) the max rung generation = 1000*5 + 24 = 5024; we
+  # use 999999 which is well above any plausible matrix-configured value.
+  - module:
+      path: /modules/event-throughput-workload.yaml
+      params:
+        actionName: delete
+        generation: 999999
+        namespaces: {{$namespaces}}
+        deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+        replicasPerDeployment: {{$replicasPerDeployment}}
+        tuningSet: WorkloadCreateQps
+        operationTimeout: {{$operationTimeout}}
+        phaseSuffix: Delete
+
+  - module:
+      path: /modules/clustermesh.yaml
+      params:
+        actionName: delete
+        tuningSet: WorkloadCreateQps
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/write-ready-sentinel.sh b/modules/python/clusterloader2/clustermesh-scale/config/write-ready-sentinel.sh
new file mode 100755
index 0000000000..a020aad9d6
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/write-ready-sentinel.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# CL2 ready-sentinel writer for Scenario #3 (Node Churn / IP Churn).
+#
+# Why a separate script and not inline `bash -c` in the CL2 yaml:
+# The first iteration used `command: [bash, -c, |]` in the CL2
+# Method:Exec block, with `CTX=$(kubectl config current-context)`. Build
+# 67114 showed `kubectl config current-context` returning EMPTY in the CL2
+# docker image's environment (verified by `Exec command output: wrote
+# sentinel ready-` — context suffix was empty). Both clusters then wrote
+# the SAME path (sentinels/ready-) and one overwrote the other → barrier
+# saw 1/2 sentinels → quorum never reached → scenario aborted.
+#
+# This script is mounted into the CL2 container at
+# /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh by virtue
+# of being a sibling of pod-churn-killer.sh / annotate-namespaces.sh /
+# apiserver-failure-killer.sh (the CL2_CONFIG_DIR bind-mount). Same
+# pattern, proven across scenarios #2/#4/#5/#7.
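+# (Resulting sentinel name, as a sketch: a cluster whose kubeconfig
+# context resolves to "clustermesh-1" writes <SENTINEL_DIR>/ready-clustermesh-1,
+# and the barrier counts distinct ready-* files toward quorum.)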
+# +# Context-name resolution (multi-fallback for robustness): +# 1. Parse `current-context:` from /root/.kube/config directly (the +# file is bind-mounted by run_cl2_command from the host's per-cluster +# kubeconfig). YAML-safe grep + awk; no kubectl dependency. +# 2. `kubectl config current-context` via PATH kubectl. +# 3. Pre-staged kubectl at /root/perf-tests/clusterloader2/config/kubectl. +# 4. Hash of the kubeconfig server URL — guaranteed unique across +# clusters in this mesh (different AKS APIServer URLs). +# 5. Hostname of the pod (CL2 pods get pod-name-suffixed). Last resort. +# +# All diagnostic output goes to STDERR so CL2 streamOutput captures it for +# postmortem. STDOUT only emits the final sentinel path. +# +# Positional args: +# $1 SENTINEL_DIR (required) absolute path; sentinel file lands here + +set -uo pipefail + +SENTINEL_DIR="${1:?sentinel dir required}" +mkdir -p "$SENTINEL_DIR" + +KUBECONFIG_PATH="${KUBECONFIG:-/root/.kube/config}" +PRE_STAGED_KUBECTL="/root/perf-tests/clusterloader2/config/kubectl" + +dbg() { + # Diagnostic logging to stderr — captured by CL2 streamOutput. + echo "write-ready-sentinel: $*" >&2 +} + +CTX="" +RESOLVED_BY="" + +# Method 1: parse kubeconfig directly. +if [ -f "$KUBECONFIG_PATH" ]; then + CTX=$(grep -E '^current-context:' "$KUBECONFIG_PATH" 2>/dev/null \ + | head -1 | awk '{print $2}' | tr -d '"' | tr -d "'" || echo "") + if [ -n "$CTX" ]; then + RESOLVED_BY="kubeconfig-parse" + fi +fi + +# Method 2: PATH kubectl. +if [ -z "$CTX" ] && command -v kubectl >/dev/null 2>&1; then + CTX=$(kubectl config current-context 2>/dev/null || echo "") + if [ -n "$CTX" ]; then + RESOLVED_BY="kubectl-PATH" + fi +fi + +# Method 3: pre-staged kubectl. +if [ -z "$CTX" ] && [ -x "$PRE_STAGED_KUBECTL" ]; then + CTX=$("$PRE_STAGED_KUBECTL" config current-context 2>/dev/null || echo "") + if [ -n "$CTX" ]; then + RESOLVED_BY="kubectl-prestaged" + fi +fi + +# Method 4: hash of server URL (deterministic per cluster; collision-safe +# across the mesh because every AKS has a unique FQDN). +if [ -z "$CTX" ] && [ -f "$KUBECONFIG_PATH" ]; then + _server=$(grep -E '^\s*server:' "$KUBECONFIG_PATH" 2>/dev/null | head -1 \ + | awk '{print $2}' || echo "") + if [ -n "$_server" ]; then + if command -v sha256sum >/dev/null 2>&1; then + _hash=$(echo -n "$_server" | sha256sum | cut -c1-8) + elif command -v md5sum >/dev/null 2>&1; then + _hash=$(echo -n "$_server" | md5sum | cut -c1-8) + else + _hash=$(echo -n "$_server" | od -A n -t x1 | tr -d ' \n' | cut -c1-8) + fi + CTX="srv-${_hash}" + RESOLVED_BY="server-hash" + fi +fi + +# Method 5: pod hostname (CL2 runs each cluster's CL2 in a separate +# docker container with a unique hostname). +if [ -z "$CTX" ]; then + CTX="$(hostname 2>/dev/null || echo "unknown-$$")" + RESOLVED_BY="hostname" +fi + +# DIAGNOSTIC DUMP — always print state so postmortem on quorum failure +# can identify why context was hard to resolve. 
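+# (Key line to grep for in postmortems, e.g.:
+#   "write-ready-sentinel: resolved context = 'clustermesh-1' via kubeconfig-parse")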
+dbg "===== CL2 ready-sentinel diagnostic =====" +dbg "resolved context = '${CTX}' via ${RESOLVED_BY}" +dbg "KUBECONFIG=${KUBECONFIG_PATH} exists=$( [ -f "$KUBECONFIG_PATH" ] && echo yes || echo no )" +if [ -f "$KUBECONFIG_PATH" ]; then + dbg "kubeconfig current-context line: $(grep -E '^current-context:' "$KUBECONFIG_PATH" | head -1 || echo '(none)')" + dbg "kubeconfig server line: $(grep -E '^\s*server:' "$KUBECONFIG_PATH" | head -1 || echo '(none)')" +fi +dbg "PATH=${PATH:-}" +dbg "PATH kubectl: $(command -v kubectl || echo '(none)')" +dbg "pre-staged kubectl exists+exec: $( [ -x "$PRE_STAGED_KUBECTL" ] && echo yes || echo no )" +dbg "hostname: $(hostname 2>/dev/null || echo '(none)')" +dbg "sentinel dir: ${SENTINEL_DIR}" +dbg "================================================" + +# Guard: empty context after every fallback would still cause a path +# collision. Emit a unique fallback name using $$ (PID, unique-per-process). +if [ -z "$CTX" ]; then + CTX="unresolved-$$" + dbg "ERROR: every fallback returned empty; using ${CTX}" +fi + +SENTINEL_FILE="${SENTINEL_DIR}/ready-${CTX}" +touch "$SENTINEL_FILE" +dbg "wrote sentinel ${SENTINEL_FILE}" +echo "$SENTINEL_FILE" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/scale.py b/modules/python/clusterloader2/clustermesh-scale/scale.py index 35047f122a..56c623083d 100644 --- a/modules/python/clusterloader2/clustermesh-scale/scale.py +++ b/modules/python/clusterloader2/clustermesh-scale/scale.py @@ -1,11 +1,17 @@ """ ClusterMesh scale-test harness. -Single-cluster invocation. The Telescope pipeline fans out by calling this -script once per fleet member (driven by `az fleet clustermeshprofile list-members` -in steps/topology/clustermesh-scale/execute-clusterloader2.yml). Each invocation -emits one JSONL with a `cluster` attribution column so concatenated results from -N clusters are queryable per-cluster downstream. +Per-cluster execute (`scale.py execute`) is single-cluster: it spawns one +ClusterLoader2 docker container against one kubeconfig. The Telescope pipeline +fans out across N clusters; each per-cluster invocation emits one JSONL with a +`cluster` attribution column so concatenated results from N clusters are +queryable per-cluster downstream. + +Multi-cluster fan-out (`scale.py execute-parallel`, Phase 3) bounds parallel +CL2 invocations across the mesh — see `execute_parallel` below for the worker +model. Each parallel worker shells out to `run-cl2-on-cluster.sh` so the +existing per-iteration bash semantics (CL2 run + junit gate + log capture + +failure diag) are preserved exactly per cluster. Phase 1 is intentionally trivial: deploy a small fixed number of pods, no churn, no fortio, no network policies. The goal of Phase 1 is to prove the multi-cluster @@ -15,19 +21,91 @@ parameters to configure/collect. """ import argparse +import concurrent.futures import json import os +import signal +import subprocess +import sys +import tempfile +import threading from datetime import datetime, timezone from clusterloader2.utils import parse_xml_to_json, run_cl2_command, process_cl2_reports +# Phase 4b — Scenario #6 (Upper Bound / Saturation) classifier constants. +# Versioned so downstream Kusto dashboards can compare verdicts across +# tuning iterations. Raw signal values + thresholds are emitted alongside +# the verdict so dashboards can recompute verdicts post-hoc without re- +# running the test if thresholds need calibration. 
+#
+# Thresholds rationale (v1 — first-smoke calibration; revisit after first
+# n=2 green):
+#   latency_p99_ms          — 500ms p99 of cilium_kvstoremesh_kvstore_
+#                             operations_duration. Healthy AKS-managed
+#                             Cilium runs show p99 < 100ms; 5× that is
+#                             the saturation knee.
+#   queue_size_perc99       — 1000 in cilium_kvstoremesh_kvstore_sync_
+#                             queue_size. Steady-state on green pod-churn
+#                             runs is single digits; 3 orders of magnitude
+#                             above noise floor is unambiguously bad.
+#   apiserver_max_cpu_cores — 1.5 cores per clustermesh-apiserver pod
+#                             (ClusterMeshApiserverPodCPU PerPodMax).
+#                             AKS-managed Cilium typically requests
+#                             0.5-1.0 vCPU; saturated >2× allocation = at
+#                             risk of throttling.
+#   mesh_failure_rate_max   — 0.5 reconnect-failures/s. Plan.md deferred
+#                             decision #6 documents the green-run
+#                             baseline of 4-6 reconnects per 36 min run
+#                             ≈ 0.003/s (uniformly distributed across
+#                             peers, benign Fleet churn). 0.5/s = ~150×
+#                             that baseline → real failure burst.
+#   etcd_commit_p99_ms      — 200ms p99 of etcd_debugging_disk_backend_
+#                             commit_write_duration. Etcd's design target
+#                             is single-digit ms; 200ms = backed-up disk
+#                             subsystem.
+SATURATION_CLASSIFIER_VERSION = "saturation-v1"
+SATURATION_THRESHOLDS = {
+    "latency_p99_ms": 500.0,
+    "queue_size_perc99": 1000.0,
+    "apiserver_max_cpu_cores": 1.5,
+    "mesh_failure_rate_max": 0.5,
+    "etcd_commit_p99_ms": 200.0,
+}
+
+
 def configure_clusterloader2(
     namespaces,
     deployments_per_namespace,
     replicas_per_deployment,
     operation_timeout,
     override_file,
+    churn_cycles=5,
+    churn_up_duration="60s",
+    churn_down_duration="60s",
+    kill_duration="10m",
+    kill_interval_seconds=10,
+    kill_batch=5,
+    kill_duration_seconds=600,
+    kill_job_deadline_seconds=660,
+    apiserver_kill_target_context="clustermesh-1",
+    apiserver_kill_recovery_timeout_seconds=240,
+    apiserver_kill_observation_seconds=60,
+    ha_config_replicas=3,
+    node_churn_target_context="clustermesh-1",
+    node_churn_cycles=3,
+    node_churn_delta=5,
+    node_churn_settle_seconds=60,
+    node_churn_scale_duration_seconds=1800,
+    node_churn_replace_duration_seconds=1500,
+    node_churn_combined_duration_seconds=3300,
+    node_replace_batch_size=10,
+    node_churn_ready_timeout_seconds=300,
+    saturation_qps_list="100,500,1500,4000,10000",
+    saturation_restarts_list="2,4,8,15,25",
+    saturation_rung_duration_seconds=240,
+    saturation_settle_seconds=90,
 ):
     with open(override_file, "w", encoding="utf-8") as f:
         # Prometheus stack — keep the Cilium-scrape flags ON so the
         # IS honored as an overrides key and must be >= the request to satisfy
         # k8s admission.
         f.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n")
-        f.write("CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi\n")
+        # Prometheus memory limit. Bumped 2Gi→4Gi 2026-05-15 after build
+        # 67224 showed prometheus-k8s-0 in CrashLoopBackOff on saturation
+        # runs. Then bumped 4Gi→12Gi 2026-05-15 after build 67279
+        # showed Prom STILL OOM'ing at Rung 2 even with 4Gi when the
+        # restart-burst workload pushed too many series/samples.
+        # D8ds_v4 prompool has 32GB RAM so 12Gi is safe with headroom.
+        # CL2_PROMETHEUS_MEMORY_LIMIT is honored as a CL2 overrides key
+        # (unlike the *_FACTOR knobs which are silently broken — see
+        # plan.md "What we built" item 16).
+        f.write("CL2_PROMETHEUS_MEMORY_LIMIT: 12Gi\n")
+        # Pin Prometheus to the dedicated `prompool` node (label
+        # prometheus=true is set in azure-2.tfvars extra_node_pool).
Without # this, prometheus-k8s lands on the default workload pool and @@ -62,6 +149,65 @@ def configure_clusterloader2( f.write(f"CL2_REPLICAS_PER_DEPLOYMENT: {replicas_per_deployment}\n") f.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n") + # Phase 4a — Scenario #2 (Pod Churn Stress) knobs. + # Written unconditionally with defaults so an event-throughput run + # (which doesn't reference these CL2_* params in its template) + # silently ignores them. CL2 does not fail on unknown overrides + # keys, so the cost is a few lines of YAML noise per non-churn run. + # The alternative — splitting configure into per-scenario + # subcommands — would proliferate harness surface area; see + # plan.md Phase 4a notes. + f.write(f"CL2_CHURN_CYCLES: {churn_cycles}\n") + f.write(f"CL2_CHURN_UP_DURATION: {churn_up_duration}\n") + f.write(f"CL2_CHURN_DOWN_DURATION: {churn_down_duration}\n") + f.write(f"CL2_KILL_DURATION: {kill_duration}\n") + f.write(f"CL2_KILL_INTERVAL_SECONDS: {kill_interval_seconds}\n") + f.write(f"CL2_KILL_BATCH: {kill_batch}\n") + f.write(f"CL2_KILL_DURATION_SECONDS: {kill_duration_seconds}\n") + f.write(f"CL2_KILL_JOB_DEADLINE_SECONDS: {kill_job_deadline_seconds}\n") + + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + # Same unconditional-write pattern as the pod-churn knobs above: + # CL2 templates that don't reference these silently ignore. Allows + # share-infra runs where multiple scenarios share one overrides.yaml. + f.write(f"CL2_APISERVER_KILL_TARGET_CONTEXT: {apiserver_kill_target_context}\n") + f.write(f"CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: {apiserver_kill_recovery_timeout_seconds}\n") + f.write(f"CL2_APISERVER_KILL_OBSERVATION_SECONDS: {apiserver_kill_observation_seconds}\n") + + # Phase 4b — Scenario #7 (HA Configuration Validation) knob. + # Single replicas-count override consumed by ha-config.yaml. Other + # scenarios' CL2 configs don't reference it; ignored silently. + f.write(f"CL2_HA_CONFIG_REPLICAS: {ha_config_replicas}\n") + + # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # node-churn-{scale,replace,combined}.yaml each consume a subset. + # node-churner.sh (driven from execute.yml, NOT Method:Exec — CL2 + # image has no az CLI) reads the same matrix vars directly; these + # overrides drive the CL2-side sleep/sentinel window that aligns + # with the churner's wall-clock run. + f.write(f"CL2_NODE_CHURN_TARGET_CONTEXT: {node_churn_target_context}\n") + f.write(f"CL2_NODE_CHURN_CYCLES: {node_churn_cycles}\n") + f.write(f"CL2_NODE_CHURN_DELTA: {node_churn_delta}\n") + f.write(f"CL2_NODE_CHURN_SETTLE_SECONDS: {node_churn_settle_seconds}\n") + f.write(f"CL2_NODE_CHURN_SCALE_DURATION_SECONDS: {node_churn_scale_duration_seconds}\n") + f.write(f"CL2_NODE_CHURN_REPLACE_DURATION_SECONDS: {node_churn_replace_duration_seconds}\n") + f.write(f"CL2_NODE_CHURN_COMBINED_DURATION_SECONDS: {node_churn_combined_duration_seconds}\n") + f.write(f"CL2_NODE_REPLACE_BATCH_SIZE: {node_replace_batch_size}\n") + f.write(f"CL2_NODE_CHURN_READY_TIMEOUT_SECONDS: {node_churn_ready_timeout_seconds}\n") + + # Phase 4b — Scenario #6 (Upper Bound / Saturation) knobs. + # upper-bound.yaml CL2 config consumes these to drive the per-rung + # QPS ramp + restart amplitude. Written unconditionally with the + # same defaulted-pattern as scenario #2-#5 knobs: non-saturation + # CL2 configs simply ignore them (CL2 doesn't fail on unknown + # overrides keys). 
The qps and restarts lists are written as + # comma-separated strings; upper-bound.yaml uses CL2's + # StringSplit template func to parse. + f.write(f"CL2_SATURATION_QPS_LIST: \"{saturation_qps_list}\"\n") + f.write(f"CL2_SATURATION_RESTARTS_LIST: \"{saturation_restarts_list}\"\n") + f.write(f"CL2_SATURATION_RUNG_DURATION_SECONDS: {saturation_rung_duration_seconds}\n") + f.write(f"CL2_SATURATION_SETTLE_SECONDS: {saturation_settle_seconds}\n") + with open(override_file, "r", encoding="utf-8") as f: print(f"Content of file {override_file}:\n{f.read()}") @@ -73,6 +219,7 @@ def execute_clusterloader2( cl2_config_file, kubeconfig, provider, + tear_down_prometheus=False, ): run_cl2_command( kubeconfig, @@ -83,7 +230,13 @@ def execute_clusterloader2( cl2_config_file=cl2_config_file, overrides=True, enable_prometheus=True, - tear_down_prometheus=False, + # Default False preserves the diagnostic-on-failure capability — when + # CL2 fails, run-cl2-on-cluster.sh's FAILURE DIAG block can dump + # prometheus-operator + prometheus-k8s pod logs. Set True in + # share-infra mode (multi-scenario per lifecycle) so each scenario's + # CL2 invocation gets a clean Prometheus deploy and the previous + # scenario's PodMonitor/scrape config doesn't bleed in. + tear_down_prometheus=tear_down_prometheus, scrape_kubelets=True, scrape_ksm=True, scrape_metrics_server=True, @@ -97,6 +250,228 @@ def execute_clusterloader2( ) +# Module-level lock + Popen tracking for execute_parallel. Lock keeps log lines +# atomic across worker threads; the Popen list lets a SIGINT/SIGTERM handler +# terminate live children on cancel (AzDO step cancel, Ctrl-C in dev). +_PARALLEL_STDOUT_LOCK = threading.Lock() +_PARALLEL_LIVE_POPENS = [] +_PARALLEL_LIVE_POPENS_LOCK = threading.Lock() + + +def _emit_prefixed_line(role, line): + # AzDO recognizes ##vso[...] service messages only when they appear at + # column 0 — prefixing them would drop the structured annotation. Emit + # those unprefixed; everything else gets the [role] tag for readability + # under interleaved output. + if line.startswith("##"): + out = line + else: + out = f"[{role}] {line}" + with _PARALLEL_STDOUT_LOCK: + sys.stdout.write(out) + sys.stdout.flush() + + +def _run_one_cluster(role, worker_script, worker_args, env=None): + """Spawn the per-cluster worker script and stream its merged stdout/stderr. + + Returns (role, exit_code). Exit code is the worker script's exit (which + is the authoritative pass/fail per cluster — the script does its own + junit gate + log capture + failure diag). + """ + cmd = ["bash", worker_script, role, *worker_args] + # bufsize=1 + text=True gives us line-buffered text reads so the prefix + # writer sees one CL2 log line at a time. PYTHONUNBUFFERED ensures the + # nested python3 scale.py execute child also flushes per-line. + child_env = os.environ.copy() + if env: + child_env.update(env) + child_env.setdefault("PYTHONUNBUFFERED", "1") + # Not using `with subprocess.Popen(...)` because the Popen handle is + # registered in _PARALLEL_LIVE_POPENS for the SIGINT/SIGTERM handler; + # `with` would close stdout at function exit and cancel signal-based + # termination semantics. The try/finally below handles cleanup. 
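+    # Resulting command shape (illustrative role/paths; args mirror the
+    # worker_args assembled in execute_parallel below):
+    #   bash run-cl2-on-cluster.sh clustermesh-1 <kubeconfig> <report_dir> \
+    #     <cl2_image> <cl2_config_dir> <cl2_config_file> <provider> ...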
+ proc = subprocess.Popen( # pylint: disable=consider-using-with + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=1, + text=True, + env=child_env, + ) + with _PARALLEL_LIVE_POPENS_LOCK: + _PARALLEL_LIVE_POPENS.append(proc) + try: + assert proc.stdout is not None + for line in proc.stdout: + _emit_prefixed_line(role, line) + proc.wait() + finally: + with _PARALLEL_LIVE_POPENS_LOCK: + try: + _PARALLEL_LIVE_POPENS.remove(proc) + except ValueError: + pass + return role, proc.returncode + + +def _install_parallel_signal_handlers(): + """Terminate live worker subprocesses on SIGINT/SIGTERM. + + AzDO step cancel sends SIGTERM. ThreadPoolExecutor will not reap child + processes spawned by its workers, and each worker bash script in turn + spawns `python3 scale.py execute` which spawns a docker container — so + abrupt parent death without explicit teardown can leave orphan docker + containers running. We best-effort terminate the bash workers; the docker + container behind them will exit when its parent python child exits. + """ + def _terminate_all(signum, _frame): + with _PARALLEL_STDOUT_LOCK: + sys.stdout.write( + f"[execute-parallel] received signal {signum}, " + "terminating live workers\n" + ) + sys.stdout.flush() + with _PARALLEL_LIVE_POPENS_LOCK: + for proc in list(_PARALLEL_LIVE_POPENS): + try: + proc.terminate() + except Exception: # pylint: disable=broad-except + pass + # Re-raise default behavior for the original signal so the parent + # exits with the conventional code (128+signum). This also unblocks + # any executor.shutdown(wait=True) waiters. + signal.signal(signum, signal.SIG_DFL) + os.kill(os.getpid(), signum) + + signal.signal(signal.SIGINT, _terminate_all) + signal.signal(signal.SIGTERM, _terminate_all) + + +def execute_parallel( + clusters_file, + max_concurrent, + worker_script, + cl2_image, + cl2_config_dir, + cl2_config_file, + cl2_report_dir_base, + provider, + python_script_file, + python_workdir, + tear_down_prometheus=False, +): + """Fan out CL2 across N clusters with bounded concurrency. + + Each cluster's CL2 + log capture + failure diag runs in its own bash + worker process (run-cl2-on-cluster.sh). At most `max_concurrent` run + in parallel. Per-cluster log capture happens IMMEDIATELY when that + cluster's CL2 finishes — before peer clusters complete — so kubectl + --tail windows and `kubectl get events` recency don't age out. + + The worker script's exit code is the authoritative per-cluster + pass/fail (it does its own junit gate). This function aggregates: + returns 0 iff every worker exited 0; otherwise 1. Matches the + sequential `if failures > 0; exit 1` semantics that execute.yml had + before parallelization, so the AzDO step's pass/fail signal is + unchanged from the user's perspective. + + `clusters_file` schema: a JSON array of objects with at least `role` + and `kubeconfig` fields. Extra fields (e.g. `name`, `rg`) are ignored + so the same JSON file produced by execute.yml's discovery step (which + also feeds collect.yml) can be reused without a separate write. + + Known concurrency risk: `run_cl2_command` mounts `~/.azure` rw into + every CL2 docker container (utils.py:69-70). At max_concurrent > 1 + those containers concurrently read/write the MSAL token cache. If + this causes auth flakes on real 5/10/20-cluster runs, isolate per + worker (TODO Phase 3 follow-up). 
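+    Example clusters_file payload (shape only; kubeconfig paths are
+    illustrative, extra fields such as `name`/`rg` are tolerated):
+        [{"role": "clustermesh-1", "kubeconfig": "/path/one"},
+         {"role": "clustermesh-2", "kubeconfig": "/path/two"}]
+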
+ """ + with open(clusters_file, "r", encoding="utf-8") as f: + clusters = json.load(f) + if not isinstance(clusters, list) or not clusters: + raise ValueError( + f"clusters file {clusters_file} must be a non-empty JSON array" + ) + + # Validate up front so we fail fast before spawning anything. + for idx, c in enumerate(clusters): + if "role" not in c or "kubeconfig" not in c: + raise ValueError( + f"clusters[{idx}] missing 'role' or 'kubeconfig': {c}" + ) + + if max_concurrent < 1: + raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}") + + _install_parallel_signal_handlers() + + print( + f"[execute-parallel] dispatching {len(clusters)} cluster(s) " + f"with max_concurrent={max_concurrent}", + flush=True, + ) + + results = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=max_concurrent + ) as executor: + futures = {} + for c in clusters: + role = c["role"] + kubeconfig = c["kubeconfig"] + report_dir = os.path.join(cl2_report_dir_base, role) + worker_args = [ + kubeconfig, + report_dir, + cl2_image, + cl2_config_dir, + cl2_config_file, + provider, + python_script_file, + python_workdir, + # Last positional: 1 = tear down Prometheus at end of CL2 (used + # by share-infra mode so the next scenario's CL2 deploys a + # fresh Prom); 0 = preserve Prom for failure-diagnostic dump. + "1" if tear_down_prometheus else "0", + ] + fut = executor.submit( + _run_one_cluster, role, worker_script, worker_args + ) + futures[fut] = role + + for fut in concurrent.futures.as_completed(futures): + role = futures[fut] + try: + _, exit_code = fut.result() + except Exception as e: # pylint: disable=broad-except + # Worker raised before producing an exit code (e.g. could not + # spawn bash). Treat as a failure for that cluster — surface + # the error and continue collecting peers. + print( + f"[execute-parallel] {role}: worker raised: {e}", + flush=True, + ) + results.append((role, 1)) + else: + results.append((role, exit_code)) + + failed = [r for r, code in results if code != 0] + succeeded = [r for r, code in results if code == 0] + print( + f"[execute-parallel] summary: {len(succeeded)} succeeded, " + f"{len(failed)} failed (max_concurrent={max_concurrent})", + flush=True, + ) + if failed: + print( + f"[execute-parallel] failed clusters: {', '.join(sorted(failed))}", + flush=True, + ) + return 1 + return 0 + + def collect_clusterloader2( cl2_report_dir, cloud_info, @@ -112,6 +487,14 @@ def collect_clusterloader2( deployments_per_namespace, replicas_per_deployment, trigger_reason="", + churn_cycles=0, + churn_up_duration="", + churn_down_duration="", + kill_duration_seconds=0, + kill_interval_seconds=0, + kill_batch=0, + saturation_qps_list="", + saturation_restarts_list="", ): details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2) json_data = json.loads(details) @@ -145,6 +528,17 @@ def collect_clusterloader2( "deployments_per_namespace": deployments_per_namespace, "replicas_per_deployment": replicas_per_deployment, "pods_per_cluster": namespaces * deployments_per_namespace * replicas_per_deployment, + # Phase 4a — pod-churn knobs. Defaults are 0/"" for non-churn + # test_types so existing Kusto queries that don't reference + # these fields stay valid. For pod-churn runs these record the + # exact stressor parameters so historical comparisons survive + # default changes. 
+ "churn_cycles": churn_cycles, + "churn_up_duration": churn_up_duration, + "churn_down_duration": churn_down_duration, + "kill_duration_seconds": kill_duration_seconds, + "kill_interval_seconds": kill_interval_seconds, + "kill_batch": kill_batch, "details": ( testsuites[0]["testcases"][0].get("failure", None) if testsuites[0].get("testcases") @@ -163,13 +557,720 @@ def collect_clusterloader2( "namespaces": namespaces, "deployments_per_namespace": deployments_per_namespace, "replicas_per_deployment": replicas_per_deployment, + "churn_cycles": churn_cycles, + "kill_duration_seconds": kill_duration_seconds, + "kill_interval_seconds": kill_interval_seconds, + "kill_batch": kill_batch, } - content = process_cl2_reports(cl2_report_dir, template) + # Shared process_cl2_reports() does an unconditional open() on every + # entry of cl2_report_dir, which raises IsADirectoryError on any subdir. + # Today the only subdir is logs/ (created by run-cl2-on-cluster.sh for + # pod-log capture), but we stash ANY subdir so future additions (new + # diag dumps, CL2 version bump emitting per-phase subdirs, etc.) don't + # silently regress. Subdirs are relocated OUTSIDE cl2_report_dir for + # the duration of the parse and restored in a finally block — they + # must end up back inside cl2_report_dir so the pipeline-level + # artifact publish picks them up alongside junit.xml. + stash_root = None + stashed_entries = [] + for entry in os.listdir(cl2_report_dir): + if os.path.isdir(os.path.join(cl2_report_dir, entry)): + if stash_root is None: + stash_root = tempfile.mkdtemp(prefix="cl2-report-stash-") + os.rename( + os.path.join(cl2_report_dir, entry), + os.path.join(stash_root, entry), + ) + stashed_entries.append(entry) + try: + content = process_cl2_reports(cl2_report_dir, template) + finally: + if stash_root: + for entry in stashed_entries: + src = os.path.join(stash_root, entry) + if os.path.isdir(src): + os.rename(src, os.path.join(cl2_report_dir, entry)) + if not os.listdir(stash_root): + os.rmdir(stash_root) os.makedirs(os.path.dirname(result_file), exist_ok=True) with open(result_file, "w", encoding="utf-8") as f: f.write(content) + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) timing pickup. + # apiserver-failure-killer.sh writes ApiserverFailureTimings_.json + # at the target cluster's report dir with t0/t1/duration. Non-target + # clusters skip writing the file. process_cl2_reports() doesn't recognize + # this file pattern, so we emit the row explicitly here. One row per + # timing file (always exactly one — only the target cluster writes one). + _emit_apiserver_failure_timing_rows(cl2_report_dir, template, result_file) + + # Phase 4b — Scenario #7 (HA Configuration Validation) scaling pickup. + # ha-config-scaler.sh writes HAConfigScalingTimings_.json on + # EVERY cluster (not just the kill target) — HA scaling is mesh-wide. + # One row per cluster. + _emit_ha_config_scaling_rows(cl2_report_dir, template, result_file) + + # Phase 4b — Scenario #3 (Node Churn / IP Churn) timing pickup. + # node-churner.sh writes NodeChurnTimings_.json into the + # TARGET cluster's per-cluster report dir (the churner runs from + # execute.yml on the AzDO agent, not inside CL2 — see plan.md scenario #3 + # design). One row per recorded op (scale_up / scale_down / replace_drain / + # replace_delete / replace_wait). Non-target clusters skip writing the + # file → no rows emitted for them. 
+    _emit_node_churn_timing_rows(cl2_report_dir, template, result_file)
+
+    # Phase 4b — Scenario #6 (Upper Bound / Saturation) classifier rows.
+    # Reads per-rung GenericPrometheusQuery output JSONs (one per measurement
+    # × rung; CL2 emits them with the rung's suffix in the Identifier and
+    # filename), applies the saturation classifier to each rung, and emits
+    # one SaturationRung row per rung + one SaturationSummary row per
+    # cluster. No-op when saturation_qps_list is empty (i.e. not an
+    # upper-bound test_type) so non-saturation scenarios pay zero overhead.
+    _emit_saturation_profile_rows(
+        cl2_report_dir, template, result_file,
+        saturation_qps_list, saturation_restarts_list,
+    )
+
+
+def _emit_saturation_profile_rows(
+    cl2_report_dir, template, result_file,
+    saturation_qps_list, saturation_restarts_list,
+):
+    """Append SaturationRung + SaturationSummary JSONL rows.
+
+    Reads per-rung GenericPrometheusQuery output JSONs (CL2-emitted, format
+    {"version": "v1", "dataItems": [{"labels": {"Metric": <name>},
+    "data": {"value": <number>}}, ...]}) and applies the classifier.
+
+    Args:
+        cl2_report_dir: per-cluster report directory.
+        template: row template (cluster/mesh_size/etc. already filled in).
+        result_file: per-cluster JSONL output path (appended).
+        saturation_qps_list: comma-separated QPS values, one per rung.
+                             Empty string → not an upper-bound run → no-op.
+        saturation_restarts_list: comma-separated restart counts, one per
+                                  rung. Length must match qps_list; if not,
+                                  missing entries default to 1.
+
+    Emitted rows (one per rung + one per cluster summary):
+        SaturationRung: {
+            "rung_index": int,
+            "configured_qps": int,
+            "configured_restarts": int,
+            "classifier_version": str,
+            "thresholds": {<signal>: float},
+            "verdict": str,  # clean | latency_spike | queue_unbounded |
+                             # cpu_exhaust | mesh_failure_burst | etcd_tail
+            "dominant_signal_ratio": float,
+            "rung_completed": bool,
+            "measurement_missing": [str],
+            "signals": {<signal>: float|None},
+            "all_verdicts": {<criterion>: float},  # ratio observed/threshold
+        }
+        SaturationSummary: {
+            "rungs_configured": int,
+            "rungs_completed": int,
+            "max_clean_qps": int|None,  # highest QPS in contiguous clean prefix
+            "first_failure_rung_index": int|None,
+            "first_failure_qps": int|None,
+            "first_failure_mode": str|None,
+            "second_failure_mode": str|None,
+            "classifier_version": str,
+        }
+    """
+    if not saturation_qps_list:
+        return  # Not an upper-bound run; no-op.
+    try:
+        qps_list = [int(x) for x in saturation_qps_list.split(",") if x.strip()]
+    except ValueError as e:
+        print(
+            f"[collect] WARN: malformed saturation_qps_list "
+            f"{saturation_qps_list!r}: {e}; skipping saturation classifier",
+            file=sys.stderr,
+        )
+        return
+    if not qps_list:
+        return
+    try:
+        restarts_list = [
+            int(x) for x in (saturation_restarts_list or "").split(",")
+            if x.strip()
+        ]
+    except ValueError:
+        restarts_list = []
+    # Pad/truncate restarts_list to match qps_list length. Missing entries
+    # default to 1 (the smallest meaningful restart count). Excess entries
+    # are ignored.
+    while len(restarts_list) < len(qps_list):
+        restarts_list.append(1)
+    restarts_list = restarts_list[: len(qps_list)]
+
+    if not os.path.isdir(cl2_report_dir):
+        print(
+            f"[collect] WARN: saturation classifier: report dir "
+            f"{cl2_report_dir} does not exist",
+            file=sys.stderr,
+        )
+        return
+    all_files = os.listdir(cl2_report_dir)
+
+    # Proactive debug: dump the full list of rung-suffixed measurement files
+    # so postmortem doesn't depend on the AzDO step's stdout being preserved.
+ # User direction 2026-05-14: assume failure, keep debug logs baked in + # until n=2 + n=20 are green; strip after. + # + # Match BOTH filename conventions: + # prod: "GenericPrometheusQuery <metricName> Rung<i>_<testName>_<timestamp>.json" + # (space between method and metricName; verified build 67211) + # compact: "GenericPrometheusQuery_<metricName>Rung<i>_<testName>_<timestamp>.json" + # (no spaces; legacy mock convention) + # Pre-fix (build 67221) the diagnostic counted only compact-form files, + # so we'd see "0 found" even when files DID land via prod-form (the + # _find_file lookup correctly accepts both, but the diagnostic was + # misleading). Fix: count any GenericPrometheusQuery*.json with Rung + # in the name. + rung_files_seen = sorted([ + f for f in all_files + if f.startswith("GenericPrometheusQuery") + and "Rung" in f + and f.endswith(".json") + ]) + print( + f"[collect] saturation: classifier starting for " + f"qps_list={qps_list} restarts_list={restarts_list}", + file=sys.stderr, + ) + print( + f"[collect] saturation: cl2_report_dir={cl2_report_dir} " + f"total_files_in_dir={len(all_files)} " + f"rung_files_matching_pattern={len(rung_files_seen)}", + file=sys.stderr, + ) + # Print ALL files (not just rung ones) so if the prefix matcher has any + # encoding/whitespace surprise, the raw listing reveals it. + for fname in all_files[:30]: + print(f"[collect] saturation: listdir: {fname!r}", file=sys.stderr) + if len(all_files) > 30: + print( + f"[collect] saturation: ... and {len(all_files) - 30} more", + file=sys.stderr, + ) + + def _read_metric(filepath, metric_label): + """Return the numeric `value` for a given Metric label, or None. + + Supports BOTH known CL2 dataItem shapes: + + (A) CL2 GenericPrometheusQuery — one dataItem with all query + results as named keys in `data` (verified against build 67224): + {"dataItems": [{"data": {"Max": 0, "Perc99": 0.5}, "unit": "#"}]} + The metric_label is the query name from the YAML + (Max / Perc50 / Perc99 / etc.) and is looked up directly as a + dict key inside item.data. + + (B) Legacy / PodStartupLatency-style — one dataItem per metric, + with labels.Metric naming the metric and data.value holding + the number: + {"dataItems": [ + {"labels": {"Metric": "Perc99"}, "data": {"value": 0.5}} + ]} + + Returns the first match across all dataItems. None if the label + isn't present in any item or the file can't be parsed. + """ + try: + with open(filepath, "r", encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {filepath}: {e}", + file=sys.stderr, + ) + return None + for item in data.get("dataItems", []) or []: + item_data = item.get("data") or {} + # Format A: query name (e.g. "Perc99") is a direct key in + # item.data. The value is the scalar number (not a {"value": N} + # wrapper). Skip dict-valued entries so we don't accidentally + # match a legacy nested structure. + if metric_label in item_data and not isinstance( + item_data[metric_label], (dict, list) + ): + val = item_data[metric_label] + if val is None or val == "": + return None + try: + return float(val) + except (TypeError, ValueError): + return None + # Format B: labels.Metric carries the query name, data.value + # carries the scalar number. Backward-compatible with existing + # mock fixtures (PodStartupLatency mock_data).
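To make the two accepted shapes concrete, a synthetic sketch (payload values invented; the toy lookup mirrors the documented rules, it is not the real _read_metric):

```python
# Format A: one dataItem, query names as direct keys in `data`.
format_a = {"version": "v1",
            "dataItems": [{"data": {"Max": 3.0, "Perc99": 0.5}, "unit": "#"}]}
# Format B: one dataItem per metric, labels.Metric + data.value.
format_b = {"version": "v1",
            "dataItems": [{"labels": {"Metric": "Perc99"},
                           "data": {"value": 0.5}}]}

def toy_lookup(payload, metric_label):
    for item in payload.get("dataItems", []):
        data = item.get("data") or {}
        # Format A: direct key, scalar value only.
        if metric_label in data and not isinstance(data[metric_label], (dict, list)):
            return float(data[metric_label])
        # Format B: labels.Metric names the metric, data.value holds it.
        if (item.get("labels") or {}).get("Metric") == metric_label:
            return float(data["value"])
    return None

assert toy_lookup(format_a, "Perc99") == toy_lookup(format_b, "Perc99") == 0.5
```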
+ labels = item.get("labels") or {} + if labels.get("Metric") == metric_label: + val = item_data.get("value") + if val is None or val == "": + return None + try: + return float(val) + except (TypeError, ValueError): + return None + return None + + def _find_file(rung_suffix, metric_name_prefix): + """Locate the CL2-emitted JSON for a given metricName prefix and + rung suffix. CL2's actual file pattern (verified against build 67211) + is: + GenericPrometheusQuery <metricName> <suffix>_<testName>_<timestamp>.json + + e.g. for metricName "ClusterMesh Kvstore Sync Queue Size {{$suffix}}" + with suffix=Rung0: + GenericPrometheusQuery ClusterMesh Kvstore Sync Queue Size Rung0_clustermesh-upper-bound_2026-05-15T02:20:27Z.json + + We match on the production format primarily, with a fallback to the + compact-no-space underscore format + GenericPrometheusQuery_<metricName><suffix>_<testName>_<timestamp>.json + for backward compat with mock fixtures + any other CL2 versions + that strip spaces. + """ + # Production format (build 67211 confirmed): space-separated, suffix + # immediately follows metric name with a space (because the YAML + # template `metricName: <metric name> {{$suffix}}` keeps the space). + prod_target = f"GenericPrometheusQuery {metric_name_prefix} {rung_suffix}_" + # Mock/compact fallback: drop spaces, no leading space after method. + compact_metric = metric_name_prefix.replace(" ", "") + compact_target = f"GenericPrometheusQuery_{compact_metric}{rung_suffix}_" + matches = [ + f for f in all_files + if (f.startswith(prod_target) or f.startswith(compact_target)) + and f.endswith(".json") + ] + if matches: + return os.path.join(cl2_report_dir, matches[0]) + return None + + # Signal name → (metricName-from-YAML, metric-label, transform). + # The metricName is the YAML's `metricName:` field text (space-separated), + # which is what CL2 embeds in the emitted filename. Build 67211 verified + # the production filename pattern. + # + # Transform converts the measurement's native unit into the classifier's + # threshold unit (seconds → milliseconds where applicable). + signal_map = { + "latency_p99_ms": ( + "ClusterMesh Kvstore Operation Duration", "Perc99", + lambda v: v * 1000.0, + ), + "queue_size_perc99": ( + "ClusterMesh Kvstore Sync Queue Size", "Perc99", + lambda v: v, + ), + "queue_size_max": ( + "ClusterMesh Kvstore Sync Queue Size", "Max", + lambda v: v, + ), + "apiserver_max_cpu_cores": ( + "ClusterMesh APIServer Pod CPU", "PerPodMax", + lambda v: v, + ), + "mesh_failure_rate_max": ( + "ClusterMesh Remote Cluster Failure Rate", "Max", + lambda v: v, + ), + "etcd_commit_p99_ms": ( + "ClusterMesh Etcd Backend Write Duration", "Perc99", + lambda v: v * 1000.0, + ), + "observed_event_rate_p99": ( + "ClusterMesh Kvstore Events Rate", "Perc99", + lambda v: v, + ), + } + # Criterion → signal-name driving the verdict. Each criterion's ratio + # is observed/threshold; ≥1.0 = tripped. Dominant criterion = the + # tripped one with the highest ratio.
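A worked example of the ratio rule just described (threshold and observed numbers invented, not the real SATURATION_THRESHOLDS):

```python
thresholds = {"latency_p99_ms": 500.0, "queue_size_perc99": 1000.0}  # invented
observed = {"latency_p99_ms": 650.0, "queue_size_perc99": 4000.0}    # invented

ratios = {sig: observed[sig] / thresholds[sig] for sig in thresholds}
# {"latency_p99_ms": 1.3, "queue_size_perc99": 4.0}: both tripped (>= 1.0)
tripped = {sig: r for sig, r in ratios.items() if r >= 1.0}
dominant = max(tripped, key=tripped.get) if tripped else None
assert dominant == "queue_size_perc99"  # highest observed/threshold ratio wins
```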
+ criteria = { + "latency_spike": "latency_p99_ms", + "queue_unbounded": "queue_size_perc99", + "cpu_exhaust": "apiserver_max_cpu_cores", + "mesh_failure_burst": "mesh_failure_rate_max", + "etcd_tail": "etcd_commit_p99_ms", + } + + rungs_completed = 0 + first_failure_index = None + first_failure_qps = None + first_failure_mode = None + second_failure_mode = None + max_clean_qps = None + clean_streak_broken = False + + with open(result_file, "a", encoding="utf-8") as out: + for rung_idx, qps in enumerate(qps_list): + suffix = f"Rung{rung_idx}" + restarts = restarts_list[rung_idx] + + signals = {} + measurement_missing = [] + for sig_name, (ident, metric_label, transform) in signal_map.items(): + fpath = _find_file(suffix, ident) + if fpath is None: + signals[sig_name] = None + measurement_missing.append(sig_name) + continue + raw = _read_metric(fpath, metric_label) + if raw is None: + signals[sig_name] = None + measurement_missing.append(sig_name) + else: + signals[sig_name] = transform(raw) + + # Rung "completed" iff at least one signal landed AND the + # latency signal landed (proxy for "the rung executed and CL2 + # gathered measurements for it"). Tuned conservatively so a + # half-collected rung is flagged for re-investigation rather + # than silently summarized. + rung_completed = ( + signals.get("latency_p99_ms") is not None + and len(measurement_missing) < len(signal_map) + ) + if rung_completed: + rungs_completed += 1 + + # Compute per-criterion ratios. None signals = criterion + # skipped (cannot contribute to verdict). + all_verdicts = {} + for criterion, sig_name in criteria.items(): + v = signals.get(sig_name) + if v is None: + continue + threshold = SATURATION_THRESHOLDS[ + sig_name if sig_name in SATURATION_THRESHOLDS + else "latency_p99_ms" # never hits — defensive + ] + if threshold <= 0: + continue + all_verdicts[criterion] = v / threshold + + tripped = {c: r for c, r in all_verdicts.items() if r >= 1.0} + if tripped: + verdict = max(tripped, key=tripped.get) + dominant_ratio = tripped[verdict] + elif (not rung_completed and rungs_completed > 0): + # Phase 4b — Scenario #6 monitoring_oom verdict (added + # 2026-05-15 after build 67279 showed Prometheus crashed + # mid-run at Rung 2-3, losing all measurements for those + # rungs). When an earlier rung completed but the current + # rung's measurements all came back empty, the most likely + # explanation is that the monitoring stack (Prometheus + # pod) ran out of memory / went CrashLoopBackOff under + # the elevated workload pressure of the higher rung. + # That IS a saturation finding per spec line 113 + # ("Resource exhaustion occurs") — record it as a real + # verdict instead of silently leaving the rung as + # verdict=clean rung_completed=False which underclaims + # the failure. + # + # Synthetic dominant_signal_ratio=999.0 so dashboards + # ordering verdicts by severity rank this above other + # tripped criteria. The actual signal that drove the + # OOM (CPU, memory, query queue, cardinality explosion) + # is NOT distinguishable from blob output alone — needs + # Prom pod logs to triage. + verdict = "monitoring_oom" + dominant_ratio = 999.0 + else: + verdict = "clean" + dominant_ratio = max(all_verdicts.values()) if all_verdicts else 0.0 + + # Track per-cluster summary fields. max_clean_qps is the + # highest qps in a CONTIGUOUS clean+completed prefix — once + # a non-clean rung lands we stop extending it (a brief + # later-rung "false clean" shouldn't disqualify the genuine + # earlier failure). 
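And the contiguous-clean-prefix rule in one synthetic pass (the verdict sequence is invented, and every rung is assumed completed for brevity):

```python
# (qps, verdict) per rung: clean, clean, failure, then a later "false clean".
sweep = [(100, "clean"), (500, "clean"), (1500, "queue_unbounded"), (4000, "clean")]

max_clean_qps, streak_broken = None, False
for qps, verdict in sweep:
    if verdict == "clean" and not streak_broken:
        max_clean_qps = qps
    else:
        streak_broken = True
assert max_clean_qps == 500  # the rung-3 "clean" does not extend the prefix
```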
+ if verdict == "clean" and rung_completed and not clean_streak_broken: + if max_clean_qps is None or qps > max_clean_qps: + max_clean_qps = qps + else: + clean_streak_broken = True + if verdict != "clean": + if first_failure_index is None: + first_failure_index = rung_idx + first_failure_qps = qps + first_failure_mode = verdict + elif (second_failure_mode is None + and verdict != first_failure_mode): + second_failure_mode = verdict + + rung_row = json.loads(json.dumps(template)) + rung_row["measurement"] = "SaturationRung" + rung_row["group"] = "upper-bound" + rung_row["result"] = { + "data": { + "rung_index": rung_idx, + "configured_qps": qps, + "configured_restarts": restarts, + "classifier_version": SATURATION_CLASSIFIER_VERSION, + "thresholds": SATURATION_THRESHOLDS, + "verdict": verdict, + "dominant_signal_ratio": dominant_ratio, + "rung_completed": rung_completed, + "measurement_missing": measurement_missing, + "signals": signals, + "all_verdicts": all_verdicts, + }, + "unit": "verdict", + } + out.write(json.dumps(rung_row) + "\n") + + # Per-rung stderr summary: greppable line for AzDO postmortem + # ("collect saturation rung=2 verdict=queue_unbounded ratio=5.0"). + # Counts signals found out of expected so partial rungs surface. + print( + f"[collect] saturation: rung={rung_idx} qps={qps} " + f"restarts={restarts} verdict={verdict} " + f"dominant_ratio={dominant_ratio:.3f} " + f"completed={rung_completed} " + f"signals_found={len(signal_map) - len(measurement_missing)}/{len(signal_map)} " + f"missing={measurement_missing}", + file=sys.stderr, + ) + + summary_row = json.loads(json.dumps(template)) + summary_row["measurement"] = "SaturationSummary" + summary_row["group"] = "upper-bound" + summary_row["result"] = { + "data": { + "rungs_configured": len(qps_list), + "rungs_completed": rungs_completed, + "max_clean_qps": max_clean_qps, + "first_failure_rung_index": first_failure_index, + "first_failure_qps": first_failure_qps, + "first_failure_mode": first_failure_mode, + "second_failure_mode": second_failure_mode, + "configured_qps_list": qps_list, + "configured_restarts_list": restarts_list, + "classifier_version": SATURATION_CLASSIFIER_VERSION, + "thresholds": SATURATION_THRESHOLDS, + }, + "unit": "verdict", + } + out.write(json.dumps(summary_row) + "\n") + + # Stderr summary for AzDO postmortem; greppable headline line. + print( + f"[collect] saturation: SUMMARY rungs_completed={rungs_completed}/{len(qps_list)} " + f"max_clean_qps={max_clean_qps} " + f"first_failure_qps={first_failure_qps} " + f"first_failure_mode={first_failure_mode} " + f"second_failure_mode={second_failure_mode} " + f"classifier_version={SATURATION_CLASSIFIER_VERSION}", + file=sys.stderr, + ) + + +def _emit_node_churn_timing_rows(cl2_report_dir, template, result_file): + """Append one JSONL row per recorded op in NodeChurnTimings_*.json. 
+ + File shape (from node-churner.sh): + { + "target_context": str, + "target_cluster_name": str, + "target_resource_group": str, + "target_nodepool": str, + "scenario": "node-churn-scale" | "node-churn-replace" | "node-churn-combined", + "original_node_count": int, + "ready_quorum_reached": bool, + "cleanup_failed": bool, + "scenario_valid": bool, // false if a circuit-breaker fired + "truncated": bool, // true if churner ran past CL2 sleep + "started_epoch": int, + "ended_epoch": int, + "duration_seconds": int, + "ops": [ + { + "op_index": int, + "op_type": "scale_up"|"scale_down"|"replace_drain"|"replace_delete"|"replace_refill"|"replace_wait", + "start_epoch": int, + "end_epoch": int, + "duration_seconds": int, + "succeeded": bool, + "observed_node_count": int, + "pre_ip_set": [str], // only populated on replace_wait + "post_ip_set": [str], + "pre_node_names": [str], // only populated on replace_wait + "post_node_names": [str], + "new_ip_count": int, // INFORMATIONAL — Azure VNet allocator + // reuses freed IPs immediately so this + // may be 0 even after successful replacement + "new_node_count": int, // AUTHORITATIVE replacement signal — + // VMSS instance IDs are monotonic so node + // names always differ after replacement + "error": str // empty on success + }, ... + ] + } + + Each op becomes one row in the JSONL with + measurement="NodeChurnOpTiming", group=<scenario>, and result.data = the + per-op JSON, PLUS scenario-level fields copied onto result.data for + cross-row context (scenario_valid, cleanup_failed, truncated, etc.). + A scenario-level summary row with measurement="NodeChurnSummary" is also + emitted so Kusto queries can detect cleanup_failed / scenario_valid=false + runs without joining op rows. One summary row per timing file. + """ + timing_files = [ + f for f in os.listdir(cl2_report_dir) + if f.startswith("NodeChurnTimings_") and f.endswith(".json") + ] + if not timing_files: + return + scenario_level_keys = ( + "scenario", "target_context", "target_cluster_name", + "target_resource_group", "target_nodepool", + "original_node_count", "ready_quorum_reached", "cleanup_failed", + "scenario_valid", "truncated", "started_epoch", "ended_epoch", + "duration_seconds", + ) + with open(result_file, "a", encoding="utf-8") as out: + for tf in timing_files: + tf_path = os.path.join(cl2_report_dir, tf) + try: + with open(tf_path, "r", encoding="utf-8") as tfh: + timing_data = json.load(tfh) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {tf_path}: {e}", + file=sys.stderr, + ) + continue + scenario_context = { + k: timing_data.get(k) for k in scenario_level_keys + } + # One summary row per file — always emitted, even if ops list is + # empty (e.g., quorum never reached → churner aborted before any op). + summary_row = json.loads(json.dumps(template)) + summary_row["measurement"] = "NodeChurnSummary" + summary_row["group"] = timing_data.get("scenario", "node-churn") + summary_row["result"] = { + "data": { + **scenario_context, + "op_count": len(timing_data.get("ops") or []), + }, + "unit": "seconds", + } + out.write(json.dumps(summary_row) + "\n") + # One row per op, with scenario_context merged onto result.data so + # a single Kusto filter (e.g., scenario_valid=true) gates op-level + # analysis without needing a join.
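A sketch of the no-join consumption pattern this enables (the JSONL path is hypothetical; measurement and field names as documented above):

```python
import json

with open("results/mesh-1.jsonl", encoding="utf-8") as f:  # hypothetical path
    rows = [json.loads(line) for line in f if line.strip()]

# One flat filter gates op-level analysis: no join back to summary rows.
valid_ops = [r for r in rows
             if r.get("measurement") == "NodeChurnOpTiming"
             and r["result"]["data"].get("scenario_valid")]
replace_waits = [op["result"]["data"]["duration_seconds"]
                 for op in valid_ops
                 if op["result"]["data"]["op_type"] == "replace_wait"]
```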
+ for op in timing_data.get("ops") or []: + op_row = json.loads(json.dumps(template)) + op_row["measurement"] = "NodeChurnOpTiming" + op_row["group"] = timing_data.get("scenario", "node-churn") + op_row["result"] = { + "data": {**scenario_context, **op}, + "unit": "seconds", + } + out.write(json.dumps(op_row) + "\n") + + +def _emit_apiserver_failure_timing_rows(cl2_report_dir, template, result_file): + """Append one JSONL row per ApiserverFailureTimings_*.json found. + + The timing file shape (from apiserver-failure-killer.sh): + { + "target_context": str, + "t0_kill_epoch": int, + "t1_recovered_epoch": int, + "recovery_duration_seconds": int, + "recovered": bool, + "killed_pod_name": str, + "killed_pod_uid": str, + "replacement_pod_uid": str, + "note": str + } + + Each timing file becomes one row in the JSONL with + measurement="ApiserverFailureRecoveryTiming", group="apiserver-failure", + and result.data = the timing JSON. Downstream Kusto queries can filter + on this measurement name to get per-run recovery timings keyed by + test_type=apiserver-failure + cluster. + """ + timing_files = [ + f for f in os.listdir(cl2_report_dir) + if f.startswith("ApiserverFailureTimings_") and f.endswith(".json") + ] + if not timing_files: + return + with open(result_file, "a", encoding="utf-8") as out: + for tf in timing_files: + tf_path = os.path.join(cl2_report_dir, tf) + try: + with open(tf_path, "r", encoding="utf-8") as tfh: + timing_data = json.load(tfh) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {tf_path}: {e}", + file=sys.stderr, + ) + continue + # Deep-copy template so we don't mutate the shared dict for any + # downstream caller. + row = json.loads(json.dumps(template)) + row["measurement"] = "ApiserverFailureRecoveryTiming" + row["group"] = "apiserver-failure" + row["result"] = {"data": timing_data, "unit": "seconds"} + out.write(json.dumps(row) + "\n") + + +def _emit_ha_config_scaling_rows(cl2_report_dir, template, result_file): + """Append one JSONL row per HAConfigScalingTimings_*.json found. + + The scaling file shape (from ha-config-scaler.sh): + { + "context": str, + "action": "scale-up" | "scale-down", + "requested_replicas": int, + "spec_replicas_after": int, + "ready_replicas_after": int, + "ha_replicas_honored": bool, + "scale_duration_seconds": int, + "note": str + } + + Each file becomes one row in the JSONL with + measurement="HAConfigScalingTiming", group="ha-config", and + result.data = the scaling JSON. Only scale-up emits a file; scale-down + is best-effort cleanup that does NOT overwrite the scale-up file. + Downstream Kusto queries can filter on measurement="HAConfigScalingTiming" + and ha_replicas_honored=true to scope HA A/B comparisons to runs where + the scale actually stuck (ENO operator did not revert). 
+ """ + timing_files = [ + f for f in os.listdir(cl2_report_dir) + if f.startswith("HAConfigScalingTimings_") and f.endswith(".json") + ] + if not timing_files: + return + with open(result_file, "a", encoding="utf-8") as out: + for tf in timing_files: + tf_path = os.path.join(cl2_report_dir, tf) + try: + with open(tf_path, "r", encoding="utf-8") as tfh: + scaling_data = json.load(tfh) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {tf_path}: {e}", + file=sys.stderr, + ) + continue + row = json.loads(json.dumps(template)) + row["measurement"] = "HAConfigScalingTiming" + row["group"] = "ha-config" + row["result"] = {"data": scaling_data, "unit": "seconds"} + out.write(json.dumps(row) + "\n") + def main(): parser = argparse.ArgumentParser(description="ClusterMesh scale-test harness.") @@ -183,6 +1284,132 @@ def main(): pc.add_argument("--operation-timeout", type=str, default="15m") pc.add_argument("--cl2_override_file", type=str, required=True, help="Path to the overrides of CL2 config file") + # Phase 4a — Scenario #2 (Pod Churn Stress) knobs. Defaults match the + # pipeline matrix defaults so a configure invocation that doesn't pass + # these still writes valid overrides for both pod-churn-scale.yaml and + # pod-churn-kill.yaml. + pc.add_argument("--churn-cycles", type=int, default=5, + help="Number of scale-up/down cycles (pod-churn-scale).") + pc.add_argument("--churn-up-duration", type=str, default="60s", + help="Sleep between scale-up and next scale-down (pod-churn-scale).") + pc.add_argument("--churn-down-duration", type=str, default="60s", + help="Sleep between scale-down and next scale-up (pod-churn-scale).") + pc.add_argument("--kill-duration", type=str, default="10m", + help="Total kill-loop duration as a human string (logged only). " + "The runtime is bounded by --kill-duration-seconds.") + pc.add_argument("--kill-interval-seconds", type=int, default=10, + help="Seconds between successive kill rounds (pod-churn-kill).") + pc.add_argument("--kill-batch", type=int, default=5, + help="Pods deleted per round (pod-churn-kill).") + pc.add_argument("--kill-duration-seconds", type=int, default=600, + help="Killer Job script runtime in seconds (pod-churn-kill).") + pc.add_argument("--kill-job-deadline-seconds", type=int, default=660, + help="Killer Job activeDeadlineSeconds — defense-in-depth bound, " + "should be kill_duration_seconds plus a small buffer.") + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + pc.add_argument("--apiserver-kill-target-context", type=str, default="clustermesh-1", + help="kubectl context name of the cluster whose clustermesh-apiserver " + "to kill. Other clusters no-op (per-cluster CL2 with shared overrides).") + pc.add_argument("--apiserver-kill-recovery-timeout-seconds", type=int, default=240, + help="How long to wait for the replacement clustermesh-apiserver pod " + "to reach Ready after kill. AKS-managed Cilium can take " + "120-180s in our observed runs (image pull + ENI attach); " + "240s gives headroom. Killer fails soft on timeout — writes " + "timing JSON with recovered:false instead of erroring.") + pc.add_argument("--apiserver-kill-observation-seconds", type=int, default=60, + help="Sleep duration AFTER the kill returns, before measurement gather. " + "Lets peer clusters' Prometheus scrape the failure window and " + "the post-recovery backlog drain.") + # Phase 4b — Scenario #7 (HA Configuration Validation) knob. 
+ pc.add_argument("--ha-config-replicas", type=int, default=3, + help="Target replicas count for clustermesh-apiserver Deployment " + "during the ha-config scenario. Each cluster scales its own " + "Deployment to this count before measurements start, then back " + "to 1 after gather. Default 3 (standard k8s HA, etcd quorum-friendly).") + # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # CL2 templates that don't reference these silently ignore (same pattern + # as the apiserver / ha-config knobs). node-churner.sh consumes them via + # matrix-exported env vars in execute.yml — NOT via these overrides. + pc.add_argument("--node-churn-target-context", type=str, default="clustermesh-1", + help="kubectl context name of the cluster whose default nodepool " + "is scaled / replaced. Other clusters observe via CL2. " + "Reuses the apiserver-failure target convention.") + pc.add_argument("--node-churn-cycles", type=int, default=3, + help="Number of scale-up/down cycles in node-churn-scale. " + "Each cycle does ONE scale-up by --node-churn-delta then ONE " + "scale-down by the same delta with --node-churn-settle-seconds " + "between ops. 3 cycles × 2 ops × ~4min/op = ~24min wall.") + pc.add_argument("--node-churn-delta", type=int, default=5, + help="Per-half-cycle scale delta. +N on scale-up, -N on scale-down. " + "Default 5 → 20→25→20 cycles. Bounded above by AKS vCPU quota.") + pc.add_argument("--node-churn-settle-seconds", type=int, default=60, + help="Sleep between consecutive nodepool ops to let cilium " + "reconcile node identities + endpoints before next op.") + pc.add_argument("--node-churn-scale-duration-seconds", type=int, default=1800, + help="CL2-side sleep window for node-churn-scale.yaml. Must be " + "≥ expected churner wall time + settle margin. 1800s = 30min " + "covers 3-cycle scale at ~24min churner wall.") + pc.add_argument("--node-churn-replace-duration-seconds", type=int, default=1500, + help="CL2-side sleep window for node-churn-replace.yaml. " + "1500s = 25min covers VMSS-delete-and-replace of ~10 instances " + "in parallel (each drain+replace ~5-10min, parallelized).") + pc.add_argument("--node-churn-combined-duration-seconds", type=int, default=3300, + help="CL2-side sleep window for node-churn-combined.yaml " + "(scale phase + replace phase serially). Sum of the two " + "individual windows plus margin.") + pc.add_argument("--node-replace-batch-size", type=int, default=10, + help="Number of VMSS instances to drain+delete in the replace " + "scenario. AKS auto-replaces to restore the desired count, " + "yielding K new VMs with new IPs. 10 of 20 default nodes = " + "50%% pool replacement; bounded above by --max-surge fraction " + "Cilium can tolerate without endpoint floods saturating the mesh.") + pc.add_argument("--node-churn-ready-timeout-seconds", type=int, default=300, + help="How long node-churner.sh waits for per-cluster CL2 ready " + "sentinels before starting the first nodepool op. If quorum " + "(all clusters' sentinels) isn't reached within this window, " + "the churner aborts WITH cleanup (restores pool to original " + "node count) and marks scenario_valid=false in the timing JSON.") + # Phase 4b — Scenario #6 (Upper Bound / Saturation) knobs. + # Each upper-bound CL2 run sweeps through N rungs of progressively + # heavier load (QPS × restart count). The classifier in collect emits + # one SaturationRung row per rung tagging which signal tripped + # (clean | latency_spike | queue_unbounded | cpu_exhaust | + # mesh_failure_burst | etcd_tail). 
See SATURATION_THRESHOLDS at the + # top of this module + plan.md Scenario #6 section. + pc.add_argument("--saturation-qps-list", type=str, default="100,500,1500,4000,10000", + help="Comma-separated list of QPS values, one per saturation " + "rung. Length determines number of rungs; CL2's " + "upper-bound.yaml parses this via StringSplit. " + "Default is a 5-rung sweep (100, 500, 1500, 4000, 10000 " + "calls/sec) — bumped 2026-05-15 after build 67224 showed " + "all signals at 1-15%% of thresholds at the prior top rung " + "(qps=160, restarts=4). QPS above ~100 is effectively " + "uncapped for our 20-deployment workload (CL2 apply " + "throughput is the ceiling, not QPS itself); " + "saturation_restarts_list is the real load lever.") + pc.add_argument("--saturation-restarts-list", type=str, default="2,4,8,15,25", + help="Comma-separated list of restart counts, one per saturation " + "rung (length must match --saturation-qps-list). Each rung's " + "workload is restart-bursted this many times so cumulative " + "event volume scales with rung index even when CL2's " + "Deployment-apply QPS saturates. Restart count is the " + "primary load lever: each restart triggers ~200 pod recreates " + "(at n=2 with 200-pod workload), each emitting endpoint + " + "identity + service events through the mesh.") + pc.add_argument("--saturation-rung-duration-seconds", type=int, default=240, + help="Wall-clock duration each rung holds after its restart-burst " + "before measurements are gathered. Drives the per-rung " + "measurement window (CL2 substitutes %%v in queries with " + "wall time since the matching `start` action). Bumped " + "180s\u2192240s 2026-05-15 to give higher rungs time to " + "accumulate meaningful signal at the post-burst tail.") + pc.add_argument("--saturation-settle-seconds", type=int, default=90, + help="Sleep between rungs so kvstore queues from rung r drain " + "before rung r+1's measurement window opens. Insufficient " + "settle biases later rungs' verdicts toward `queue_unbounded` " + "even if the queue would have drained on its own. Bumped " + "60s\u219290s 2026-05-15 since higher restart bursts take " + "longer to fully drain queues.") # execute pe = subparsers.add_parser("execute", help="Run CL2 against a single cluster") @@ -192,6 +1419,39 @@ def main(): pe.add_argument("--cl2-config-file", type=str, required=True) pe.add_argument("--kubeconfig", type=str, required=True) pe.add_argument("--provider", type=str, required=True) + pe.add_argument("--tear-down-prometheus", action="store_true", + help="Tear down Prometheus stack at end of CL2 (set in share-infra " + "mode so the next scenario's CL2 can deploy a fresh Prom). 
" + "Default is to preserve Prom for failure-diagnostic dumping.") + + # execute-parallel — fan out CL2 across N clusters with bounded concurrency + pep = subparsers.add_parser( + "execute-parallel", + help="Run CL2 across multiple clusters with bounded concurrency", + ) + pep.add_argument("--clusters", type=str, required=True, + help="Path to JSON file containing array of cluster objects, " + "each with at least 'role' and 'kubeconfig' fields") + pep.add_argument("--max-concurrent", type=int, default=4, + help="Maximum number of CL2 invocations to run in parallel") + pep.add_argument("--worker-script", type=str, required=True, + help="Path to per-cluster bash worker (run-cl2-on-cluster.sh)") + pep.add_argument("--cl2-image", type=str, required=True) + pep.add_argument("--cl2-config-dir", type=str, required=True) + pep.add_argument("--cl2-config-file", type=str, required=True) + pep.add_argument("--cl2-report-dir-base", type=str, required=True, + help="Base directory; per-cluster reports land at //") + pep.add_argument("--provider", type=str, required=True) + pep.add_argument("--python-script-file", type=str, required=True, + help="Path to this scale.py — invoked by the worker script " + "via `python3 execute ...`") + pep.add_argument("--python-workdir", type=str, required=True, + help="Working dir for the nested python execute call " + "(typically modules/python so PYTHONPATH resolves)") + pep.add_argument("--tear-down-prometheus", action="store_true", + help="Pass through to each per-cluster CL2 invocation; used in " + "share-infra mode where multiple scenarios share infra and " + "each needs a clean Prometheus deploy.") # collect pco = subparsers.add_parser("collect", help="Collect results for one cluster") @@ -213,6 +1473,27 @@ def main(): pco.add_argument("--deployments-per-namespace", type=int, required=True) pco.add_argument("--replicas-per-deployment", type=int, required=True) pco.add_argument("--trigger_reason", type=str, default="") + # Phase 4a — pod-churn knobs recorded into the JSONL for historical + # comparison. Optional; default to 0/"" so non-churn test_types + # (event-throughput, default-config) don't need to set them. + pco.add_argument("--churn-cycles", type=int, default=0) + pco.add_argument("--churn-up-duration", type=str, default="") + pco.add_argument("--churn-down-duration", type=str, default="") + pco.add_argument("--kill-duration-seconds", type=int, default=0) + pco.add_argument("--kill-interval-seconds", type=int, default=0) + pco.add_argument("--kill-batch", type=int, default=0) + # Phase 4b — Scenario #6 (Upper Bound / Saturation) collect knobs. + # Optional; default to empty string so non-saturation test_types skip + # the classifier entirely (zero overhead). For upper-bound test_types, + # collect.yml plumbs the matrix-configured saturation_qps_list + + # saturation_restarts_list into these args so the classifier records + # the actual QPS and restart values that drove each rung. + pco.add_argument("--saturation-qps-list", type=str, default="", + help="Comma-separated QPS values from the upper-bound run. 
" + "Empty = not an upper-bound run; classifier is no-op.") + pco.add_argument("--saturation-restarts-list", type=str, default="", + help="Comma-separated restart counts from the upper-bound run " + "(length must match --saturation-qps-list).") args = parser.parse_args() @@ -223,6 +1504,31 @@ def main(): args.replicas_per_deployment, args.operation_timeout, args.cl2_override_file, + churn_cycles=args.churn_cycles, + churn_up_duration=args.churn_up_duration, + churn_down_duration=args.churn_down_duration, + kill_duration=args.kill_duration, + kill_interval_seconds=args.kill_interval_seconds, + kill_batch=args.kill_batch, + kill_duration_seconds=args.kill_duration_seconds, + kill_job_deadline_seconds=args.kill_job_deadline_seconds, + apiserver_kill_target_context=args.apiserver_kill_target_context, + apiserver_kill_recovery_timeout_seconds=args.apiserver_kill_recovery_timeout_seconds, + apiserver_kill_observation_seconds=args.apiserver_kill_observation_seconds, + ha_config_replicas=args.ha_config_replicas, + node_churn_target_context=args.node_churn_target_context, + node_churn_cycles=args.node_churn_cycles, + node_churn_delta=args.node_churn_delta, + node_churn_settle_seconds=args.node_churn_settle_seconds, + node_churn_scale_duration_seconds=args.node_churn_scale_duration_seconds, + node_churn_replace_duration_seconds=args.node_churn_replace_duration_seconds, + node_churn_combined_duration_seconds=args.node_churn_combined_duration_seconds, + node_replace_batch_size=args.node_replace_batch_size, + node_churn_ready_timeout_seconds=args.node_churn_ready_timeout_seconds, + saturation_qps_list=args.saturation_qps_list, + saturation_restarts_list=args.saturation_restarts_list, + saturation_rung_duration_seconds=args.saturation_rung_duration_seconds, + saturation_settle_seconds=args.saturation_settle_seconds, ) elif args.command == "execute": execute_clusterloader2( @@ -232,7 +1538,23 @@ def main(): args.cl2_config_file, args.kubeconfig, args.provider, + tear_down_prometheus=args.tear_down_prometheus, + ) + elif args.command == "execute-parallel": + rc = execute_parallel( + clusters_file=args.clusters, + max_concurrent=args.max_concurrent, + worker_script=args.worker_script, + cl2_image=args.cl2_image, + cl2_config_dir=args.cl2_config_dir, + cl2_config_file=args.cl2_config_file, + cl2_report_dir_base=args.cl2_report_dir_base, + provider=args.provider, + python_script_file=args.python_script_file, + python_workdir=args.python_workdir, + tear_down_prometheus=args.tear_down_prometheus, ) + sys.exit(rc) elif args.command == "collect": collect_clusterloader2( args.cl2_report_dir, @@ -249,6 +1571,14 @@ def main(): args.deployments_per_namespace, args.replicas_per_deployment, args.trigger_reason, + churn_cycles=args.churn_cycles, + churn_up_duration=args.churn_up_duration, + churn_down_duration=args.churn_down_duration, + kill_duration_seconds=args.kill_duration_seconds, + kill_interval_seconds=args.kill_interval_seconds, + kill_batch=args.kill_batch, + saturation_qps_list=args.saturation_qps_list, + saturation_restarts_list=args.saturation_restarts_list, ) else: parser.print_help() diff --git a/modules/python/tests/mock_data/.gitignore b/modules/python/tests/mock_data/.gitignore new file mode 100644 index 0000000000..49abfda49a --- /dev/null +++ b/modules/python/tests/mock_data/.gitignore @@ -0,0 +1,8 @@ +# Mock fixture log files are intentionally checked in (synthetic content, +# bytes-small) so test_clustermesh_scale's TestMockFixtureParity can verify +# the mock matches what run-cl2-on-cluster.sh 
produces in real runs. +# Without this exception the root *.log ignore strips them, the parity test +# fails locally on a fresh clone, and collect_clusterloader2 tests don't +# exercise the logs/-subdir-present shape — the exact gap that let an +# IsADirectoryError land in CI. +!*.log diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-agent.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-agent.log new file mode 100644 index 0000000000..ac2b9403b1 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-agent.log @@ -0,0 +1 @@ +# synthetic cilium-agent.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-operator.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-operator.log new file mode 100644 index 0000000000..2d665012b3 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-operator.log @@ -0,0 +1 @@ +# synthetic cilium-operator.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-apiserver.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-apiserver.log new file mode 100644 index 0000000000..786823cedc --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-apiserver.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-apiserver.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-etcd.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-etcd.log new file mode 100644 index 0000000000..620dc1d5e0 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-etcd.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-etcd.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-kvstoremesh.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-kvstoremesh.log new file mode 100644 index 0000000000..ae2fb8cd9c --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-kvstoremesh.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-kvstoremesh.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-agent.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-agent.log new file mode 100644 index 0000000000..2e0dda9c48 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-agent.log @@ -0,0 +1 @@ +# synthetic cilium-agent.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-operator.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-operator.log new file mode 100644 index 0000000000..e4b00b1cc9 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-operator.log @@ -0,0 +1 @@ +# synthetic cilium-operator.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-apiserver.log 
b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-apiserver.log new file mode 100644 index 0000000000..af21cefef0 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-apiserver.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-apiserver.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-etcd.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-etcd.log new file mode 100644 index 0000000000..5422124e72 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-etcd.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-etcd.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-kvstoremesh.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-kvstoremesh.log new file mode 100644 index 0000000000..279d5da2e5 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-kvstoremesh.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-kvstoremesh.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-agent.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-agent.log new file mode 100644 index 0000000000..d5c76f10b4 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-agent.log @@ -0,0 +1 @@ +# synthetic cilium-agent.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-operator.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-operator.log new file mode 100644 index 0000000000..c404208c5c --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-operator.log @@ -0,0 +1 @@ +# synthetic cilium-operator.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-apiserver.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-apiserver.log new file mode 100644 index 0000000000..ab1ad57a6a --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-apiserver.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-apiserver.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-etcd.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-etcd.log new file mode 100644 index 0000000000..01e52d4c6d --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-etcd.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-etcd.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-kvstoremesh.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-kvstoremesh.log new file mode 100644 index 0000000000..6e347842d5 --- /dev/null +++ 
b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-kvstoremesh.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-kvstoremesh.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/test_clustermesh_scale.py b/modules/python/tests/test_clustermesh_scale.py index 0b9dd7510e..afb42522ac 100644 --- a/modules/python/tests/test_clustermesh_scale.py +++ b/modules/python/tests/test_clustermesh_scale.py @@ -11,11 +11,17 @@ this, downstream Kusto queries cannot group/filter by cluster across the mesh. """ import importlib.util +import io import json import os +import shutil import sys import tempfile +import threading +import time import unittest +from contextlib import redirect_stdout +from glob import glob from pathlib import Path from unittest.mock import patch @@ -41,6 +47,71 @@ os.path.dirname(__file__), "mock_data", "clustermesh-scale", "report" ) +# Files/dirs that run-cl2-on-cluster.sh writes into every per-cluster +# $report_dir. Any new artifact added there MUST be mirrored in +# mock_data/clustermesh-scale/report/mesh-*/ so the local test suite +# exercises the same shape collect_clusterloader2 sees in real runs. +# The TestMockFixtureParity class below enforces this. +EXPECTED_PER_CLUSTER_ARTIFACTS = { + "files": ["junit.xml"], + "file_globs": ["*.json"], + "subdirs": ["logs"], + "logs_files": [ + "clustermesh-apiserver-apiserver.log", + "clustermesh-apiserver-etcd.log", + "clustermesh-apiserver-kvstoremesh.log", + "cilium-agent.log", + "cilium-operator.log", + ], +} + + +class TestMockFixtureParity(unittest.TestCase): + """Mock data must mirror the real run-cl2-on-cluster.sh output layout. + + Without this, collect_clusterloader2 tests can pass against a stale + mock while real runs crash on shapes the mock doesn't include — + exactly the IsADirectoryError on logs/ regression that triggered + adding this guard. 
+ """ + + def _assert_cluster_dir_shape(self, cluster_dir): + for fname in EXPECTED_PER_CLUSTER_ARTIFACTS["files"]: + self.assertTrue( + os.path.isfile(os.path.join(cluster_dir, fname)), + f"{cluster_dir}: missing required file {fname}", + ) + for pattern in EXPECTED_PER_CLUSTER_ARTIFACTS["file_globs"]: + self.assertTrue( + glob(os.path.join(cluster_dir, pattern)), + f"{cluster_dir}: no file matches {pattern}", + ) + for sd in EXPECTED_PER_CLUSTER_ARTIFACTS["subdirs"]: + self.assertTrue( + os.path.isdir(os.path.join(cluster_dir, sd)), + f"{cluster_dir}: missing required subdir {sd}/ " + f"(run-cl2-on-cluster.sh writes this; " + f"keep the mock in sync so collect tests stay realistic)", + ) + log_dir = os.path.join(cluster_dir, "logs") + for lf in EXPECTED_PER_CLUSTER_ARTIFACTS["logs_files"]: + self.assertTrue( + os.path.isfile(os.path.join(log_dir, lf)), + f"{log_dir}: missing log file {lf}", + ) + + def test_mesh_1_mock_matches_engine_output(self): + """mesh-1 mock has the same shape as a real per-cluster report dir.""" + self._assert_cluster_dir_shape(os.path.join(MOCK_REPORT_ROOT, "mesh-1")) + + def test_mesh_2_mock_matches_engine_output(self): + """mesh-2 mock has the same shape as a real per-cluster report dir.""" + self._assert_cluster_dir_shape(os.path.join(MOCK_REPORT_ROOT, "mesh-2")) + + def test_mesh_fail_mock_matches_engine_output(self): + """mesh-fail mock has the same shape as a real per-cluster report dir.""" + self._assert_cluster_dir_shape(os.path.join(MOCK_REPORT_ROOT, "mesh-fail")) + class TestConfigureClustermeshScale(unittest.TestCase): """configure_clusterloader2 writes the CL2 overrides file the pipeline expects.""" @@ -72,7 +143,7 @@ def test_overrides_file_contents(self): # Prometheus pod to the dedicated `prompool` node defined in # azure-2.tfvars (label prometheus=true). self.assertIn("CL2_PROMETHEUS_TOLERATE_MASTER: true", content) - self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi", content) + self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT: 12Gi", content) self.assertIn('CL2_PROMETHEUS_NODE_SELECTOR: "prometheus: \\"true\\""', content) self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true", content) self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true", content) @@ -110,6 +181,872 @@ def test_overrides_file_timeout_passthrough(self): finally: os.remove(tmp_path) + def test_overrides_file_emits_phase4a_pod_churn_defaults(self): + """Every CL2_* knob the pod-churn-{scale,kill}.yaml templates read must + be emitted by configure_clusterloader2, even when not passed explicitly — + so an event-throughput run that omits the churn args still produces + a valid overrides file that pod-churn templates would accept. + + Defaults must match the documented Phase 4a defaults in plan.md. + """ + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + # pod-churn-scale knobs. + self.assertIn("CL2_CHURN_CYCLES: 5", content) + self.assertIn("CL2_CHURN_UP_DURATION: 60s", content) + self.assertIn("CL2_CHURN_DOWN_DURATION: 60s", content) + # pod-churn-kill knobs. 
+ self.assertIn("CL2_KILL_DURATION: 10m", content) + self.assertIn("CL2_KILL_INTERVAL_SECONDS: 10", content) + self.assertIn("CL2_KILL_BATCH: 5", content) + self.assertIn("CL2_KILL_DURATION_SECONDS: 600", content) + # Job deadline must exceed kill_duration so the activeDeadlineSeconds + # safety net never fires before the killer's own time check. + self.assertIn("CL2_KILL_JOB_DEADLINE_SECONDS: 660", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_pod_churn_overrides_passthrough(self): + """Explicit churn args override the defaults in the overrides file.""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + operation_timeout="20m", + override_file=tmp_path, + churn_cycles=3, + churn_up_duration="30s", + churn_down_duration="45s", + kill_duration="5m", + kill_interval_seconds=15, + kill_batch=3, + kill_duration_seconds=300, + kill_job_deadline_seconds=360, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_CHURN_CYCLES: 3", content) + self.assertIn("CL2_CHURN_UP_DURATION: 30s", content) + self.assertIn("CL2_CHURN_DOWN_DURATION: 45s", content) + self.assertIn("CL2_KILL_DURATION: 5m", content) + self.assertIn("CL2_KILL_INTERVAL_SECONDS: 15", content) + self.assertIn("CL2_KILL_BATCH: 3", content) + self.assertIn("CL2_KILL_DURATION_SECONDS: 300", content) + self.assertIn("CL2_KILL_JOB_DEADLINE_SECONDS: 360", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_apiserver_failure_defaults(self): + """Phase 4b — Scenario #4 (APIServer Failure) knobs landed in overrides + with the documented defaults. + + Same unconditional-write pattern as churn knobs: every configure call + writes these keys so a future event-throughput run with this overrides + file still produces a valid (if unused) override set for the apiserver + templates. 
+ """ + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_APISERVER_KILL_TARGET_CONTEXT: clustermesh-1", content) + self.assertIn("CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: 240", content) + self.assertIn("CL2_APISERVER_KILL_OBSERVATION_SECONDS: 60", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_apiserver_failure_overrides_passthrough(self): + """Explicit apiserver-failure args override the defaults.""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + apiserver_kill_target_context="clustermesh-5", + apiserver_kill_recovery_timeout_seconds=180, + apiserver_kill_observation_seconds=90, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_APISERVER_KILL_TARGET_CONTEXT: clustermesh-5", content) + self.assertIn("CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: 180", content) + self.assertIn("CL2_APISERVER_KILL_OBSERVATION_SECONDS: 90", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_ha_config_replicas_default(self): + """ha-config replicas default to 3 (standard k8s HA).""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_HA_CONFIG_REPLICAS: 3", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_ha_config_replicas_passthrough(self): + """Explicit ha_config_replicas overrides the default.""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ha_config_replicas=5, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_HA_CONFIG_REPLICAS: 5", content) + finally: + os.remove(tmp_path) + + +class TestApiserverFailureTimingPickup(unittest.TestCase): + """collect_clusterloader2 appends a row from ApiserverFailureTimings_*.json + if it finds one in the report dir. This is the Phase 4b mechanism for + surfacing the killer script's recorded timestamps into the JSONL — vanilla + process_cl2_reports() doesn't recognize the file pattern. + """ + + def test_timing_file_appends_row(self): + with tempfile.TemporaryDirectory() as tmp: + # Copy the mock report dir so we can add a timing file alongside. 
+ src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + timing_path = os.path.join( + report_dir, "ApiserverFailureTimings_clustermesh-1.json" + ) + with open(timing_path, "w", encoding="utf-8") as f: + json.dump({ + "target_context": "clustermesh-1", + "t0_kill_epoch": 1746000000, + "t1_recovered_epoch": 1746000035, + "recovery_duration_seconds": 35, + "recovered": True, + "killed_pod_name": "clustermesh-apiserver-abc", + "killed_pod_uid": "old-uid", + "replacement_pod_uid": "new-uid", + "note": "ok", + }, f) + + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="apf-test", + run_url="", + result_file=result_file, + test_type="apiserver-failure", + start_timestamp="2026-05-12T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n")] + # At least one ApiserverFailureRecoveryTiming row appended + timing_rows = [ + r for r in lines + if r.get("measurement") == "ApiserverFailureRecoveryTiming" + ] + self.assertEqual(len(timing_rows), 1) + tr = timing_rows[0] + self.assertEqual(tr["group"], "apiserver-failure") + self.assertEqual(tr["test_type"], "apiserver-failure") + self.assertEqual(tr["cluster"], "mesh-1") + self.assertEqual(tr["result"]["unit"], "seconds") + data = tr["result"]["data"] + self.assertEqual(data["target_context"], "clustermesh-1") + self.assertEqual(data["recovery_duration_seconds"], 35) + self.assertTrue(data["recovered"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_no_timing_file_means_no_extra_row(self): + """Non-target clusters skip writing the timing file; collect must not + emit any ApiserverFailureRecoveryTiming row for those clusters. + """ + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-2"), + cloud_info="", + run_id="apf-test-no-timing", + run_url="", + result_file=result_file, + test_type="apiserver-failure", + start_timestamp="2026-05-12T20:00:00Z", + cluster_name="mesh-2", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + timing_rows = [ + r for r in lines + if r.get("measurement") == "ApiserverFailureRecoveryTiming" + ] + self.assertEqual(len(timing_rows), 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestHAConfigScalingTimingPickup(unittest.TestCase): + """collect_clusterloader2 appends a row from HAConfigScalingTimings_*.json + if it finds one in the report dir. ha-config-scaler.sh writes the file + on every cluster (not just target) — mesh-wide HA scaling. 
+ """ + def test_scaling_file_appends_row(self): + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + scaling_path = os.path.join( + report_dir, "HAConfigScalingTimings_clustermesh-1.json" + ) + with open(scaling_path, "w", encoding="utf-8") as f: + json.dump({ + "context": "clustermesh-1", + "action": "scale-up", + "requested_replicas": 3, + "spec_replicas_after": 3, + "ready_replicas_after": 3, + "ha_replicas_honored": True, + "scale_duration_seconds": 42, + "note": "ok", + }, f) + + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="ha-test", + run_url="", + result_file=result_file, + test_type="ha-config", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n")] + scaling_rows = [ + r for r in lines + if r.get("measurement") == "HAConfigScalingTiming" + ] + self.assertEqual(len(scaling_rows), 1) + sr = scaling_rows[0] + self.assertEqual(sr["group"], "ha-config") + self.assertEqual(sr["test_type"], "ha-config") + self.assertEqual(sr["cluster"], "mesh-1") + self.assertEqual(sr["result"]["unit"], "seconds") + data = sr["result"]["data"] + self.assertEqual(data["requested_replicas"], 3) + self.assertEqual(data["spec_replicas_after"], 3) + self.assertTrue(data["ha_replicas_honored"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_no_scaling_file_means_no_extra_row(self): + """Without a scaling JSON, no HAConfigScalingTiming row is emitted + (covers the non-ha-config scenario case, where the scaler isn't run). + """ + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-2"), + cloud_info="", + run_id="ha-test-no-scaling", + run_url="", + result_file=result_file, + test_type="event-throughput", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-2", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + scaling_rows = [ + r for r in lines + if r.get("measurement") == "HAConfigScalingTiming" + ] + self.assertEqual(len(scaling_rows), 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestConfigureNodeChurnKnobs(unittest.TestCase): + """Phase 4b — Scenario #3 (Node Churn / IP Churn) overrides flow through + configure_clusterloader2 and land in the CL2 overrides file with the + expected CL2_NODE_CHURN_* keys. 
+ """ + + def test_node_churn_defaults_emitted(self): + """Defaults match scale.py argparse + node-churner.sh expectations.""" + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_NODE_CHURN_TARGET_CONTEXT: clustermesh-1", content) + self.assertIn("CL2_NODE_CHURN_CYCLES: 3", content) + self.assertIn("CL2_NODE_CHURN_DELTA: 5", content) + self.assertIn("CL2_NODE_CHURN_SETTLE_SECONDS: 60", content) + self.assertIn("CL2_NODE_CHURN_SCALE_DURATION_SECONDS: 1800", content) + self.assertIn("CL2_NODE_CHURN_REPLACE_DURATION_SECONDS: 1500", content) + self.assertIn("CL2_NODE_CHURN_COMBINED_DURATION_SECONDS: 3300", content) + self.assertIn("CL2_NODE_REPLACE_BATCH_SIZE: 10", content) + self.assertIn("CL2_NODE_CHURN_READY_TIMEOUT_SECONDS: 300", content) + finally: + os.remove(tmp_path) + + def test_node_churn_overrides_passthrough(self): + """Explicit kwargs override defaults; per-tier matrix overrides land.""" + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + node_churn_target_context="clustermesh-7", + node_churn_cycles=5, + node_churn_delta=3, + node_churn_settle_seconds=90, + node_churn_scale_duration_seconds=2400, + node_churn_replace_duration_seconds=2000, + node_churn_combined_duration_seconds=4500, + node_replace_batch_size=8, + node_churn_ready_timeout_seconds=180, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_NODE_CHURN_TARGET_CONTEXT: clustermesh-7", content) + self.assertIn("CL2_NODE_CHURN_CYCLES: 5", content) + self.assertIn("CL2_NODE_CHURN_DELTA: 3", content) + self.assertIn("CL2_NODE_CHURN_SETTLE_SECONDS: 90", content) + self.assertIn("CL2_NODE_CHURN_SCALE_DURATION_SECONDS: 2400", content) + self.assertIn("CL2_NODE_CHURN_REPLACE_DURATION_SECONDS: 2000", content) + self.assertIn("CL2_NODE_CHURN_COMBINED_DURATION_SECONDS: 4500", content) + self.assertIn("CL2_NODE_REPLACE_BATCH_SIZE: 8", content) + self.assertIn("CL2_NODE_CHURN_READY_TIMEOUT_SECONDS: 180", content) + finally: + os.remove(tmp_path) + + +class TestNodeChurnTimingPickup(unittest.TestCase): + """collect_clusterloader2 appends one NodeChurnSummary row + one + NodeChurnOpTiming row per op from NodeChurnTimings_*.json. node-churner.sh + writes the file ONLY in the target cluster's report dir (the script runs + on the host, not inside CL2; the file lives in the target's per-cluster + report dir so the existing per-cluster collect pickup works). 
+ """ + + def _write_timing(self, report_dir, target_context, ops=None, + scenario="node-churn-combined", + ready_quorum_reached=True, + scenario_valid=True, cleanup_failed=False, + truncated=False): + ops = ops or [] + path = os.path.join(report_dir, f"NodeChurnTimings_{target_context}.json") + with open(path, "w", encoding="utf-8") as f: + json.dump({ + "scenario": scenario, + "target_context": target_context, + "target_cluster_name": target_context, + "target_resource_group": "test-rg", + "target_nodepool": "default", + "target_node_resource_group": f"MC_test-rg_{target_context}_eastus2", + "target_vmss": "aks-default-12345", + "original_node_count": 20, + "ready_quorum_reached": ready_quorum_reached, + "scenario_valid": scenario_valid, + "cleanup_failed": cleanup_failed, + "truncated": truncated, + "started_epoch": 1746000000, + "ended_epoch": 1746001500, + "duration_seconds": 1500, + "ops": ops, + }, f) + return path + + def test_timing_file_emits_summary_and_op_rows(self): + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + self._write_timing(report_dir, "clustermesh-1", ops=[ + { + "op_index": 1, "op_type": "scale_up", + "start_epoch": 1746000010, "end_epoch": 1746000200, + "duration_seconds": 190, "succeeded": True, + "observed_node_count": 25, + "pre_ip_set": [], "post_ip_set": [], "new_ip_count": 0, + "error": "", + }, + { + "op_index": 2, "op_type": "scale_down", + "start_epoch": 1746000260, "end_epoch": 1746000450, + "duration_seconds": 190, "succeeded": True, + "observed_node_count": 20, + "pre_ip_set": [], "post_ip_set": [], "new_ip_count": 0, + "error": "", + }, + { + "op_index": 3, "op_type": "replace_wait", + "start_epoch": 1746000500, "end_epoch": 1746001100, + "duration_seconds": 600, "succeeded": True, + "observed_node_count": 20, + "pre_ip_set": ["10.1.0.4", "10.1.0.19"], + "post_ip_set": ["10.1.0.4", "10.1.0.19"], + "pre_node_names": ["aks-default-vmss000004", "aks-default-vmss00000j"], + "post_node_names": ["aks-default-vmss000004", "aks-default-vmss00000k"], + "new_ip_count": 0, + "new_node_count": 1, + "error": "", + }, + ]) + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="nc-test", + run_url="", + result_file=result_file, + test_type="node-churn-combined", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(summary), 1) + self.assertEqual(len(ops), 3) + s = summary[0] + self.assertEqual(s["group"], "node-churn-combined") + self.assertEqual(s["test_type"], "node-churn-combined") + self.assertEqual(s["cluster"], "mesh-1") + self.assertEqual(s["result"]["data"]["op_count"], 3) + self.assertEqual(s["result"]["data"]["original_node_count"], 20) + self.assertTrue(s["result"]["data"]["ready_quorum_reached"]) + self.assertTrue(s["result"]["data"]["scenario_valid"]) + # ops sorted by op_index + op_types = [o["result"]["data"]["op_type"] for o in ops] + self.assertEqual(set(op_types), {"scale_up", "scale_down", 
"replace_wait"}) + # scenario-level context merged onto op rows + for op_row in ops: + self.assertEqual(op_row["result"]["data"]["scenario"], "node-churn-combined") + self.assertEqual(op_row["result"]["data"]["target_context"], "clustermesh-1") + # replace_wait op carries IP set + node name deltas. + # Build 67155: new_ip_count is informational (Azure can reuse IPs); + # new_node_count is the authoritative replacement signal. + replace = [o for o in ops if o["result"]["data"]["op_type"] == "replace_wait"][0] + self.assertEqual(replace["result"]["data"]["new_ip_count"], 0) + self.assertEqual(replace["result"]["data"]["new_node_count"], 1, + "node name delta is the authoritative replacement signal") + self.assertIn("aks-default-vmss00000k", + replace["result"]["data"]["post_node_names"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_timing_file_with_empty_ops_emits_summary_only(self): + """Ready-quorum-never-reached case: timing file exists with ops=[], + scenario_valid=false. Summary row still emitted so Kusto can detect + the aborted run; no op rows.""" + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + self._write_timing( + report_dir, "clustermesh-1", ops=[], + ready_quorum_reached=False, scenario_valid=False, + ) + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="nc-test-abort", + run_url="", + result_file=result_file, + test_type="node-churn-scale", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(summary), 1) + self.assertEqual(len(ops), 0) + self.assertFalse(summary[0]["result"]["data"]["ready_quorum_reached"]) + self.assertFalse(summary[0]["result"]["data"]["scenario_valid"]) + self.assertEqual(summary[0]["result"]["data"]["op_count"], 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_timing_file_with_cleanup_failed_marks_summary(self): + """If node-churner finalizer can't restore the pool, cleanup_failed=true. 
+ execute.yml uses this to break the share-infra loop; collect must still + emit the summary row with cleanup_failed=true visible.""" + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + self._write_timing( + report_dir, "clustermesh-1", + ops=[{ + "op_index": 1, "op_type": "scale_up", + "start_epoch": 1746000010, "end_epoch": 1746000200, + "duration_seconds": 190, "succeeded": False, + "observed_node_count": 0, + "pre_ip_set": [], "post_ip_set": [], "new_ip_count": 0, + "error": "OperationNotAllowed", + }], + cleanup_failed=True, scenario_valid=False, + ) + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="nc-test-cleanup", + run_url="", + result_file=result_file, + test_type="node-churn-combined", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + self.assertEqual(len(summary), 1) + self.assertTrue(summary[0]["result"]["data"]["cleanup_failed"]) + # failed op still surfaces with succeeded=false + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(ops), 1) + self.assertFalse(ops[0]["result"]["data"]["succeeded"]) + self.assertIn("OperationNotAllowed", ops[0]["result"]["data"]["error"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_no_timing_file_means_no_node_churn_rows(self): + """Non-target clusters (and non-node-churn scenarios) skip writing + the timing file → no NodeChurnSummary / NodeChurnOpTiming rows.""" + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-2"), + cloud_info="", + run_id="nc-test-no-timing", + run_url="", + result_file=result_file, + test_type="node-churn-scale", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-2", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(summary), 0) + self.assertEqual(len(ops), 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestWriteReadySentinelScript(unittest.TestCase): + """write-ready-sentinel.sh derives a unique context per CL2 invocation + and writes a non-empty sentinel filename. Build 67114 regression: the + original inline `bash -c` Method:Exec returned an empty context name, + causing both clusters to write the same path (ready-) and one to + overwrite the other → barrier saw 1/2 → scenario aborted. + + The fix relies on parsing /root/.kube/config directly (CL2 bind-mounts + the per-cluster kubeconfig there). 
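+ A minimal Python sketch of that parse step (illustrative only, not
+ the script's literal implementation):
+ ctx = next((l.split(":", 1)[1].strip() for l in open(kubeconfig)
+ if l.startswith("current-context:")), "")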
These tests confirm the resolution + chain (kubeconfig-parse > kubectl-PATH > kubectl-prestaged > server-hash + > hostname > pid-fallback) and that the sentinel filename always has + a non-empty suffix. + """ + + SCRIPT_PATH = ( + Path(__file__).resolve().parents[1] + / "clusterloader2" / "clustermesh-scale" / "config" / "write-ready-sentinel.sh" + ) + + def _run_with_kubeconfig(self, kubeconfig_content, td): + import subprocess + kubeconfig = os.path.join(td, "kubeconfig") + with open(kubeconfig, "w", encoding="utf-8") as f: + f.write(kubeconfig_content) + sentinel_dir = os.path.join(td, "sentinels") + os.makedirs(sentinel_dir, exist_ok=True) + env = os.environ.copy() + env["KUBECONFIG"] = kubeconfig + result = subprocess.run( + ["bash", str(self.SCRIPT_PATH), sentinel_dir], + capture_output=True, text=True, env=env, check=False, + timeout=10, + ) + return result, sentinel_dir + + def test_kubeconfig_parse_resolves_current_context(self): + kc = ( + "apiVersion: v1\n" + "clusters:\n" + "- cluster:\n" + " server: https://test1.example.com:443\n" + " name: clustermesh-1\n" + "contexts:\n" + "- context:\n" + " cluster: clustermesh-1\n" + " name: clustermesh-1\n" + "current-context: clustermesh-1\n" + ) + with tempfile.TemporaryDirectory() as td: + result, sentinel_dir = self._run_with_kubeconfig(kc, td) + self.assertEqual(result.returncode, 0, f"stderr={result.stderr}") + files = os.listdir(sentinel_dir) + self.assertEqual(files, ["ready-clustermesh-1"]) + self.assertIn("via kubeconfig-parse", result.stderr) + + def test_different_kubeconfigs_yield_distinct_sentinels(self): + """Build 67114 regression: two clusters MUST NOT write the same + sentinel path (otherwise the second's write silently overwrites + the first, breaking the quorum count).""" + kc1 = "current-context: clustermesh-1\n" + kc2 = "current-context: clustermesh-2\n" + with tempfile.TemporaryDirectory() as td1, tempfile.TemporaryDirectory() as td2: + r1, sd1 = self._run_with_kubeconfig(kc1, td1) + r2, sd2 = self._run_with_kubeconfig(kc2, td2) + self.assertEqual(r1.returncode, 0) + self.assertEqual(r2.returncode, 0) + self.assertEqual(os.listdir(sd1), ["ready-clustermesh-1"]) + self.assertEqual(os.listdir(sd2), ["ready-clustermesh-2"]) + + def test_empty_current_context_falls_back_to_server_hash(self): + """If current-context line is missing/blank, fall back to a hash of + the server URL. 
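+ (The digest algorithm itself is deliberately unpinned — the
+ assertions only require distinct, non-empty suffixes for distinct
+ server URLs.)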
Two different servers MUST yield different hashes.""" + kc1 = ( + "apiVersion: v1\n" + "clusters:\n" + "- cluster:\n" + " server: https://serverA.example.com:443\n" + " name: foo\n" + ) + kc2 = ( + "apiVersion: v1\n" + "clusters:\n" + "- cluster:\n" + " server: https://serverB.example.com:443\n" + " name: foo\n" + ) + with tempfile.TemporaryDirectory() as td1, tempfile.TemporaryDirectory() as td2: + r1, sd1 = self._run_with_kubeconfig(kc1, td1) + r2, sd2 = self._run_with_kubeconfig(kc2, td2) + self.assertEqual(r1.returncode, 0) + self.assertEqual(r2.returncode, 0) + f1 = os.listdir(sd1)[0] + f2 = os.listdir(sd2)[0] + self.assertNotEqual(f1, f2, + f"server-hash collision: {f1} == {f2}") + + def test_sentinel_filename_always_non_empty_suffix(self): + """Whatever the resolution path, the sentinel filename suffix is + never empty (avoids the build 67114 path-collision regression).""" + kc = "" + with tempfile.TemporaryDirectory() as td: + r, sd = self._run_with_kubeconfig(kc, td) + self.assertEqual(r.returncode, 0, f"stderr={r.stderr}") + files = os.listdir(sd) + self.assertEqual(len(files), 1) + self.assertNotEqual(files[0], "ready-", + "sentinel filename has empty suffix — build 67114 regression") + self.assertTrue(files[0].startswith("ready-")) + self.assertGreater(len(files[0]), len("ready-")) + + +class TestNodeChurnerScript(unittest.TestCase): + """node-churner.sh smoke tests — bash -n syntax + arg validation. The + script's full Azure CLI behavior cannot be unit-tested without mocking + the cloud, but its argparse-equivalent + missing-binary fail-soft path + can. + """ + + SCRIPT_PATH = ( + Path(__file__).resolve().parents[1] + / "clusterloader2" / "clustermesh-scale" / "config" / "node-churner.sh" + ) + + def test_script_exists_and_is_executable(self): + self.assertTrue(self.SCRIPT_PATH.exists(), + f"{self.SCRIPT_PATH} should exist") + self.assertTrue( + os.access(self.SCRIPT_PATH, os.X_OK), + f"{self.SCRIPT_PATH} must be executable", + ) + + def test_script_bash_syntax(self): + import subprocess + result = subprocess.run( + ["bash", "-n", str(self.SCRIPT_PATH)], + capture_output=True, text=True, check=False, + ) + self.assertEqual(result.returncode, 0, + f"bash -n failed: stderr={result.stderr}") + + def test_script_aborts_softly_when_az_missing(self): + """When `az` CLI isn't on PATH, the script writes a timing file with + scenario_valid=false instead of erroring out (so execute.yml's + share-infra loop continues to subsequent scenarios with clean data). + """ + import subprocess + with tempfile.TemporaryDirectory() as tmp: + report_dir = os.path.join(tmp, "report") + sentinel_dir = os.path.join(tmp, "sentinels") + os.makedirs(report_dir, exist_ok=True) + os.makedirs(sentinel_dir, exist_ok=True) + env = os.environ.copy() + env["PATH"] = "/usr/bin:/bin" # strip out any az + result = subprocess.run( + [ + "bash", str(self.SCRIPT_PATH), + "node-churn-scale", # scenario + "clustermesh-1", # target cluster name + "test-rg", # target rg + "default", # target nodepool + report_dir, # report dir + sentinel_dir, # sentinel dir + "2", # cluster count + "1", "1", "1", "1", "30", "60", # remaining knobs + ], + capture_output=True, text=True, env=env, check=False, + timeout=30, + ) + # Soft-fail contract: exit 0 even when az is missing. 
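+ # The three checks below pin that contract end-to-end: rc==0, the
+ # timing file still written, and scenario_valid=false so collect
+ # records the aborted attempt as invalid rather than missing data.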
+ self.assertEqual(result.returncode, 0, + f"expected soft-fail (rc=0); got rc={result.returncode}, " + f"stderr={result.stderr}") + timing_file = os.path.join(report_dir, "NodeChurnTimings_clustermesh-1.json") + self.assertTrue(os.path.exists(timing_file), + "timing file should still be written on soft-fail") + with open(timing_file, "r", encoding="utf-8") as f: + data = json.load(f) + self.assertFalse(data["scenario_valid"], + "scenario_valid must be false when az is missing") + class TestCollectSingleCluster(unittest.TestCase): """collect_clusterloader2 emits one JSONL row per call, tagged with cluster identity.""" @@ -221,6 +1158,135 @@ def test_collect_propagates_test_type(self): if os.path.exists(result_file): os.remove(result_file) + def test_collect_records_pod_churn_knobs(self): + """Phase 4a — pod-churn scenarios record churn knobs on every row. + + Spec line 67 ("CPU/memory growth over time") requires historical + comparison across runs with potentially-different churn parameters. + Recording the knobs on the row means a future query for + ``churn_cycles==5 AND kill_batch==5`` returns only directly-comparable + rows. Non-churn test_types default to 0/"" — Kusto-friendly nulls. + """ + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-1"), + cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}), + run_id="test-run-churn", + run_url="http://example.com/runchurn", + result_file=result_file, + test_type="pod-churn-scale", + start_timestamp="2026-04-28T15:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + churn_cycles=5, + churn_up_duration="60s", + churn_down_duration="60s", + kill_duration_seconds=600, + kill_interval_seconds=10, + kill_batch=5, + ) + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + # Top-level fields — Kusto column convenience. + self.assertEqual(row["churn_cycles"], 5) + self.assertEqual(row["kill_duration_seconds"], 600) + self.assertEqual(row["kill_interval_seconds"], 10) + self.assertEqual(row["kill_batch"], 5) + # Nested in test_details for richer queries. + details = row["test_details"] + self.assertEqual(details["churn_cycles"], 5) + self.assertEqual(details["churn_up_duration"], "60s") + self.assertEqual(details["churn_down_duration"], "60s") + self.assertEqual(details["kill_duration_seconds"], 600) + self.assertEqual(details["kill_interval_seconds"], 10) + self.assertEqual(details["kill_batch"], 5) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_pod_churn_knobs_default_to_zero_for_non_churn_runs(self): + """Non-churn collect calls omit the churn knobs; defaults must be 0/"" + so the JSONL row is still schema-stable for Kusto (no missing fields). 
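+ e.g. an event-throughput row still carries churn_cycles=0,
+ kill_batch=0 and churn_up_duration="" rather than omitting the keys.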
+ """ + result_file = self._collect(cluster_name="mesh-1", test_type="event-throughput") + try: + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + self.assertEqual(row["churn_cycles"], 0) + self.assertEqual(row["kill_duration_seconds"], 0) + self.assertEqual(row["kill_interval_seconds"], 0) + self.assertEqual(row["kill_batch"], 0) + self.assertEqual(row["test_details"]["churn_up_duration"], "") + self.assertEqual(row["test_details"]["churn_down_duration"], "") + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_skips_any_subdir_under_report_dir(self): + """process_cl2_reports open()s every dir entry, so ANY subdir trips it. + + Today only logs/ exists (pod log capture from run-cl2-on-cluster.sh). + Tomorrow could be phase-logs/ from a CL2 version bump, additional + diag dumps, etc. collect_clusterloader2 must stash every subdir + outside the report dir during the parse and restore each one + afterward so the pipeline-level artifact publish still picks them up. + """ + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + with tempfile.TemporaryDirectory() as tmp: + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + # mesh-1 fixture already ships logs/; add two more synthetic + # subdirs to lock in the "skip ALL subdirs" contract. + extra_subdirs = { + "phase-logs": "phase-0.log", + "diag-dump": "events.txt", + } + for sd, fname in extra_subdirs.items(): + sd_path = os.path.join(report_dir, sd) + os.makedirs(sd_path, exist_ok=True) + with open(os.path.join(sd_path, fname), "w", encoding="utf-8") as f: + f.write(f"synthetic {sd}/{fname}\n") + + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}), + run_id="test-run-subdirs", + run_url="http://example.com/runsubdirs", + result_file=result_file, + test_type="unit-test", + start_timestamp="2026-04-28T15:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + trigger_reason="Manual", + ) + self.assertTrue(os.path.exists(result_file)) + with open(result_file, "r", encoding="utf-8") as f: + self.assertGreater(len(f.read()), 0) + # All three subdirs (mock logs/ + 2 synthetic) restored + # at original location with contents intact. + self.assertTrue(os.path.isdir(os.path.join(report_dir, "logs"))) + for sd, fname in extra_subdirs.items(): + self.assertTrue(os.path.isdir(os.path.join(report_dir, sd)), + f"{sd}/ missing after collect") + nested = os.path.join(report_dir, sd, fname) + self.assertTrue(os.path.isfile(nested), + f"{nested} missing after collect") + finally: + if os.path.exists(result_file): + os.remove(result_file) + class TestCollectMultiCluster(unittest.TestCase): """The multi-cluster aggregation invariant — the reason this scenario exists. 
@@ -339,7 +1405,34 @@ def test_configure_command_parsing(self, mock_configure): ] with patch.object(sys, "argv", test_args): main() - mock_configure.assert_called_once_with(2, 3, 4, "20m", "/tmp/overrides.yaml") + mock_configure.assert_called_once_with( + 2, 3, 4, "20m", "/tmp/overrides.yaml", + churn_cycles=5, + churn_up_duration="60s", + churn_down_duration="60s", + kill_duration="10m", + kill_interval_seconds=10, + kill_batch=5, + kill_duration_seconds=600, + kill_job_deadline_seconds=660, + apiserver_kill_target_context="clustermesh-1", + apiserver_kill_recovery_timeout_seconds=240, + apiserver_kill_observation_seconds=60, + ha_config_replicas=3, + node_churn_target_context="clustermesh-1", + node_churn_cycles=3, + node_churn_delta=5, + node_churn_settle_seconds=60, + node_churn_scale_duration_seconds=1800, + node_churn_replace_duration_seconds=1500, + node_churn_combined_duration_seconds=3300, + node_replace_batch_size=10, + node_churn_ready_timeout_seconds=300, + saturation_qps_list="100,500,1500,4000,10000", + saturation_restarts_list="2,4,8,15,25", + saturation_rung_duration_seconds=240, + saturation_settle_seconds=90, + ) @patch.object(clustermesh_scale_module, "execute_clusterloader2") def test_execute_command_parsing(self, mock_execute): @@ -363,6 +1456,7 @@ def test_execute_command_parsing(self, mock_execute): "config.yaml", "/path/to/kubeconfig", "aks", + tear_down_prometheus=False, ) @patch.object(clustermesh_scale_module, "collect_clusterloader2") @@ -403,7 +1497,1182 @@ def test_collect_command_parsing(self, mock_collect): 1, 1, "Manual", + churn_cycles=0, + churn_up_duration="", + churn_down_duration="", + kill_duration_seconds=0, + kill_interval_seconds=0, + kill_batch=0, + saturation_qps_list="", + saturation_restarts_list="", + ) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_command_parsing(self, mock_exec_parallel): + """`execute-parallel` subcommand wires CLI args through and exits with returned rc.""" + mock_exec_parallel.return_value = 0 + test_args = [ + "clustermesh-scale/scale.py", + "execute-parallel", + "--clusters", "/tmp/clusters.json", + "--max-concurrent", "3", + "--worker-script", "/path/to/run-cl2-on-cluster.sh", + "--cl2-image", "ghcr.io/azure/clusterloader2:v20250513", + "--cl2-config-dir", "/path/to/config", + "--cl2-config-file", "config.yaml", + "--cl2-report-dir-base", "/path/to/results", + "--provider", "aks", + "--python-script-file", "/path/to/scale.py", + "--python-workdir", "/path/to/modules/python", + ] + with patch.object(sys, "argv", test_args): + with self.assertRaises(SystemExit) as cm: + main() + self.assertEqual(cm.exception.code, 0) + mock_exec_parallel.assert_called_once_with( + clusters_file="/tmp/clusters.json", + max_concurrent=3, + worker_script="/path/to/run-cl2-on-cluster.sh", + cl2_image="ghcr.io/azure/clusterloader2:v20250513", + cl2_config_dir="/path/to/config", + cl2_config_file="config.yaml", + cl2_report_dir_base="/path/to/results", + provider="aks", + python_script_file="/path/to/scale.py", + python_workdir="/path/to/modules/python", + tear_down_prometheus=False, + ) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_default_max_concurrent_is_4(self, mock_exec_parallel): + """Default --max-concurrent matches the plan.md Phase 3 spec value (4).""" + mock_exec_parallel.return_value = 0 + test_args = [ + "clustermesh-scale/scale.py", + "execute-parallel", + "--clusters", "/tmp/c.json", + "--worker-script", "/w.sh", + "--cl2-image", "img", + 
"--cl2-config-dir", "/cfg", + "--cl2-config-file", "config.yaml", + "--cl2-report-dir-base", "/r", + "--provider", "aks", + "--python-script-file", "/s.py", + "--python-workdir", "/wd", + ] + with patch.object(sys, "argv", test_args): + with self.assertRaises(SystemExit): + main() + self.assertEqual(mock_exec_parallel.call_args.kwargs["max_concurrent"], 4) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_propagates_nonzero_exit(self, mock_exec_parallel): + """If execute_parallel returns nonzero, main() exits nonzero so the AzDO step fails.""" + mock_exec_parallel.return_value = 1 + test_args = [ + "clustermesh-scale/scale.py", + "execute-parallel", + "--clusters", "/tmp/c.json", + "--worker-script", "/w.sh", + "--cl2-image", "img", + "--cl2-config-dir", "/cfg", + "--cl2-config-file", "config.yaml", + "--cl2-report-dir-base", "/r", + "--provider", "aks", + "--python-script-file", "/s.py", + "--python-workdir", "/wd", + ] + with patch.object(sys, "argv", test_args): + with self.assertRaises(SystemExit) as cm: + main() + self.assertEqual(cm.exception.code, 1) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_tear_down_prometheus_flag(self, mock_exec_parallel): + """--tear-down-prometheus flag flows through to execute_parallel. + + Used by share-infra mode (multiple scenarios per provision/destroy + lifecycle) so each scenario's CL2 invocation deploys a fresh + Prometheus stack rather than colliding with the previous scenario's + leftover Prom resources. + """ + mock_exec_parallel.return_value = 0 + test_args_off = [ + "clustermesh-scale/scale.py", "execute-parallel", + "--clusters", "/tmp/c.json", "--worker-script", "/w.sh", + "--cl2-image", "img", "--cl2-config-dir", "/cfg", + "--cl2-config-file", "config.yaml", "--cl2-report-dir-base", "/r", + "--provider", "aks", "--python-script-file", "/s.py", "--python-workdir", "/wd", + ] + with patch.object(sys, "argv", test_args_off): + with self.assertRaises(SystemExit): + main() + self.assertEqual( + mock_exec_parallel.call_args.kwargs["tear_down_prometheus"], False) + + mock_exec_parallel.reset_mock() + with patch.object(sys, "argv", test_args_off + ["--tear-down-prometheus"]): + with self.assertRaises(SystemExit): + main() + self.assertEqual( + mock_exec_parallel.call_args.kwargs["tear_down_prometheus"], True) + + +class _FakePopen: + """Test double for subprocess.Popen used in execute_parallel tests. + + Records construction args, fakes a streamable stdout, sleeps inside wait() + to force temporal overlap (so concurrency tests can observe max_active), + and decrements an active counter on wait so the parent observes correct + in-flight counts. + + Class attributes (lock, counters, instances) are intentionally public — + the class itself is "private" via the leading underscore, and tests + inspect this state directly to assert concurrency invariants. + """ + + # Class-level state mutated across instances by the test runner. 
+ lock = threading.Lock() + active_now = 0 + max_active = 0 + instances = [] # list of FakePopen instances created + wait_seconds = 0.05 # how long each fake CL2 "runs" in wait() + # Per-role configuration: role -> (stdout_lines, exit_code) + role_config = {} + default_exit = 0 + default_stdout = [] + + @classmethod + def reset(cls, *, wait_seconds=0.05, role_config=None, + default_stdout=None, default_exit=0): + cls.active_now = 0 + cls.max_active = 0 + cls.instances = [] + cls.wait_seconds = wait_seconds + cls.role_config = role_config or {} + cls.default_stdout = default_stdout or [] + cls.default_exit = default_exit + + def __init__(self, args, **kwargs): + # args is e.g. ["bash", worker_script, role, kubeconfig, ...] + self.args = args + self.kwargs = kwargs + self.returncode = None + self.role = args[2] if len(args) >= 3 else None + lines, exit_code = self.__class__.role_config.get( + self.role, (self.__class__.default_stdout, self.__class__.default_exit) + ) + # Provide an iterator over the staged lines so `for line in proc.stdout` + # in _run_one_cluster yields them once. + self.stdout = iter(lines) + self.exit_code = exit_code + with self.__class__.lock: + self.__class__.instances.append(self) + self.__class__.active_now += 1 + self.__class__.max_active = max( + self.__class__.max_active, self.__class__.active_now + ) + + def wait(self, timeout=None): # pylint: disable=unused-argument + # Sleep so peer workers have a chance to enter wait() concurrently. + # Without this overlap window, the test couldn't distinguish parallel + # execution from sequential. + time.sleep(self.__class__.wait_seconds) + with self.__class__.lock: + self.__class__.active_now -= 1 + self.returncode = self.exit_code + return self.exit_code + + def terminate(self): + # No-op for tests — execute_parallel only terminates on signal, + # which we don't trigger from these tests. + pass + + +class TestExecuteParallel(unittest.TestCase): + """execute_parallel fans out CL2 across N clusters with bounded concurrency. + + Validates the contract per plan.md Phase 3: bounded concurrent CL2 + invocations, per-cluster pass/fail aggregation, AzDO ##vso service + messages preserved without [role] prefix, sensible validation errors. + """ + + def setUp(self): + # Replace signal install with a no-op — installing real handlers in + # unit tests can interact badly with pytest's signal handling. 
+ self._signal_patcher = patch.object( + clustermesh_scale_module, "_install_parallel_signal_handlers", lambda: None + ) + self._signal_patcher.start() + + def tearDown(self): + self._signal_patcher.stop() + + def _write_clusters(self, clusters): + path = tempfile.mktemp(suffix=".json") + with open(path, "w", encoding="utf-8") as f: + json.dump(clusters, f) + return path + + def _call_execute_parallel(self, clusters_file, max_concurrent=4): + return clustermesh_scale_module.execute_parallel( + clusters_file=clusters_file, + max_concurrent=max_concurrent, + worker_script="/path/to/run-cl2-on-cluster.sh", + cl2_image="img", + cl2_config_dir="/cfg", + cl2_config_file="config.yaml", + cl2_report_dir_base="/r", + provider="aks", + python_script_file="/scale.py", + python_workdir="/wd", + ) + + def test_dispatches_one_subprocess_per_cluster(self): + """N clusters → N Popen calls, each carrying that cluster's role + kubeconfig.""" + clusters = [ + {"role": "mesh-1", "kubeconfig": "/home/.kube/mesh-1.config"}, + {"role": "mesh-2", "kubeconfig": "/home/.kube/mesh-2.config"}, + {"role": "mesh-3", "kubeconfig": "/home/.kube/mesh-3.config"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + self.assertEqual(len(_FakePopen.instances), 3) + # Each invocation passes role + kubeconfig in the bash worker arg + # vector. args layout: ["bash", worker_script, role, kubeconfig, + # report_dir, cl2_image, cl2_config_dir, cl2_config_file, provider, + # python_script_file, python_workdir] + roles_seen = {p.args[2] for p in _FakePopen.instances} + self.assertEqual(roles_seen, {"mesh-1", "mesh-2", "mesh-3"}) + for p in _FakePopen.instances: + role = p.args[2] + self.assertEqual(p.args[3], f"/home/.kube/{role}.config") + # report_dir is base/role + self.assertEqual(p.args[4], f"/r/{role}") + finally: + os.remove(cf) + + def test_all_zero_exit_codes_yield_overall_success(self): + """If every per-cluster worker exits 0, execute_parallel returns 0.""" + clusters = [ + {"role": "mesh-1", "kubeconfig": "/k1"}, + {"role": "mesh-2", "kubeconfig": "/k2"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0, default_exit=0) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + finally: + os.remove(cf) + + def test_any_nonzero_exit_yields_overall_failure(self): + """If ANY per-cluster worker exits non-zero, execute_parallel returns 1. + + Mirrors the sequential bash behavior (`if failures > 0; exit 1`) so + the AzDO step's pass/fail signal is unchanged from before parallel + fan-out. Other clusters still complete (no early cancellation). + """ + clusters = [ + {"role": "mesh-1", "kubeconfig": "/k1"}, + {"role": "mesh-2", "kubeconfig": "/k2"}, + {"role": "mesh-3", "kubeconfig": "/k3"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset( + wait_seconds=0, + role_config={ + "mesh-1": ([], 0), + "mesh-2": ([], 1), # this one fails + "mesh-3": ([], 0), + }, + ) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 1) + # All three workers ran — failure of one does NOT cancel the others. 
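+ # (role_config staged mesh-2 with exit code 1 while mesh-1/mesh-3
+ # exit 0, so rc==1 above proves the OR-style aggregation.)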
+ self.assertEqual(len(_FakePopen.instances), 3) + finally: + os.remove(cf) + + def test_respects_max_concurrent_bound(self): + """No more than max_concurrent workers are in-flight simultaneously. + + Uses a barrier-free approach: each FakePopen sleeps in wait(); we + observe the running max_active count maintained inside FakePopen. + Asserts max_active <= max_concurrent regardless of timing — no + ordering or wall-clock assertion (which would be flaky under CI load). + """ + clusters = [{"role": f"mesh-{i}", "kubeconfig": f"/k{i}"} for i in range(8)] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0.05) # 50ms per "CL2 run" + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf, max_concurrent=3) + self.assertEqual(rc, 0) + self.assertEqual(len(_FakePopen.instances), 8) + # The bound is the contract: never more than 3 concurrent CL2 + # docker containers from this orchestrator at once. + self.assertLessEqual(_FakePopen.max_active, 3) + # Sanity: with 8 work items and 50ms each, we WILL see >1 in + # flight — otherwise the test would pass trivially with a + # single-threaded executor. + self.assertGreater(_FakePopen.max_active, 1) + finally: + os.remove(cf) + + def test_prefixes_role_but_preserves_vso_service_messages(self): + """Worker stdout lines get [role] prefix; ##vso AzDO messages stay verbatim. + + AzDO recognizes ##vso[...] service messages only at column 0 — a + [role] prefix would silently drop the structured annotation + (warnings, errors, set-variable). Regression-guard: if the prefix + logic ever changes, this test breaks loudly. + """ + clusters = [{"role": "mesh-1", "kubeconfig": "/k1"}] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset( + wait_seconds=0, + role_config={ + "mesh-1": ([ + "hello world\n", + "##vso[task.logissue type=warning;]something\n", + "more text\n", + ], 0), + }, + ) + buf = io.StringIO() + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + with redirect_stdout(buf): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + captured = buf.getvalue() + # Non-vso lines are prefixed with [role]. + self.assertIn("[mesh-1] hello world", captured) + self.assertIn("[mesh-1] more text", captured) + # vso line MUST NOT be prefixed. + self.assertIn("##vso[task.logissue type=warning;]something", captured) + self.assertNotIn("[mesh-1] ##vso", captured) + finally: + os.remove(cf) + + def test_empty_clusters_file_raises(self): + """A clusters file with [] is invalid — fail fast, don't silently no-op.""" + cf = self._write_clusters([]) + try: + with self.assertRaises(ValueError): + self._call_execute_parallel(cf) + finally: + os.remove(cf) + + def test_cluster_missing_kubeconfig_raises(self): + """Each cluster object must carry both 'role' and 'kubeconfig'.""" + cf = self._write_clusters([{"role": "mesh-1"}]) + try: + with self.assertRaises(ValueError): + self._call_execute_parallel(cf) + finally: + os.remove(cf) + + def test_max_concurrent_zero_raises(self): + """max_concurrent < 1 is meaningless and would deadlock the executor.""" + cf = self._write_clusters([{"role": "mesh-1", "kubeconfig": "/k1"}]) + try: + with self.assertRaises(ValueError): + self._call_execute_parallel(cf, max_concurrent=0) + finally: + os.remove(cf) + + def test_extra_fields_in_cluster_object_are_ignored(self): + """Pipeline writes name/rg/kubeconfig/role; execute_parallel must tolerate extras. 
+ + Same JSON file is consumed by collect.yml (which uses name/rg/role), + so execute_parallel must NOT reject the additional fields. + """ + clusters = [ + {"role": "mesh-1", "kubeconfig": "/k1", "name": "aks-1", "rg": "rg-1"}, + {"role": "mesh-2", "kubeconfig": "/k2", "name": "aks-2", "rg": "rg-2"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + self.assertEqual(len(_FakePopen.instances), 2) + finally: + os.remove(cf) + + +# ============================================================================ +# Phase 4b — Scenario #6 (Upper Bound / Saturation) tests +# ============================================================================ + + +SATURATION_THRESHOLDS = clustermesh_scale_module.SATURATION_THRESHOLDS +SATURATION_CLASSIFIER_VERSION = clustermesh_scale_module.SATURATION_CLASSIFIER_VERSION + + +def _write_metric_file(report_dir, metric_name, suffix, metrics, fmt="prod", shape="cl2"): + """Write a CL2-shaped GenericPrometheusQuery JSON. + + Two AXES of variation: + + **Filename format** (`fmt`): + "prod" — build 67211+ production filename format: + `GenericPrometheusQuery __.json` + "compact" — legacy/mock filename with no spaces: + `GenericPrometheusQuery___.json` + + **Content shape** (`shape`): + "cl2" — build 67224 verified — one dataItem with named metric keys + in `data`, scalar values: + {"dataItems": [{"data": {"Max": 0, "Perc99": 0.5}, "unit": "#"}]} + "labels" — legacy / PodStartupLatency-style — one dataItem per + metric label, with `data.value` carrying the scalar: + {"dataItems": [{"labels": {"Metric": "Perc99"}, "data": {"value": 0.5}}]} + + Defaults to fmt="prod", shape="cl2" — what real CL2 emits today. + """ + if fmt == "prod": + fname = ( + f"GenericPrometheusQuery {metric_name} {suffix}_" + f"saturation-test_2026-05-14T00:00:00Z.json" + ) + elif fmt == "compact": + compact = metric_name.replace(" ", "") + fname = ( + f"GenericPrometheusQuery_{compact}{suffix}_" + f"saturation-test_2026-05-14T00:00:00Z.json" + ) + else: + raise ValueError(f"unknown fmt: {fmt!r}") + if shape == "cl2": + data_items = [{"data": dict(metrics), "unit": "#"}] + elif shape == "labels": + data_items = [ + {"labels": {"Metric": label}, "data": {"value": value}} + for label, value in metrics.items() + ] + else: + raise ValueError(f"unknown shape: {shape!r}") + path = os.path.join(report_dir, fname) + with open(path, "w", encoding="utf-8") as f: + json.dump({"version": "v1", "dataItems": data_items}, f) + return path + + +class TestConfigureSaturationKnobs(unittest.TestCase): + """Phase 4b — Scenario #6 saturation overrides flow through + configure_clusterloader2 and land in the CL2 overrides file with the + expected CL2_SATURATION_* keys. 
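+ e.g. the defaults are expected to serialize as:
+ CL2_SATURATION_QPS_LIST: "100,500,1500,4000,10000"
+ CL2_SATURATION_RUNG_DURATION_SECONDS: 240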
+ """ + + def test_saturation_defaults_emitted(self): + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn('CL2_SATURATION_QPS_LIST: "100,500,1500,4000,10000"', content) + self.assertIn('CL2_SATURATION_RESTARTS_LIST: "2,4,8,15,25"', content) + self.assertIn("CL2_SATURATION_RUNG_DURATION_SECONDS: 240", content) + self.assertIn("CL2_SATURATION_SETTLE_SECONDS: 90", content) + finally: + os.remove(tmp_path) + + def test_saturation_overrides_passthrough(self): + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + saturation_qps_list="50,100,200,400,800", + saturation_restarts_list="1,1,2,3,5", + saturation_rung_duration_seconds=240, + saturation_settle_seconds=90, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn('CL2_SATURATION_QPS_LIST: "50,100,200,400,800"', content) + self.assertIn('CL2_SATURATION_RESTARTS_LIST: "1,1,2,3,5"', content) + self.assertIn("CL2_SATURATION_RUNG_DURATION_SECONDS: 240", content) + self.assertIn("CL2_SATURATION_SETTLE_SECONDS: 90", content) + finally: + os.remove(tmp_path) + + def test_saturation_classifier_constants_exposed(self): + """SATURATION_THRESHOLDS + SATURATION_CLASSIFIER_VERSION must be + importable so dashboards (and these tests) can reference them. If + the schema changes, the version string must change too.""" + self.assertEqual(SATURATION_CLASSIFIER_VERSION, "saturation-v1") + for k in ( + "latency_p99_ms", "queue_size_perc99", "apiserver_max_cpu_cores", + "mesh_failure_rate_max", "etcd_commit_p99_ms", + ): + self.assertIn(k, SATURATION_THRESHOLDS) + self.assertGreater(SATURATION_THRESHOLDS[k], 0) + + +class TestSaturationClassifier(unittest.TestCase): + """Phase 4b — Scenario #6 classifier emits per-rung verdicts + + per-cluster summary rows. Synthetic per-rung mock data exercises + each verdict path. 
+ """ + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + self.report_dir = os.path.join(self.tmpdir, "mesh-1") + shutil.copytree(os.path.join(MOCK_REPORT_ROOT, "mesh-1"), self.report_dir) + self.result_file = tempfile.mktemp(suffix=".jsonl") + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + if os.path.exists(self.result_file): + os.remove(self.result_file) + + def _write_clean_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.020}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 5, "Perc99": 3}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.01}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.005}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 15}, + ) + + def _write_latency_tripped_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.900}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 10, "Perc99": 5}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.4, "TotalMax": 0.4, "TotalAvg": 0.3}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.02}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.010}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 50}, + ) + + def _write_queue_unbounded_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.100}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 8000, "Perc99": 5000}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.5, "TotalMax": 0.5, "TotalAvg": 0.4}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.02}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.020}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 200}, + ) + + def _write_cpu_exhaust_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.200}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 50, "Perc99": 30}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 2.5, "TotalMax": 2.5, "TotalAvg": 2.0}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.05}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.050}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 80}, + ) + + def 
_run_collect(self, qps_list, restarts_list=None): + if restarts_list is None: + restarts_list = ",".join(["1"] * len(qps_list.split(","))) + collect_clusterloader2( + cl2_report_dir=self.report_dir, + cloud_info="", + run_id="sat-test", + run_url="", + result_file=self.result_file, + test_type="upper-bound", + start_timestamp="2026-05-14T00:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + saturation_qps_list=qps_list, + saturation_restarts_list=restarts_list, + ) + with open(self.result_file, "r", encoding="utf-8") as f: + return [json.loads(l) for l in f.read().strip().split("\n") if l] + + def test_classifier_no_op_when_qps_list_empty(self): + """Non-upper-bound runs leave saturation_qps_list empty → no + SaturationRung / SaturationSummary rows.""" + collect_clusterloader2( + cl2_report_dir=self.report_dir, + cloud_info="", + run_id="sat-noop", + run_url="", + result_file=self.result_file, + test_type="event-throughput", + start_timestamp="2026-05-14T00:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(self.result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + rungs = [r for r in lines if r.get("measurement") == "SaturationRung"] + summaries = [r for r in lines if r.get("measurement") == "SaturationSummary"] + self.assertEqual(len(rungs), 0) + self.assertEqual(len(summaries), 0) + + def test_all_clean_rungs_max_clean_qps_is_highest(self): + for r in range(3): + self._write_clean_rung(r) + lines = self._run_collect("20,40,80") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"] + self.assertEqual(len(rungs), 3) + self.assertEqual(len(summary), 1) + for r in rungs: + self.assertEqual(r["result"]["data"]["verdict"], "clean") + self.assertTrue(r["result"]["data"]["rung_completed"]) + self.assertEqual(r["result"]["data"]["measurement_missing"], []) + s = summary[0]["result"]["data"] + self.assertEqual(s["max_clean_qps"], 80) + self.assertEqual(s["rungs_completed"], 3) + self.assertEqual(s["rungs_configured"], 3) + self.assertIsNone(s["first_failure_rung_index"]) + self.assertIsNone(s["first_failure_mode"]) + self.assertEqual(s["classifier_version"], SATURATION_CLASSIFIER_VERSION) + + def test_latency_spike_verdict(self): + self._write_clean_rung(0) + self._write_latency_tripped_rung(1) + lines = self._run_collect("20,40") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + self.assertEqual(rungs[0]["result"]["data"]["verdict"], "clean") + self.assertEqual(rungs[1]["result"]["data"]["verdict"], "latency_spike") + self.assertAlmostEqual( + rungs[1]["result"]["data"]["dominant_signal_ratio"], 1.8, places=2, + ) + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["max_clean_qps"], 20) + self.assertEqual(s["first_failure_rung_index"], 1) + self.assertEqual(s["first_failure_qps"], 40) + self.assertEqual(s["first_failure_mode"], "latency_spike") + self.assertIsNone(s["second_failure_mode"]) + + def test_queue_unbounded_verdict(self): + 
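+ """Queue Perc99 5000 dominates — the asserted ratio of 5.0 implies
+ a 1000-entry queue_size_perc99 threshold."""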
self._write_clean_rung(0) + self._write_queue_unbounded_rung(1) + lines = self._run_collect("20,40") + rung1 = next( + r for r in lines + if r.get("measurement") == "SaturationRung" + and r["result"]["data"]["rung_index"] == 1 + ) + self.assertEqual(rung1["result"]["data"]["verdict"], "queue_unbounded") + self.assertAlmostEqual( + rung1["result"]["data"]["dominant_signal_ratio"], 5.0, places=2, + ) + + def test_cpu_exhaust_verdict(self): + self._write_clean_rung(0) + self._write_cpu_exhaust_rung(1) + lines = self._run_collect("20,40") + rung1 = next( + r for r in lines + if r.get("measurement") == "SaturationRung" + and r["result"]["data"]["rung_index"] == 1 + ) + self.assertEqual(rung1["result"]["data"]["verdict"], "cpu_exhaust") + self.assertAlmostEqual( + rung1["result"]["data"]["dominant_signal_ratio"], 2.5 / 1.5, + places=2, + ) + + def test_second_failure_mode_tracking(self): + """Rung 0 clean, rung 1 latency, rung 2 cpu_exhaust → first=latency_spike, + second=cpu_exhaust. Same-mode subsequent failures don't overwrite second.""" + self._write_clean_rung(0) + self._write_latency_tripped_rung(1) + self._write_cpu_exhaust_rung(2) + lines = self._run_collect("20,40,80") + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["first_failure_mode"], "latency_spike") + self.assertEqual(s["second_failure_mode"], "cpu_exhaust") + self.assertEqual(s["first_failure_qps"], 40) + + def test_max_clean_qps_is_contiguous_prefix(self): + """If a non-clean rung lands then a later 'clean' rung shows up, + max_clean_qps does NOT extend past the first failure.""" + self._write_clean_rung(0) + self._write_clean_rung(1) + self._write_latency_tripped_rung(2) + self._write_clean_rung(3) + lines = self._run_collect("20,40,80,160") + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["max_clean_qps"], 40) + self.assertEqual(s["first_failure_rung_index"], 2) + self.assertEqual(s["first_failure_mode"], "latency_spike") + + def test_missing_measurements_flag_incomplete_rung(self): + """If a rung's measurement files are missing, measurement_missing + lists the gaps. 
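+ (Only the latency file is written below; the queue, CPU,
+ failure-rate and etcd signals are deliberately absent and must
+ each be reported.)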
Latency present → rung_completed still true.""" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + "Rung0", {"Perc99": 0.020}, + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertTrue(d["rung_completed"]) + self.assertIn("queue_size_perc99", d["measurement_missing"]) + self.assertIn("apiserver_max_cpu_cores", d["measurement_missing"]) + self.assertIn("mesh_failure_rate_max", d["measurement_missing"]) + self.assertIn("etcd_commit_p99_ms", d["measurement_missing"]) + + def test_rung_completed_false_when_latency_missing(self): + """Latency is the gating signal — without it, rung is incomplete + regardless of how many other signals landed.""" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + "Rung0", {"Max": 5, "Perc99": 3}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + "Rung0", {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + "Rung0", {"Max": 0.01}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + "Rung0", {"Perc99": 0.005}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + "Rung0", {"Perc99": 15}, + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + self.assertFalse(rung["result"]["data"]["rung_completed"]) + self.assertIn("latency_p99_ms", rung["result"]["data"]["measurement_missing"]) + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + self.assertEqual(summary["result"]["data"]["rungs_completed"], 0) + + def test_summary_carries_classifier_metadata(self): + """SaturationSummary records classifier_version + thresholds so + dashboards can recompute verdicts post-hoc.""" + self._write_clean_rung(0) + lines = self._run_collect("20") + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["classifier_version"], SATURATION_CLASSIFIER_VERSION) + self.assertEqual(s["thresholds"], SATURATION_THRESHOLDS) + self.assertEqual(s["configured_qps_list"], [20]) + self.assertEqual(s["configured_restarts_list"], [1]) + + def test_rung_row_carries_raw_signal_values(self): + """SaturationRung records raw signal values + all per-criterion + ratios so the classifier can be re-run post-hoc at different + thresholds without re-collecting from CL2.""" + self._write_latency_tripped_rung(0) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 900.0, places=1) + self.assertAlmostEqual(d["signals"]["apiserver_max_cpu_cores"], 0.4, places=2) + self.assertIn("latency_spike", d["all_verdicts"]) + self.assertIn("cpu_exhaust", d["all_verdicts"]) + + def test_malformed_qps_list_skips_classifier_gracefully(self): + """Malformed CL2_SATURATION_QPS_LIST should not crash collect; the + classifier logs a warning and emits zero saturation rows.""" + self._write_latency_tripped_rung(0) + collect_clusterloader2( + cl2_report_dir=self.report_dir, + cloud_info="", + run_id="sat-malformed", + run_url="", + result_file=self.result_file, + test_type="upper-bound", + start_timestamp="2026-05-14T00:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, 
+ namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + saturation_qps_list="20,not-a-number,80", + saturation_restarts_list="1,2,3", + ) + with open(self.result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + rungs = [r for r in lines if r.get("measurement") == "SaturationRung"] + summaries = [r for r in lines if r.get("measurement") == "SaturationSummary"] + self.assertEqual(len(rungs), 0) + self.assertEqual(len(summaries), 0) + + def test_restarts_list_padded_when_shorter_than_qps(self): + """If restarts_list is shorter than qps_list, missing entries + default to 1 so the classifier doesn't crash.""" + self._write_clean_rung(0) + self._write_clean_rung(1) + self._write_clean_rung(2) + lines = self._run_collect("20,40,80", restarts_list="1,2") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + self.assertEqual(rungs[0]["result"]["data"]["configured_restarts"], 1) + self.assertEqual(rungs[1]["result"]["data"]["configured_restarts"], 2) + self.assertEqual(rungs[2]["result"]["data"]["configured_restarts"], 1) + + def test_monitoring_oom_verdict_when_prom_dies_mid_run(self): + """Phase 4b — Scenario #6 monitoring_oom verdict (added 2026-05-15 + after build 67279). When an earlier rung successfully completed but + a later rung has zero signals, the most likely explanation is the + Prometheus stack OOM'ed under load. That IS a saturation finding + per spec line 113 ('Resource exhaustion occurs') so we record it + as verdict=monitoring_oom rather than silently leaving it as + verdict=clean rung_completed=False (which underclaims the failure). + """ + # Rung 0: clean (Prom alive, all signals land) + self._write_clean_rung(0) + # Rung 1: NOTHING — Prom crashed mid-run before its gather phase + # (no files written for this rung). Classifier should detect + # "previous rung had signals, this one doesn't → monitoring_oom". + lines = self._run_collect("20,40") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + self.assertEqual(rungs[0]["result"]["data"]["verdict"], "clean") + self.assertEqual(rungs[1]["result"]["data"]["verdict"], "monitoring_oom") + self.assertEqual(rungs[1]["result"]["data"]["dominant_signal_ratio"], 999.0) + self.assertFalse(rungs[1]["result"]["data"]["rung_completed"]) + # Summary records monitoring_oom as the first failure mode. + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["max_clean_qps"], 20) + self.assertEqual(s["first_failure_mode"], "monitoring_oom") + self.assertEqual(s["first_failure_qps"], 40) + + def test_monitoring_oom_not_emitted_when_no_prior_rung_completed(self): + """If even Rung 0 has zero signals, that's NOT monitoring_oom — + it's an upstream config / deployment problem (Prom never came up, + or scale.py was misconfigured). Stay at verdict=clean + rung_completed=False so postmortem investigates the right layer.""" + # Don't write any files. Every rung will have zero signals. + lines = self._run_collect("20,40") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + # Both rungs should be clean (not monitoring_oom) because no + # earlier rung established that Prom WAS working. 
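
Taken together, the two monitoring_oom tests pin down a small decision rule for all-signals-missing rungs. A minimal sketch of that rule follows; the function and argument names are assumptions for illustration, since the shipped logic lives in scale.py's collect path:

```python
# Sketch only (not the shipped scale.py classifier) of the rule the two
# monitoring_oom tests above pin down. `completed_by_rung` is the ordered
# list of rung_completed booleans for earlier rungs in this run.
def classify_empty_rung(rung_index, completed_by_rung):
    """Verdict for a rung whose signal dict came back completely empty."""
    if any(completed_by_rung[:rung_index]):
        # Prometheus demonstrably worked earlier in this run, so a later
        # zero-signal rung is most plausibly the monitoring stack OOMing
        # under load; that is itself a saturation finding.
        return {"verdict": "monitoring_oom",
                "dominant_signal_ratio": 999.0,
                "rung_completed": False}
    # No rung ever produced signals: point the postmortem at the
    # deployment/config layer instead of claiming saturation.
    return {"verdict": "clean", "rung_completed": False}
```

The assertion loop below exercises the no-prior-success branch on both rungs.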
+ for r in rungs: + self.assertNotEqual(r["result"]["data"]["verdict"], "monitoring_oom", + f"rung {r['result']['data']['rung_index']}: " + f"monitoring_oom should only fire after a " + f"prior rung completed") + self.assertEqual(r["result"]["data"]["verdict"], "clean") + self.assertFalse(r["result"]["data"]["rung_completed"]) + + def test_classifier_matches_build_67211_production_filename_format(self): + """REGRESSION: build 67211 (first n=2 upper-bound smoke 2026-05-14) + emitted measurement files in the format + 'GenericPrometheusQuery __.json' + but the classifier was matching the legacy compact format + 'GenericPrometheusQuery___.json' + → 0 files found, all 4 rungs classified as `clean` with 0 signals + despite all 20 signal files (5 signals × 4 rungs) being present on + disk. This test pins the production format so a future regression + fails locally instead of silently in CI. + """ + # Use fmt="prod" — production format with spaces. Default in + # _write_metric_file is also "prod" but explicit here for clarity. + suffix = "Rung0" + # Latency: 600ms p99 (above 500ms threshold) → should trip latency_spike + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.600}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 50, "Perc99": 30}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.5, "TotalMax": 0.5, "TotalAvg": 0.4}, + fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.05}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.020}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 30}, fmt="prod", + ) + # Verify the file on disk matches the build-67211 pattern exactly. + on_disk = sorted(os.listdir(self.report_dir)) + prod_pattern_files = [ + f for f in on_disk + if f.startswith("GenericPrometheusQuery ClusterMesh ") + and "Rung0_" in f + ] + self.assertGreaterEqual( + len(prod_pattern_files), 6, + f"production-format files not on disk; got: {prod_pattern_files}", + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + # Classifier must FIND the files (production format) and apply the + # verdict. Pre-fix: all signals would be `None`, verdict=`clean`, + # rung_completed=False. Post-fix: latency value lands → latency_spike. + self.assertTrue(d["rung_completed"], + f"rung must be completed; missing={d['measurement_missing']}") + self.assertEqual(d["measurement_missing"], [], + f"all 7 signals should land; missing={d['measurement_missing']}") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 600.0, places=1) + self.assertEqual(d["verdict"], "latency_spike") + + def test_classifier_accepts_legacy_compact_filename_format(self): + """The classifier supports BOTH production (space) and legacy + (compact-underscore) filename formats so test mocks/older CL2 + emissions don't silently fail. Pin both with this test.""" + suffix = "Rung0" + # Write the same set in COMPACT format (no spaces, underscore after + # GenericPrometheusQuery). 
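
For reference, a sketch of filename matching that tolerates both families; `find_metric_files` and its exact prefix handling are assumptions for illustration, since the shipped globbing lives in scale.py:

```python
import os

# Sketch: accept both observed filename families for one metric + rung
# suffix. Production (build 67211+) uses spaces after GenericPrometheusQuery;
# the legacy format joins everything with underscores.
def find_metric_files(report_dir, metric_name, suffix):
    prod = f"GenericPrometheusQuery {metric_name}"
    compact = "GenericPrometheusQuery_" + metric_name.replace(" ", "_")
    return [
        f for f in sorted(os.listdir(report_dir))
        if f.endswith(".json") and f"{suffix}_" in f
        and (f.startswith(prod) or f.startswith(compact))
    ]
```

The compact-format writes for the legacy test follow.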
+ _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.020}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 5, "Perc99": 3}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.01}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.005}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 15}, fmt="compact", + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertTrue(d["rung_completed"]) + self.assertEqual(d["verdict"], "clean") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 20.0, places=1) + + def test_classifier_reads_build_67224_cl2_content_shape(self): + """REGRESSION: build 67224 (2nd n=2 upper-bound smoke 2026-05-15) + emitted measurement file content in the CL2 GenericPrometheusQuery + shape — one dataItem with query results as named keys in `data`: + {"dataItems": [{"data": {"Max": 0, "Perc99": 0.5}, "unit": "#"}]} + not the legacy labels shape + {"dataItems": [{"labels": {"Metric": "Perc99"}, "data": {"value": 0.5}}]} + The classifier was reading via labels.Metric, missing every value. + Pin BOTH content shapes here so the bug can't regress. + """ + # shape="cl2" mirrors the actual on-disk content from build 67224. + suffix = "Rung0" + # Latency 600ms p99 (above 500ms threshold) → should trip latency_spike + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc50": 0.020, "Perc90": 0.300, "Perc99": 0.600}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 50, "Perc50": 10, "Perc99": 30}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"TotalMax": 0.5, "TotalAvg": 0.3, "PerPodMax": 0.5}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.05, "Perc50": 0.01}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc50": 0.003, "Perc90": 0.005, "Perc99": 0.020}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc50": 0, "Perc90": 5, "Perc99": 30, "TotalIncrease": 3000}, + fmt="prod", shape="cl2", + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + # Pre-fix (build 67224): all signals returned None → verdict=clean + # rung_completed=False signals_found=0/7. Post-fix: every signal + # lands, latency trips threshold. 
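
Both content shapes are quoted verbatim in the docstring above; a shape-tolerant reader might look like the following sketch (the helper name is assumed, the shipped reader is in scale.py):

```python
import json

# Sketch: return the value for `key` (e.g. "Perc99") from either content
# shape, or None when absent.
def read_metric_value(path, key):
    with open(path, "r", encoding="utf-8") as f:
        doc = json.load(f)
    for item in doc.get("dataItems", []):
        data = item.get("data", {})
        # CL2 GenericPrometheusQuery shape: named keys directly in `data`.
        if key in data:
            return data[key]
        # Legacy labels shape: one dataItem per metric, value under "value".
        if item.get("labels", {}).get("Metric") == key:
            return data.get("value")
    return None
```

The assertions that follow verify exactly this: every signal lands and the latency value trips the threshold.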
+ self.assertTrue(d["rung_completed"], + f"rung must be completed; missing={d['measurement_missing']}") + self.assertEqual(d["measurement_missing"], [], + f"all 7 signals should land; missing={d['measurement_missing']}") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 600.0, places=1) + self.assertAlmostEqual(d["signals"]["queue_size_perc99"], 30.0, places=1) + self.assertAlmostEqual(d["signals"]["apiserver_max_cpu_cores"], 0.5, places=2) + self.assertAlmostEqual(d["signals"]["mesh_failure_rate_max"], 0.05, places=3) + self.assertEqual(d["verdict"], "latency_spike") + + def test_classifier_reads_legacy_labels_content_shape(self): + """Backward-compat: even though build 67224 uses the cl2 shape, + legacy mocks (and PodStartupLatency-format files) use a + per-metric-labels shape. The classifier must still read those so + existing mock fixtures don't break.""" + suffix = "Rung0" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.020}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 5, "Perc99": 3}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.01}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.005}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 15}, fmt="prod", shape="labels", ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertTrue(d["rung_completed"]) + self.assertEqual(d["verdict"], "clean") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 20.0, places=1) + self.assertAlmostEqual(d["signals"]["queue_size_perc99"], 3.0, places=1) if __name__ == "__main__": diff --git a/modules/terraform/azure/aks-cli/main.tf b/modules/terraform/azure/aks-cli/main.tf index 687ca04e5b..2cf3016845 100644 --- a/modules/terraform/azure/aks-cli/main.tf +++ b/modules/terraform/azure/aks-cli/main.tf @@ -11,6 +11,41 @@ locals { pool.name => pool } + # Pre-built `az aks nodepool add` command per extra pool. Pulled into a + # local so the terraform_data.aks_nodepool_cli heredoc body stays readable + # (avoids a multi-line interpolation inside the bash retry-loop heredoc, + # which `terraform fmt` otherwise mangles). + extra_pool_commands = { + for pool in var.aks_cli_config.extra_node_pool : pool.name => join(" ", [ + "az", + "aks", + "nodepool", + "add", + "-g", var.resource_group_name, + "--cluster-name", var.aks_cli_config.aks_name, + "--nodepool-name", pool.name, + "--node-count", pool.node_count, + "--node-vm-size", pool.vm_size, + "--vm-set-type", pool.vm_set_type, + "--node-osdisk-type", pool.os_disk_type, + local.aks_custom_headers_flags, + # If the default pool uses --pod-subnet-id (Azure CNI dynamic IP + # allocation), AKS requires ALL agent pools to set it (or none). + # Without this, `az aks nodepool add` on extra pools fails with + # `InvalidParameter: All or none of the agentpools should set + # podsubnet`. Reuse the same pod subnet as the default pool — extra + # pools (e.g. 
prompool) host non-workload pods so the per-pool pod + # IP separation isn't meaningful here. + local.pod_subnet_id_parameter, + length(pool.optional_parameters) == 0 ? + "" : + join(" ", [ + for param in pool.optional_parameters : + format("--%s %s", param.name, param.value) + ]), + ]) + } + key_management_service = ( var.aks_cli_config.kms_config != null ) ? { @@ -333,34 +368,111 @@ resource "terraform_data" "aks_cli" { } } +# Gate any subsequent `az aks ...` operations (extra node pools, post-create +# updates) on the cluster reaching a stable provisioningState=Succeeded. +# +# Why this exists: `az aks create --enable-acns` (and similar addon flags +# like --enable-azure-monitor-metrics) kicks off a PutExtensionAddonHandler +# PUT operation that runs ASYNCHRONOUSLY after `az aks create` returns. While +# that operation is in flight, any downstream `az aks nodepool add` (e.g. our +# extra_node_pool / prompool) fails with: +# ERROR: (OperationNotAllowed) Operation is not allowed because there's an +# in progress PutExtensionAddonHandler.PUT operation ... Please wait for it +# to finish before starting a new operation. +# The race is timing-dependent and rarely manifests with 1-2 concurrent +# cluster creates, but is deterministic at N>=5 (regional AKS RP queues the +# extension installs and the slowest cluster's PUT lags `az aks create` return +# by several minutes — observed in the clustermesh-scale n5 tier). +# +# Polling logic: require 3 consecutive Succeeded readings 20s apart, with a +# 60s initial buffer so any queued extension install has time to transition +# the cluster into Updating. The consecutive requirement defends against the +# brief Succeeded window between create-finish and extension-start. Total +# budget ~20m. +resource "terraform_data" "aks_wait_succeeded" { + count = var.aks_cli_config.dry_run ? 0 : 1 + + depends_on = [terraform_data.aks_cli] + + input = { + resource_group_name = var.resource_group_name + aks_name = var.aks_cli_config.aks_name + } + + provisioner "local-exec" { + # local-exec defaults to /bin/sh which on Ubuntu agents is dash; dash + # rejects `set -o pipefail` (bash-only). Explicitly select bash so the + # script's safety options work as written. + interpreter = ["bash", "-c"] + command = <<-EOT + set -eo pipefail + rg="${self.input.resource_group_name}" + name="${self.input.aks_name}" + echo "Waiting for AKS $name to reach a stable Succeeded state..." + sleep 60 + required=3 + got=0 + for i in $(seq 1 60); do + state=$(az aks show -g "$rg" -n "$name" --query provisioningState -o tsv 2>/dev/null || echo "Unknown") + if [ "$state" = "Succeeded" ]; then + got=$((got + 1)) + if [ "$got" -ge "$required" ]; then + echo "AKS $name stable in Succeeded ($got consecutive checks). Continuing." + exit 0 + fi + else + if [ "$got" -gt 0 ]; then + echo "AKS $name re-entered '$state' after Succeeded streak; resetting counter" + fi + got=0 + fi + echo "AKS $name provisioningState=$state (Succeeded streak=$got/$required)" + sleep 20 + done + echo "Timeout: AKS $name did not reach sustained Succeeded after ~20m" + exit 1 + EOT + } +} + resource "terraform_data" "aks_nodepool_cli" { depends_on = [ - terraform_data.aks_cli + terraform_data.aks_cli, + terraform_data.aks_wait_succeeded, ] for_each = local.extra_pool_map + # Wrap the underlying `az aks nodepool add` (built in locals.extra_pool_commands) + # in a bash retry loop that handles the OperationNotAllowed / AnotherOperationInProgress + # AKS RP race window. 
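
A Python port of the aks_wait_succeeded polling loop above, for readers who want the consecutive-Succeeded logic at a glance (illustrative only; the resource runs the bash heredoc, not this):

```python
import subprocess
import time

# Mirrors the bash poll: 60s initial buffer, then up to 60 reads 20s apart,
# requiring 3 consecutive Succeeded readings before declaring stability.
def wait_for_stable_succeeded(rg, name, required=3, polls=60, interval=20):
    time.sleep(60)  # let a queued extension PUT flip the cluster into
                    # Updating before we start counting.
    streak = 0
    for _ in range(polls):
        try:
            state = subprocess.run(
                ["az", "aks", "show", "-g", rg, "-n", name,
                 "--query", "provisioningState", "-o", "tsv"],
                capture_output=True, text=True, check=True,
            ).stdout.strip()
        except subprocess.CalledProcessError:
            state = "Unknown"
        streak = streak + 1 if state == "Succeeded" else 0
        if streak >= required:
            return True   # stable: 3 consecutive Succeeded, 20s apart
        time.sleep(interval)
    return False  # ~20m budget exhausted
```

The retry-loop rationale continues below.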
Even with terraform_data.aks_wait_succeeded gating + # this on a stable cluster Succeeded state, the AKS RP can lazily start + # post-create extension PUTs (e.g. --enable-acns) AFTER the wait exits — + # observed at N>=5 cluster create concurrency where the regional RP queues + # addon installs minutes behind the parent cluster create. The retry catches + # that race; keeping the wait avoids noisy first-attempt failures in the + # common (non-lazy) case. 30 retries * 30s = 15min budget. provisioner "local-exec" { - command = join(" ", [ - "az", - "aks", - "nodepool", - "add", - "-g", var.resource_group_name, - "--cluster-name", var.aks_cli_config.aks_name, - "--nodepool-name", each.value.name, - "--node-count", each.value.node_count, - "--node-vm-size", each.value.vm_size, - "--vm-set-type", each.value.vm_set_type, - "--node-osdisk-type", each.value.os_disk_type, - local.aks_custom_headers_flags, - length(each.value.optional_parameters) == 0 ? - "" : - join(" ", [ - for param in each.value.optional_parameters : - format("--%s %s", param.name, param.value) - ]), - ]) + interpreter = ["bash", "-c"] + command = <<-EOT + set -eo pipefail + cmd=${jsonencode(local.extra_pool_commands[each.key])} + pool="${each.value.name}" + cluster="${var.aks_cli_config.aks_name}" + for i in $(seq 1 30); do + out=$(eval "$cmd" 2>&1) && { echo "$out"; exit 0; } + if echo "$out" | grep -qE "OperationNotAllowed|AnotherOperationInProgress"; then + echo "[retry $i/30] $cluster nodepool $pool create blocked by in-progress AKS RP operation; sleeping 30s" + sleep 30 + continue + fi + # Some other failure (quota, invalid args, etc.) — fail fast. + echo "$out" >&2 + exit 1 + done + echo "Timeout: $cluster nodepool $pool create still blocked after 30 retries (~15m)" >&2 + exit 1 + EOT } } diff --git a/modules/terraform/azure/aks-cli/variables.tf b/modules/terraform/azure/aks-cli/variables.tf index 3fb1c427f1..2a0384c03b 100644 --- a/modules/terraform/azure/aks-cli/variables.tf +++ b/modules/terraform/azure/aks-cli/variables.tf @@ -73,10 +73,20 @@ variable "bootstrap_container_registry_resource_id" { variable "aks_cli_config" { type = object({ - role = string - aks_name = string - sku_tier = string - subnet_name = optional(string, null) + role = string + aks_name = string + sku_tier = string + subnet_name = optional(string, null) + # Pod subnet for Azure CNI dynamic IP allocation (--pod-subnet-id). + # When set, AKS pulls pod IPs from this subnet instead of co-tenanting + # them on the node subnet (legacy CNI). Required at scale since legacy + # mode pre-allocates `1 + max-pods` IPs per node on the node subnet — + # at 20 nodes × max-pods=110 that's 2,220 IPs, vastly exceeding a typical + # /24 node subnet. The aks-cli main.tf reads this via local.pod_subnet_id + # and emits --pod-subnet-id when non-null. Originally referenced in + # main.tf without being declared here — silently fell back to legacy + # CNI for ALL callers regardless of tfvars. Added 2026-05-09. 
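
The IP math that comment cites is easy to sanity-check; a back-of-envelope sketch (the 5-address reservation per subnet is standard Azure behavior, the rest follows from the comment):

```python
import ipaddress

# Illustrative arithmetic, not part of the module: legacy Azure CNI reserves
# 1 node IP + max_pods pod IPs per node, all carved from the node subnet.
def legacy_cni_ip_demand(nodes, max_pods):
    return nodes * (1 + max_pods)

node_subnet = ipaddress.ip_network("10.1.0.0/24")  # per-cluster node subnet
usable = node_subnet.num_addresses - 5             # Azure reserves 5 per subnet
demand = legacy_cni_ip_demand(nodes=20, max_pods=110)
assert demand == 2_220 and demand > usable         # hence --pod-subnet-id
```

The optional declaration itself follows.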
+ pod_subnet_name = optional(string, null) managed_identity_name = optional(string, null) kubernetes_version = optional(string, null) aks_custom_headers = optional(list(string), []) diff --git a/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml index caaedc0ea0..e7dabb189f 100644 --- a/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml +++ b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml @@ -59,6 +59,163 @@ stages: restart_count: 1 api_server_calls_per_second: 20 trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). + # Each matrix entry runs the full provision → execute → destroy + # lifecycle independently (matrix entries do NOT share Fleet/RG); + # enable selectively in the AzDO UI to control per-run cost. + n2_pod_churn_scale: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + # 5 cycles × (60s up + 60s down) ≈ 10 min sustained churn — + # spec line 67 "CPU/memory growth over time" measurement window. + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n2_pod_churn_kill: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + # In-cluster killer Job loops for kill_duration_seconds, deleting + # kill_batch random workload pods every kill_interval_seconds. + # kill_job_deadline_seconds is the Job activeDeadlineSeconds — + # defense-in-depth bound; must exceed kill_duration_seconds. + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn). The stimulus + # (az aks nodepool scale / VMSS instance delete) runs OUTSIDE + # CL2 from steps/engine/clusterloader2/clustermesh-scale/execute.yml + # in a background subshell; CL2 deploys a baseline workload on + # every cluster and observes via measurements (node-churn.yaml). + # See modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh + # for the script header. mesh_size-wide concurrency override + # forced in execute.yml (needs_mesh_wide_concurrency). + n2_node_churn_scale: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + # Node-churn knobs — see scale.py configure for semantics. Defaults + # in execute.yml fill in when matrix entry omits them, but we set + # them explicitly for traceability. 
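
Before the node-churn knobs below, one cross-knob invariant from the pod-churn entries above is worth a pre-flight check: the Job deadline must exceed the kill window. A hypothetical validator, with the matrix entry modeled as a plain dict:

```python
# Hypothetical pre-flight check for the churn/kill knob invariants called
# out in the matrix comments above.
def validate_kill_knobs(entry):
    errors = []
    if entry["kill_job_deadline_seconds"] <= entry["kill_duration_seconds"]:
        errors.append("kill_job_deadline_seconds must exceed "
                      "kill_duration_seconds (activeDeadlineSeconds is the "
                      "defense-in-depth bound)")
    if entry["kill_interval_seconds"] > entry["kill_duration_seconds"]:
        errors.append("kill_interval_seconds longer than the kill window "
                      "means at most one kill batch fires")
    return errors

assert validate_kill_knobs({"kill_duration_seconds": 600,
                            "kill_interval_seconds": 10,
                            "kill_batch": 5,
                            "kill_job_deadline_seconds": 660}) == []
```

The node-churn knobs follow.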
+            node_churn_target_context: clustermesh-1
+            node_churn_cycles: 3
+            node_churn_delta: 5
+            node_churn_settle_seconds: 60
+            node_churn_scale_duration_seconds: 1800
+            node_churn_ready_timeout_seconds: 300
+            trigger_reason: ${{ variables['Build.Reason'] }}
+          n2_node_churn_replace:
+            cluster_count: 2
+            mesh_size: 2
+            cl2_config_file: node-churn-replace.yaml
+            test_type: node-churn-replace
+            namespaces: 5
+            deployments_per_namespace: 4
+            replicas_per_deployment: 10
+            hold_duration: 2m
+            warmup_duration: 30s
+            restart_count: 0
+            api_server_calls_per_second: 20
+            node_churn_target_context: clustermesh-1
+            node_churn_settle_seconds: 60
+            node_churn_replace_duration_seconds: 1500
+            # node_replace_batch_size: 10 default; bounded above by original
+            # pool size (20) so 10 = 50% replacement is the sweet spot for
+            # mesh propagation pressure without saturating Cilium endpoint
+            # reconcile under our DSv3 budget.
+            node_replace_batch_size: 10
+            node_churn_ready_timeout_seconds: 300
+            trigger_reason: ${{ variables['Build.Reason'] }}
+          n2_node_churn_combined:
+            cluster_count: 2
+            mesh_size: 2
+            cl2_config_file: node-churn-combined.yaml
+            test_type: node-churn-combined
+            namespaces: 5
+            deployments_per_namespace: 4
+            replicas_per_deployment: 10
+            hold_duration: 2m
+            warmup_duration: 30s
+            restart_count: 0
+            api_server_calls_per_second: 20
+            node_churn_target_context: clustermesh-1
+            node_churn_cycles: 3
+            node_churn_delta: 5
+            node_churn_settle_seconds: 60
+            node_churn_combined_duration_seconds: 3300
+            node_replace_batch_size: 10
+            node_churn_ready_timeout_seconds: 300
+            trigger_reason: ${{ variables['Build.Reason'] }}
+          # Phase 4b — Scenario #6 (Upper Bound / Saturation Testing).
+          # In-run rung loop sweeps QPS across the configured list; each
+          # rung restart-bursts the workload at that QPS for
+          # saturation_rung_duration_seconds. scale.py collect's
+          # classifier tags each rung with the dominant signal
+          # (clean | latency_spike | queue_unbounded | cpu_exhaust |
+          # mesh_failure_burst | etcd_tail) — see SATURATION_THRESHOLDS
+          # in scale.py + plan.md Scenario #6 section.
+          #
+          # Mesh-wide concurrency forced in execute.yml
+          # (needs_mesh_wide_concurrency) so every cluster's CL2 runs
+          # simultaneously — per-cluster saturation point is meaningless
+          # if peers aren't also loaded.
+          #
+          # NOT share-infra-eligible in v1: a tripped rung can leave
+          # queue/memory residue that would contaminate following
+          # scenarios. Standalone matrix entry only until baseline data
+          # justifies share-infra positioning.
+          n2_upper_bound:
+            cluster_count: 2
+            mesh_size: 2
+            cl2_config_file: upper-bound.yaml
+            test_type: upper-bound
+            namespaces: 5
+            deployments_per_namespace: 4
+            replicas_per_deployment: 10
+            hold_duration: 2m
+            warmup_duration: 30s
+            restart_count: 0
+            # Baseline QPS used by the workload-create phase (gentle,
+            # fixed). Per-rung QPS comes from saturation_qps_list.
+            api_server_calls_per_second: 20
+            # 5-rung sweep (one rung per saturation_qps_list entry). The
+            # same sweep runs at every tier so per-tier saturation points
+            # are directly comparable; recalibrate classifier thresholds
+            # once the first n=2 + n=20 runs are green.
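
How these two list knobs are consumed is pinned by the collect tests earlier in this diff: a malformed QPS list logs a warning and emits zero saturation rows, and a short restarts list is padded with 1. A sketch with assumed helper names:

```python
import logging

logger = logging.getLogger(__name__)

# Sketch of the knob parsing the saturation tests pin down; the real logic
# lives in scale.py's collect path.
def parse_saturation_lists(qps_csv, restarts_csv):
    try:
        qps = [int(x) for x in qps_csv.split(",")]
    except ValueError:
        # Malformed list: warn and emit zero SaturationRung/Summary rows
        # rather than crashing collect.
        logger.warning("malformed saturation_qps_list %r; skipping classifier",
                       qps_csv)
        return None
    try:
        restarts = [int(x) for x in restarts_csv.split(",")] if restarts_csv else []
    except ValueError:
        restarts = []
    # Pad with 1 so a short restarts list can't crash the rung loop
    # (truncate if it is longer than the QPS list).
    restarts += [1] * (len(qps) - len(restarts))
    return list(zip(qps, restarts[: len(qps)]))
```

The configured values follow.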
+ saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} max_parallel: 1 timeout_in_minutes: 120 credential_type: service_connection @@ -66,4 +223,508 @@ stages: # Iteration-only: skip uploading results to the telescope blob while # we're still stabilizing the clustermesh-scale pipeline. Flip to # false (or remove) once results are meaningful. - skip_publish: true + skip_publish: false + + # Phase 3 — 5-cluster tier. Separate stage (not a matrix entry) because + # `terraform_input_file_mapping` is set at the job level, so different + # cluster counts require different stages bound to different tfvars files. + - stage: azure_eastus2euap_n5 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars" + matrix: + n5_event_throughput: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). + n5_pod_churn_scale: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n5_pod_churn_kill: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn). See n2 entry + # for the full design rationale; only mesh_size differs at this tier. 
+ n5_node_churn_scale: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_scale_duration_seconds: 1800 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n5_node_churn_replace: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: node-churn-replace.yaml + test_type: node-churn-replace + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_settle_seconds: 60 + node_churn_replace_duration_seconds: 1500 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n5_node_churn_combined: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: node-churn-combined.yaml + test_type: node-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_combined_duration_seconds: 3300 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation). See n2 + # entry for the full design rationale; only mesh_size differs + # at this tier. Same QPS sweep at every tier so the per-tier + # saturation point is directly comparable across clusters axis. + n5_upper_bound: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: upper-bound.yaml + test_type: upper-bound + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + # 5-cluster provision adds ~10-15 min vs n2 (more terraform + fleet + # member creates + RBAC propagation); CL2 fan-out itself stays + # bounded at concurrency 4 so per-cluster wall-clock is unchanged. + timeout_in_minutes: 180 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false + + # Phase 3 — 10-cluster tier. Per-cluster sizing identical to n2/n5; + # only mesh size scales. Quota footprint per run: ~120 vCPU + # (10x default-pool D4s_v5 + 10x prompool D8s_v3). 90 VNet peerings. + - stage: azure_eastus2euap_n10 + dependsOn: [] + # See dev pipeline (pipelines/system/new-pipeline-test.yml) for the + # full rationale on TF_CLI_ARGS_apply=-parallelism=4: at default + # parallelism=10 the regional AKS RP throttles severely on 10 + # simultaneous `az aks create` calls. 
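
A rough wall-clock model behind that parallelism choice, using the ~10 min per create wave observed in the dev-pipeline n10 notes (illustrative arithmetic only):

```python
import math

# Rough apply-time model: clusters are created in waves of `parallelism`.
def apply_waves(clusters, parallelism, minutes_per_wave=10):
    waves = math.ceil(clusters / parallelism)
    return waves, waves * minutes_per_wave

print(apply_waves(10, 4))  # (3, 30): ~30 min apply instead of 4h+ throttled
print(apply_waves(20, 8))  # (3, 30): the n20 split-the-difference bet
```

The variables block it motivates is next.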
+ variables: + TF_CLI_ARGS_apply: "-parallelism=4" + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars" + matrix: + n10_event_throughput: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). + n10_pod_churn_scale: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n10_pod_churn_kill: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn). 
+ n10_node_churn_scale: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_scale_duration_seconds: 1800 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n10_node_churn_replace: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: node-churn-replace.yaml + test_type: node-churn-replace + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_settle_seconds: 60 + node_churn_replace_duration_seconds: 1500 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n10_node_churn_combined: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: node-churn-combined.yaml + test_type: node-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_combined_duration_seconds: 3300 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation) at n=10. + n10_upper_bound: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: upper-bound.yaml + test_type: upper-bound + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + # 10-cluster provision adds ~10-15 min vs n5 (more terraform + + # fleet member creates + ARM throughput); CL2 fan-out itself + # stays bounded at concurrency 4 (10/4 batches sequentially). + timeout_in_minutes: 240 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false + + # Phase 3 — 20-cluster tier (final scale-test point per spec line 25). + # Per-cluster sizing identical to lower tiers; only mesh size scales. + # Quota footprint: ~320 vCPU (20x D4s_v5 + 20x D8s_v3). 380 VNet peerings. + # See dev pipeline n20 stage for full rationale on TF_CLI_ARGS_apply. 
+ - stage: azure_eastus2euap_n20 + dependsOn: [] + variables: + TF_CLI_ARGS_apply: "-parallelism=8" + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars" + matrix: + n20_event_throughput: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). Each entry is a + # separate full lifecycle (~6h at n20). Enable selectively. + n20_pod_churn_scale: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n20_pod_churn_kill: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Combined scale-cycle + kill in one CL2 invocation per cluster. + # Maximizes signal per (expensive) n20 provision/destroy lifecycle. + # Kill phase uses Method: Exec → kubectl from inside the CL2 + # container (no in-cluster Job, no AcrPull dependency). If kubectl + # is unavailable in the CL2 image, the kill measurement is marked + # failed but scale-phase data still lands cleanly. + n20_pod_churn_combined: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: pod-churn-combined.yaml + test_type: pod-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn) at n=20. + # Each entry is a separate provision/destroy lifecycle (~6.5h + # at n=20 including the ~30-55min node-churn window itself). + # Enable selectively in AzDO UI. 
+ n20_node_churn_scale: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_scale_duration_seconds: 1800 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n20_node_churn_replace: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: node-churn-replace.yaml + test_type: node-churn-replace + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_settle_seconds: 60 + node_churn_replace_duration_seconds: 1500 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n20_node_churn_combined: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: node-churn-combined.yaml + test_type: node-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_combined_duration_seconds: 3300 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation) at n=20. + # Highest mesh-pressure tier. Default thresholds calibrated on + # lower tiers; expect more rungs to trip at n=20 (more peers + # to propagate to per event). First n=20 run is the + # ground-truth calibration data point. + n20_upper_bound: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: upper-bound.yaml + test_type: upper-bound + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + timeout_in_minutes: 480 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 38ea068658..8f88419935 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -16,6 +16,10 @@ variables: OWNER: aks stages: + # 2026-05-13: Phase 4b smoke at n=2 to validate Option B++ fix + # (execute always exit 0 + SucceededWithIssues marker) + soft-fail + # killer + 240s recovery timeout. Re-disable n=2 + enable n=20 once + # this lands clean. - stage: azure_eastus2euap dependsOn: [] jobs: @@ -42,25 +46,463 @@ stages: # entry. We don't run it in dev — n2_event_throughput already exercises # the full plumbing and per-run cost (full Fleet/AKS lifecycle ~15-20 min) # makes a second axis expensive during iteration. - n2_event_throughput: + # SMOKE-ONLY 2026-05-11: Phase 4a n=2 smoke runs ONLY the combined + # entry. 
The other 3 entries (event_throughput, pod_churn_scale, + # pod_churn_kill) are commented out so a triggered run doesn't + # spend 4× the lifecycle cost. Uncomment after n=2 smoke is green + # to restore full coverage (each entry is one provision/destroy). + # n2_event_throughput: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: event-throughput.yaml + # test_type: event-throughput + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 1 + # api_server_calls_per_second: 20 + # trigger_reason: ${{ variables['Build.Reason'] }} + # n2_pod_churn_scale: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: pod-churn-scale.yaml + # test_type: pod-churn-scale + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 0 + # api_server_calls_per_second: 20 + # churn_cycles: 5 + # churn_up_duration: 60s + # churn_down_duration: 60s + # trigger_reason: ${{ variables['Build.Reason'] }} + # n2_pod_churn_kill: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: pod-churn-kill.yaml + # test_type: pod-churn-kill + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 0 + # api_server_calls_per_second: 20 + # kill_duration: 10m + # kill_duration_seconds: 600 + # kill_interval_seconds: 10 + # kill_batch: 5 + # kill_job_deadline_seconds: 660 + # trigger_reason: ${{ variables['Build.Reason'] }} + # Combined scale-cycle + kill in one CL2 invocation per cluster. + # Kill phase uses Method: Exec → kubectl from inside the CL2 + # container (no in-cluster Job, no AcrPull dependency). + # SMOKE-ONLY 2026-05-12: commented out for n=2 share-infra smoke; + # uncomment for solo-scenario iteration. + # n2_pod_churn_combined: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: pod-churn-combined.yaml + # test_type: pod-churn-combined + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 0 + # api_server_calls_per_second: 20 + # churn_cycles: 5 + # churn_up_duration: 60s + # churn_down_duration: 60s + # kill_duration: 10m + # kill_duration_seconds: 600 + # kill_interval_seconds: 10 + # kill_batch: 5 + # kill_job_deadline_seconds: 660 + # trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b share-infra: ONE matrix entry runs BOTH scenarios + # sequentially against the same provisioned clusters. The + # share_infra_scenarios env var (auto-exported as + # SHARE_INFRA_SCENARIOS by AzDO) triggers the multi-scenario + # path in execute.yml + collect.yml. Per-row test_type + # attribution preserved in the JSONL. Single provision/destroy + # = ~92% time reduction vs running two matrix entries. + # + # ITER-ONLY 2026-05-14: commented out for scenario #6 smoke. + # n2_shared was previously narrowed to "node-churn-combined" + # for #3 iteration; #3 is now green at K=10 (build 67185) so + # there's no need to re-run it alongside the #6 first smoke. 
+ # Restore + widen this entry to the 5-scenario share-infra + # list AFTER #6 lands (planned post-#6 work per SETTLED DESIGN): + # share_infra_scenarios: "event-throughput,pod-churn-combined,apiserver-failure,ha-config,isolation" + # n2_shared: + # cluster_count: 2 + # mesh_size: 2 + # # Phase 4b — 5-scenario share-infra validation: + # # event-throughput (#1), pod-churn-combined (#2), + # # apiserver-failure (#4), ha-config (#7), isolation (#5), + # # node-churn-combined (#3). + # # ha-config is BEFORE isolation so its scale-down restores + # # the apiserver Deployment to 1 replica before isolation's + # # heavy pod-churn loop runs on the target cluster. + # # node-churn-combined is LAST per rubber-duck design review + # # #11 — node ops can leave the target cluster in a half- + # # scaled state if the finalizer can't restore. Putting + # # node-churn last means contamination affects no further + # # scenarios in the share-infra lifecycle. + # share_infra_scenarios: "node-churn-combined" + # cl2_config_file: "" # unused when share_infra_scenarios is set + # test_type: shared # row-level test_type comes from each scenario at collect time + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 1 + # api_server_calls_per_second: 20 + # churn_cycles: 5 + # churn_up_duration: 60s + # churn_down_duration: 60s + # kill_duration: 10m + # kill_duration_seconds: 600 + # kill_interval_seconds: 10 + # kill_batch: 5 + # kill_job_deadline_seconds: 660 + # # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + # apiserver_kill_target_context: clustermesh-1 + # apiserver_kill_recovery_timeout_seconds: 240 + # apiserver_kill_observation_seconds: 60 + # # Phase 4b — Scenario #7 (HA Configuration Validation) knob. + # ha_config_replicas: 3 + # # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # node_churn_target_context: clustermesh-1 + # node_churn_cycles: 2 + # node_churn_delta: 3 + # node_churn_settle_seconds: 60 + # node_churn_scale_duration_seconds: 1500 + # node_churn_replace_duration_seconds: 1500 + # node_churn_combined_duration_seconds: 2700 + # node_replace_batch_size: 10 + # node_churn_ready_timeout_seconds: 300 + # trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation) standalone + # smoke entry. Per SETTLED DESIGN in plan.md (line ~126), we do + # NOT widen n2_shared to include #6 — the share-infra-list + # rollup happens AFTER #6 lands. CL2 image, tfvars, and timeout + # budget are identical to the prod pipeline so signals are + # directly comparable. + n2_upper_bound: cluster_count: 2 mesh_size: 2 - cl2_config_file: event-throughput.yaml - test_type: event-throughput + cl2_config_file: upper-bound.yaml + test_type: upper-bound namespaces: 5 deployments_per_namespace: 4 replicas_per_deployment: 10 hold_duration: 2m warmup_duration: 30s - restart_count: 1 + restart_count: 0 api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 trigger_reason: ${{ variables['Build.Reason'] }} max_parallel: 1 - timeout_in_minutes: 120 + # n=2 share-infra (5 scenarios): provision (~15min) + validate (~5min) + # + 5 × CL2 (~25min each, with 60s settle between) + destroy (~15min) + # ≈ ~170min. Buffer to 360 for LB-tail / apply retries. 
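
That estimate decomposes as below (quick arithmetic check; the 60 s settles between the five CL2 phases contribute about 4 min):

```python
# Budget check for the ~170 min share-infra estimate above (all minutes).
provision, validate, destroy = 15, 5, 15
cl2_phases, per_phase, settle = 5, 25, 1
total = (provision + validate + destroy
         + cl2_phases * per_phase + (cl2_phases - 1) * settle)
assert total == 164  # ≈ ~170 min; timeout_in_minutes: 360 leaves ~2x headroom
```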
+            # The n2_upper_bound entry runs the same provision/destroy
+            # lifecycle but its CL2 phase is ~16min (4 rungs × 240s); the
+            # same 360min budget covers both with headroom.
+            timeout_in_minutes: 360
             credential_type: service_connection
             ssh_key_enabled: false
-            # Iteration-only: skip uploading results to the telescope blob while
-            # we're still stabilizing the clustermesh-scale pipeline. Mirrors the
-            # same flag in pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml.
-            # Flip to false (or remove) once results are meaningful.
-            skip_publish: true
+            # Publish results to the telescope blob. This was iteration-only
+            # (skip_publish: true) while the clustermesh-scale pipeline was
+            # being stabilized; mirrors the same flag in
+            # pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml.
+            skip_publish: false
+
+  # Phase 3 — 5-cluster tier. Separate stage (not a matrix entry) because
+  # `terraform_input_file_mapping` is set at the job level, so different
+  # cluster counts require different stages bound to different tfvars files.
+  # Runs in parallel with the n2 stage when pool capacity allows; comment
+  # out either stage during iteration if the dual cost matters.
+  - stage: azure_eastus2euap_n5
+    dependsOn: []
+    # ITER-DISABLED 2026-05-08 (inline comments on `condition:` are unsafe —
+    # AzDO doesn't always strip them, leaving the truthy string
+    # "false # ..." as the expression. Keep the marker on its own line.)
+    condition: false
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2euap
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          install: false
+          operation_timeout: 15m
+          topology: clustermesh-scale
+          terraform_input_file_mapping:
+            - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars"
+          matrix:
+            n5_event_throughput:
+              cluster_count: 5
+              mesh_size: 5
+              cl2_config_file: event-throughput.yaml
+              test_type: event-throughput
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 1
+              api_server_calls_per_second: 20
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            # Phase 4a — Scenario #2 (Pod Churn Stress).
+            n5_pod_churn_scale:
+              cluster_count: 5
+              mesh_size: 5
+              cl2_config_file: pod-churn-scale.yaml
+              test_type: pod-churn-scale
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              churn_cycles: 5
+              churn_up_duration: 60s
+              churn_down_duration: 60s
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            n5_pod_churn_kill:
+              cluster_count: 5
+              mesh_size: 5
+              cl2_config_file: pod-churn-kill.yaml
+              test_type: pod-churn-kill
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              kill_duration: 10m
+              kill_duration_seconds: 600
+              kill_interval_seconds: 10
+              kill_batch: 5
+              kill_job_deadline_seconds: 660
+              trigger_reason: ${{ variables['Build.Reason'] }}
+          max_parallel: 1
+          timeout_in_minutes: 180
+          credential_type: service_connection
+          ssh_key_enabled: false
+          skip_publish: false
+
+  # Phase 3 — 10-cluster tier. Per-cluster node counts match n2/n5
+  # (20-node default pool + 1-node prompool); only mesh size scales.
+  # Quota footprint per run: ~880 vCPU (10 clusters x 20 D4s_v3
+  # default-pool nodes + 10 D8s_v3 prompool nodes; see the
+  # azure-10.tfvars header for the math). 90 VNet peerings.
+  - stage: azure_eastus2euap_n10
+    dependsOn: []
+    # ITER-DISABLED 2026-05-08
+    condition: false
+    # Lower terraform apply parallelism from the default 10 to 4. At
+    # default, all 10 `az aks create` calls fire simultaneously and the
+    # regional AKS RP throttles severely — the first N=10 run had every
+    # cluster stuck in `aks_cli: Still creating` for 190+ min (vs. the
+    # normal 5-10 min). Parallelism=4 lets the RP process creates in
+    # batches: roughly a 4-create wave (~10 min), a second 4-create wave,
+    # then a 2-create wave → ~30 min total apply instead of 4hr+. CL2
+    # fan-out parallelism (max_concurrent=4) is a SEPARATE knob and stays
+    # unchanged. Destroy is unaffected (we set TF_CLI_ARGS_apply, which
+    # scopes to `terraform apply` only, rather than the global TF_CLI_ARGS).
+    variables:
+      TF_CLI_ARGS_apply: "-parallelism=4"
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2euap
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          install: false
+          operation_timeout: 15m
+          topology: clustermesh-scale
+          terraform_input_file_mapping:
+            - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars"
+          matrix:
+            n10_event_throughput:
+              cluster_count: 10
+              mesh_size: 10
+              cl2_config_file: event-throughput.yaml
+              test_type: event-throughput
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 1
+              api_server_calls_per_second: 20
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            # Phase 4a — Scenario #2 (Pod Churn Stress).
+            n10_pod_churn_scale:
+              cluster_count: 10
+              mesh_size: 10
+              cl2_config_file: pod-churn-scale.yaml
+              test_type: pod-churn-scale
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              churn_cycles: 5
+              churn_up_duration: 60s
+              churn_down_duration: 60s
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            n10_pod_churn_kill:
+              cluster_count: 10
+              mesh_size: 10
+              cl2_config_file: pod-churn-kill.yaml
+              test_type: pod-churn-kill
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              kill_duration: 10m
+              kill_duration_seconds: 600
+              kill_interval_seconds: 10
+              kill_batch: 5
+              kill_job_deadline_seconds: 660
+              trigger_reason: ${{ variables['Build.Reason'] }}
+          max_parallel: 1
+          # 10-cluster provision adds ~10-15 min vs n5 (more terraform +
+          # fleet member creates + ARM throughput); CL2 fan-out itself
+          # stays bounded at concurrency 4 (ceil(10/4) = 3 batches run
+          # sequentially).
+          timeout_in_minutes: 240
+          credential_type: service_connection
+          ssh_key_enabled: false
+          skip_publish: false
+
+  # Phase 3 — 20-cluster tier (final scale-test point per spec line 25).
+  # Per-cluster node counts match the lower tiers; only mesh size scales.
+  # Quota footprint per run: ~1760 vCPU (20 clusters x 20 D4s_v3
+  # default-pool nodes + 20 D8s_v3 prompool nodes; see the azure-20.tfvars
+  # header for the math; eastus2euap had 78k vCPU of headroom when checked
+  # 2026-05-08). 380 VNet peering links (N*(N-1) at separate-VNet mode).
+  # 20 Fleet members.
+  #
+  # TF_CLI_ARGS_apply tuning history at this tier:
+  #   - default parallelism=10 (aks-cli implicit): cluster-create RP throttle,
+  #     all 20 stuck "Still creating" for hours.
+  #   - parallelism=4 (first n20 attempt 2026-05-09): apply 219 min (3.65 hr).
+  #     The real bottleneck shifts from the AKS RP to terraform graph
+  #     traversal of 520+ resources (380 peerings + 20 fleet members +
+  #     per-cluster waits).
+  #   - parallelism=8 (this run): split the difference. Cluster-creates still
+  #     batch (20/8 = ~3 batches), but graph traversal of peerings/members is
+  #     2x faster than at parallelism=4. Risk: the AKS RP could throttle
+  #     harder than at parallelism=4. Fallback if this fails: drop back to
+  #     parallelism=4.
+  - stage: azure_eastus2euap_n20
+    dependsOn: []
+    # ITER-DISABLED 2026-05-13: Phase 4b smoke at n=2 first to validate
+    # the Option B++ exit-0+SucceededWithIssues fix. Re-enable when
+    # ready to promote.
+    condition: false
+    variables:
+      TF_CLI_ARGS_apply: "-parallelism=8"
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2euap
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          install: false
+          operation_timeout: 15m
+          topology: clustermesh-scale
+          terraform_input_file_mapping:
+            - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars"
+          matrix:
+            # Phase 4b — n=20 share-infra overnight run.
+            # Runs 6 scenarios in ONE provision/destroy lifecycle:
+            #   1. event-throughput (scenario #1 baseline with CFP-39876 fix)
+            #   2. pod-churn-combined (scenario #2 scale + kill phases)
+            #   3. apiserver-failure (scenario #4 — Phase 4b's new scenario)
+            #   4. ha-config
+            #   5. isolation (scenario #5, Multi-Cluster Failure Isolation)
+            #   6. node-churn-combined (scenario #3; kept last, see the
+            #      node-churn knob comment below)
+            # Compresses what would be 6 separate ~6h lifecycles (~36h)
+            # into one ~8-9h shared run.
+            #
+            # cl2_max_concurrent=8: bumped from the default 4 so more peer
+            # clusters' Prometheus instances are running during scenario
+            # #4's kill window. At the default 4, only 3 of 19 peers would
+            # be in flight when mesh-1 is killed; at 8, ~7 peers. Marginal
+            # agent memory increase, much better peer coverage.
+            #
+            # SMOKE-ONLY: solo-scenario matrix entries below commented out
+            # so this overnight run produces exactly one results blob from
+            # the shared lifecycle. Uncomment for solo iteration.
+            # n20_event_throughput: ...
+            # n20_pod_churn_combined: ...
+            n20_shared:
+              cluster_count: 20
+              mesh_size: 20
+              share_infra_scenarios: "event-throughput,pod-churn-combined,apiserver-failure,ha-config,isolation,node-churn-combined"
+              cl2_config_file: ""  # unused in share-infra mode
+              test_type: shared  # row-level test_type comes from each scenario
+              cl2_max_concurrent: 8
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 1
+              api_server_calls_per_second: 20
+              churn_cycles: 5
+              churn_up_duration: 60s
+              churn_down_duration: 60s
+              kill_duration: 10m
+              kill_duration_seconds: 600
+              kill_interval_seconds: 10
+              kill_batch: 5
+              kill_job_deadline_seconds: 660
+              apiserver_kill_target_context: clustermesh-1
+              apiserver_kill_recovery_timeout_seconds: 240
+              apiserver_kill_observation_seconds: 60
+              ha_config_replicas: 3
+              # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs at n=20.
+              # Positioned LAST in share_infra_scenarios per rubber-duck
+              # design review #11 (node ops can leave the target half-scaled
+              # if the finalizer can't restore it; putting it last contains
+              # the blast radius).
+              node_churn_target_context: clustermesh-1
+              node_churn_cycles: 3
+              node_churn_delta: 5
+              node_churn_settle_seconds: 60
+              node_churn_scale_duration_seconds: 1800
+              node_churn_replace_duration_seconds: 1500
+              node_churn_combined_duration_seconds: 3300
+              node_replace_batch_size: 10
+              node_churn_ready_timeout_seconds: 300
+              trigger_reason: ${{ variables['Build.Reason'] }}
+          max_parallel: 1
+          # n=20 share-infra (6 scenarios): provision (~4h) + validate (~30min)
+          # + 6 × CL2 (~25min each, with 60s settle between) + destroy (~1.5h)
+          # ≈ ~8.5h baseline.
Phase 4a's last n=20 hit 480 min during destroy + # so we go to 720 (12h) for safe overnight headroom. + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars new file mode 100644 index 0000000000..90e6c7e542 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars @@ -0,0 +1,579 @@ +scenario_type = "perf-eval" +scenario_name = "clustermesh-scale" +deletion_delay = "4h" +owner = "aks" + +# ============================================================================= +# ClusterMesh Scale Test — 10 cluster tier +# +# Same shape as azure-2.tfvars (see that file for full sizing rationale on +# pod CIDR, max-pods, prompool, etc.). This file scales the cluster count +# only; per-cluster sizing is identical to the n2 tier so cluster-count is +# the only variable when comparing tier results. +# +# Generated topology: +# - 10 VNets (one per cluster) at 10..0.0/16, id=1..10 +# - 10 AKS clusters (Cilium+ACNS, Azure CNI w/ pod subnet) +# - 90 VNet peering links (N*(N-1) at separate-VNet mode) +# - 10 Fleet members (label mesh=true) + 1 clustermeshprofile +# +# Subscription footprint per run (20-node baseline per spec line 24): +# - default pool: 10 clusters x 20 nodes x D4s_v3 (4 vCPU) = 800 vCPU (DSv3 family) +# - prompool: 10 clusters x 1 node x D8s_v3 (8 vCPU) = 80 vCPU (DSv3 family) +# - total DSv3 compute: 880 vCPU +# Verify region quota before first run (DSv3 limit is typically 5000 vCPU +# in eastus2euap; check `az vm list-usage --location eastus2euap`). +# ============================================================================= + +network_config_list = [ + { + role = "mesh-1" + vnet_name = "clustermesh-1-vnet" + vnet_address_space = "10.1.0.0/16" + subnet = [ + { + name = "clustermesh-1-node" + address_prefix = "10.1.0.0/24" + }, + { + name = "clustermesh-1-pod" + address_prefix = "10.1.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-2" + vnet_name = "clustermesh-2-vnet" + vnet_address_space = "10.2.0.0/16" + subnet = [ + { + name = "clustermesh-2-node" + address_prefix = "10.2.0.0/24" + }, + { + name = "clustermesh-2-pod" + address_prefix = "10.2.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-3" + vnet_name = "clustermesh-3-vnet" + vnet_address_space = "10.3.0.0/16" + subnet = [ + { + name = "clustermesh-3-node" + address_prefix = "10.3.0.0/24" + }, + { + name = "clustermesh-3-pod" + address_prefix = "10.3.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-4" + vnet_name = "clustermesh-4-vnet" + vnet_address_space = "10.4.0.0/16" + subnet = [ + { + name = "clustermesh-4-node" + address_prefix = "10.4.0.0/24" + }, + { + name = "clustermesh-4-pod" + address_prefix = "10.4.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-5" + vnet_name = "clustermesh-5-vnet" + vnet_address_space = "10.5.0.0/16" + subnet = [ + { + name = "clustermesh-5-node" + address_prefix = "10.5.0.0/24" + }, + { + name = "clustermesh-5-pod" + address_prefix = "10.5.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + 
role = "mesh-6" + vnet_name = "clustermesh-6-vnet" + vnet_address_space = "10.6.0.0/16" + subnet = [ + { + name = "clustermesh-6-node" + address_prefix = "10.6.0.0/24" + }, + { + name = "clustermesh-6-pod" + address_prefix = "10.6.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-7" + vnet_name = "clustermesh-7-vnet" + vnet_address_space = "10.7.0.0/16" + subnet = [ + { + name = "clustermesh-7-node" + address_prefix = "10.7.0.0/24" + }, + { + name = "clustermesh-7-pod" + address_prefix = "10.7.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-8" + vnet_name = "clustermesh-8-vnet" + vnet_address_space = "10.8.0.0/16" + subnet = [ + { + name = "clustermesh-8-node" + address_prefix = "10.8.0.0/24" + }, + { + name = "clustermesh-8-pod" + address_prefix = "10.8.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-9" + vnet_name = "clustermesh-9-vnet" + vnet_address_space = "10.9.0.0/16" + subnet = [ + { + name = "clustermesh-9-node" + address_prefix = "10.9.0.0/24" + }, + { + name = "clustermesh-9-pod" + address_prefix = "10.9.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-10" + vnet_name = "clustermesh-10-vnet" + vnet_address_space = "10.10.0.0/16" + subnet = [ + { + name = "clustermesh-10-node" + address_prefix = "10.10.0.0/24" + }, + { + name = "clustermesh-10-pod" + address_prefix = "10.10.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_cli_config_list = [ + { + role = "mesh-1" + aks_name = "clustermesh-1" + sku_tier = "Standard" + subnet_name = "clustermesh-1-node" + pod_subnet_name = "clustermesh-1-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-2" + aks_name = "clustermesh-2" + sku_tier = "Standard" + subnet_name = "clustermesh-2-node" + pod_subnet_name = "clustermesh-2-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-3" + aks_name = "clustermesh-3" + sku_tier = "Standard" + subnet_name = "clustermesh-3-node" + pod_subnet_name = "clustermesh-3-pod" + use_aks_preview_cli_extension = true + + optional_parameters 
= [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-4" + aks_name = "clustermesh-4" + sku_tier = "Standard" + subnet_name = "clustermesh-4-node" + pod_subnet_name = "clustermesh-4-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-5" + aks_name = "clustermesh-5" + sku_tier = "Standard" + subnet_name = "clustermesh-5-node" + pod_subnet_name = "clustermesh-5-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-6" + aks_name = "clustermesh-6" + sku_tier = "Standard" + subnet_name = "clustermesh-6-node" + pod_subnet_name = "clustermesh-6-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-7" + aks_name = "clustermesh-7" + sku_tier = "Standard" + subnet_name = "clustermesh-7-node" + pod_subnet_name = "clustermesh-7-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + 
extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-8" + aks_name = "clustermesh-8" + sku_tier = "Standard" + subnet_name = "clustermesh-8-node" + pod_subnet_name = "clustermesh-8-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-9" + aks_name = "clustermesh-9" + sku_tier = "Standard" + subnet_name = "clustermesh-9-node" + pod_subnet_name = "clustermesh-9-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-10" + aks_name = "clustermesh-10" + sku_tier = "Standard" + subnet_name = "clustermesh-10-node" + pod_subnet_name = "clustermesh-10-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + } +] + +# ============================================================================= +# Fleet + ClusterMesh +# ============================================================================= +vnet_peering_config = { + enabled = true +} + +fleet_config = { + enabled = true + fleet_name = "clustermesh-flt" + cmp_name = "clustermesh-cmp" + member_label_key = "mesh" + member_label_value = "true" + members = [ + { member_name = "mesh-1", aks_role = "mesh-1" }, + { member_name = "mesh-2", aks_role = "mesh-2" }, + { member_name = "mesh-3", aks_role = "mesh-3" }, + { member_name = "mesh-4", aks_role = "mesh-4" }, + { member_name = "mesh-5", aks_role = "mesh-5" }, + { member_name = "mesh-6", aks_role = "mesh-6" }, + { member_name = "mesh-7", aks_role = "mesh-7" }, + { member_name = "mesh-8", aks_role = "mesh-8" }, + { member_name = "mesh-9", aks_role = "mesh-9" }, + { member_name = "mesh-10", aks_role = "mesh-10" } + ] +} diff --git 
a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
index 535bdba5a7..fcc90c2bb9 100644
--- a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
+++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
@@ -91,14 +91,28 @@ aks_cli_config_list = [
     { name = "max-pods", value = "110" },
   ]

-  # Default pool sizing: D4s_v5 (4 vCPU / 16GB) is enough for the workload
-  # pods alone. Prometheus is pinned to prompool below — without that
-  # split, Prometheus's 1Gi+ memory request co-tenanting on default-pool
-  # nodes caused per-node CPU overcommit (~160% allocatable) and left
-  # workload pods stuck Pending.
+  # Default pool sizing: 20 nodes × D4s_v5 (4 vCPU / 16GB).
+  #
+  # 20 nodes per cluster is the spec baseline (scale testing.txt line 24:
+  # "20-node clusters as the baseline unit"). The workload sits on this
+  # pool; Prometheus is pinned to prompool below to avoid the per-node CPU
+  # overcommit + Pending pods we hit when Prometheus co-tenanted with the
+  # workload at smaller node counts.
+  #
+  # SKU choice — D4s_v5 (narrowed during the scenario #6 smoke iteration,
+  # 2026-05-15 subscription switch): 4 vCPU / 16GB / Premium SSD, Ice Lake
+  # v5 generation. Switched from D4ds_v4 because we moved this pipeline to
+  # subscription 37deca37-... ("Azure Network Agent - Standalone Test")
+  # to dodge RG-count quota pressure on the original 9b8218f9-...
+  # subscription. On 37deca37 the DDSv4 family has only 100 vCPU quota
+  # (need 160+ at n=2), but DSv5 has a 1000 vCPU quota with 920 free, so
+  # D4s_v5/D8s_v5 fits with headroom. Larger tiers (n5/n10/n20) still
+  # need quota planning on the new sub before promotion.
+  # Performance for our workload (mostly idle pause pods + cilium-agent
+  # + the CL2 measurement client) is not bound by CPU generation.
   default_node_pool = {
     name                 = "default"
-    node_count           = 2
+    node_count           = 20
     auto_scaling_enabled = false
     vm_size              = "Standard_D4s_v5"
   }
@@ -108,15 +122,15 @@ aks_cli_config_list = [
   # only on this label, so it doesn't compete with workload pods. Mirrors
   # the `prompool` pattern from
   # scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars.
-  # D8s_v3 (8 vCPU / 32GB) is sized for our 1Gi-request Prometheus with
-  # ample headroom — much smaller than #1053's D32s_v5 because our
-  # workload spec is also much smaller.
+  # D8s_v5 (8 vCPU / 32GB) is sized for our 1Gi-request Prometheus with
+  # ample headroom; matches the family swap of the default pool (the DSv5
+  # quota of 1000 vCPU on subscription 37deca37 fits n=2 with margin).
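+  # (Worked math for "need 160+ at n=2": 2 clusters × 20 nodes × 4 vCPU
+  #  = 160 default-pool vCPU, plus 2 × 8 = 16 prompool vCPU, for 176 vCPU
+  #  total on the DSv5 family against the 920 free.)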
  extra_node_pool = [
    {
      name                 = "prompool"
      node_count           = 1
      auto_scaling_enabled = false
-     vm_size              = "Standard_D8s_v3"
+     vm_size              = "Standard_D8s_v5"
      optional_parameters = [
        { name = "labels", value = "prometheus=true" },
      ]
@@ -141,7 +155,7 @@ aks_cli_config_list = [

   default_node_pool = {
     name                 = "default"
-    node_count           = 2
+    node_count           = 20
     auto_scaling_enabled = false
     vm_size              = "Standard_D4s_v5"
   }
@@ -150,7 +164,7 @@ aks_cli_config_list = [
      name                 = "prompool"
      node_count           = 1
      auto_scaling_enabled = false
-     vm_size              = "Standard_D8s_v3"
+     vm_size              = "Standard_D8s_v5"
      optional_parameters = [
        { name = "labels", value = "prometheus=true" },
      ]
diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars
new file mode 100644
index 0000000000..26a94dbabd
--- /dev/null
+++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars
@@ -0,0 +1,1109 @@
+scenario_type  = "perf-eval"
+scenario_name  = "clustermesh-scale"
+deletion_delay = "4h"
+owner          = "aks"
+
+# =============================================================================
+# ClusterMesh Scale Test — 20 cluster tier
+#
+# Same shape as azure-2.tfvars (see that file for full sizing rationale on
+# pod CIDR, max-pods, prompool, etc.). This file scales the cluster count
+# only; per-cluster node counts match the n2 tier, so cluster count is the
+# dominant variable when comparing tier results. (n2 itself has since moved
+# to DSv5 SKUs for subscription-quota reasons; this tier stays on DSv3
+# pending quota planning on the new sub; see the azure-2.tfvars comments.)
+#
+# Generated topology:
+#   - 20 VNets (one per cluster) at 10.<id>.0.0/16, id=1..20
+#   - 20 AKS clusters (Cilium+ACNS, Azure CNI w/ pod subnet)
+#   - 380 VNet peering links (N*(N-1) at separate-VNet mode)
+#   - 20 Fleet members (label mesh=true) + 1 clustermeshprofile
+#
+# Subscription footprint per run (20-node baseline per spec line 24):
+#   - default pool: 20 clusters x 20 nodes x D4s_v3 (4 vCPU) = 1600 vCPU (DSv3 family)
+#   - prompool:     20 clusters x 1 node  x D8s_v3 (8 vCPU) =  160 vCPU (DSv3 family)
+#   - total DSv3 compute: 1760 vCPU
+# Verify region quota before first run (the DSv3 limit is typically 5000
+# vCPU in eastus2euap; check `az vm list-usage --location eastus2euap`).
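+# A concrete spot-check (illustrative; the JMESPath filter and the exact
+# family name string are assumptions, so adjust against your CLI output):
+#   az vm list-usage --location eastus2euap -o table \
+#     --query "[?contains(name.value, 'standardDSv3Family')]"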
+# ============================================================================= + +network_config_list = [ + { + role = "mesh-1" + vnet_name = "clustermesh-1-vnet" + vnet_address_space = "10.1.0.0/16" + subnet = [ + { + name = "clustermesh-1-node" + address_prefix = "10.1.0.0/24" + }, + { + name = "clustermesh-1-pod" + address_prefix = "10.1.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-2" + vnet_name = "clustermesh-2-vnet" + vnet_address_space = "10.2.0.0/16" + subnet = [ + { + name = "clustermesh-2-node" + address_prefix = "10.2.0.0/24" + }, + { + name = "clustermesh-2-pod" + address_prefix = "10.2.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-3" + vnet_name = "clustermesh-3-vnet" + vnet_address_space = "10.3.0.0/16" + subnet = [ + { + name = "clustermesh-3-node" + address_prefix = "10.3.0.0/24" + }, + { + name = "clustermesh-3-pod" + address_prefix = "10.3.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-4" + vnet_name = "clustermesh-4-vnet" + vnet_address_space = "10.4.0.0/16" + subnet = [ + { + name = "clustermesh-4-node" + address_prefix = "10.4.0.0/24" + }, + { + name = "clustermesh-4-pod" + address_prefix = "10.4.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-5" + vnet_name = "clustermesh-5-vnet" + vnet_address_space = "10.5.0.0/16" + subnet = [ + { + name = "clustermesh-5-node" + address_prefix = "10.5.0.0/24" + }, + { + name = "clustermesh-5-pod" + address_prefix = "10.5.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-6" + vnet_name = "clustermesh-6-vnet" + vnet_address_space = "10.6.0.0/16" + subnet = [ + { + name = "clustermesh-6-node" + address_prefix = "10.6.0.0/24" + }, + { + name = "clustermesh-6-pod" + address_prefix = "10.6.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-7" + vnet_name = "clustermesh-7-vnet" + vnet_address_space = "10.7.0.0/16" + subnet = [ + { + name = "clustermesh-7-node" + address_prefix = "10.7.0.0/24" + }, + { + name = "clustermesh-7-pod" + address_prefix = "10.7.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-8" + vnet_name = "clustermesh-8-vnet" + vnet_address_space = "10.8.0.0/16" + subnet = [ + { + name = "clustermesh-8-node" + address_prefix = "10.8.0.0/24" + }, + { + name = "clustermesh-8-pod" + address_prefix = "10.8.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-9" + vnet_name = "clustermesh-9-vnet" + vnet_address_space = "10.9.0.0/16" + subnet = [ + { + name = "clustermesh-9-node" + address_prefix = "10.9.0.0/24" + }, + { + name = "clustermesh-9-pod" + address_prefix = "10.9.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-10" + vnet_name = "clustermesh-10-vnet" + vnet_address_space = "10.10.0.0/16" + subnet = [ + { + name = "clustermesh-10-node" + address_prefix = "10.10.0.0/24" + }, + { + name = "clustermesh-10-pod" + address_prefix = "10.10.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, 
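+  # (mesh-11..mesh-20 below repeat the same stanza. When a field has to
+  # change across all 20, a throwaway generator is less error-prone than
+  # hand-editing. A minimal bash sketch, illustrative only and not part of
+  # the pipeline; strip the leading "# " to run:
+  #
+  # for i in $(seq 1 20); do
+  # cat <<EOF
+  #   {
+  #     role                        = "mesh-${i}"
+  #     vnet_name                   = "clustermesh-${i}-vnet"
+  #     vnet_address_space          = "10.${i}.0.0/16"
+  #     subnet = [
+  #       { name = "clustermesh-${i}-node", address_prefix = "10.${i}.0.0/24" },
+  #       { name = "clustermesh-${i}-pod",  address_prefix = "10.${i}.4.0/22" }
+  #     ]
+  #     network_security_group_name = ""
+  #     nic_public_ip_associations  = []
+  #     nsr_rules                   = []
+  #   },
+  # EOF
+  # done
+  # )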
+ { + role = "mesh-11" + vnet_name = "clustermesh-11-vnet" + vnet_address_space = "10.11.0.0/16" + subnet = [ + { + name = "clustermesh-11-node" + address_prefix = "10.11.0.0/24" + }, + { + name = "clustermesh-11-pod" + address_prefix = "10.11.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-12" + vnet_name = "clustermesh-12-vnet" + vnet_address_space = "10.12.0.0/16" + subnet = [ + { + name = "clustermesh-12-node" + address_prefix = "10.12.0.0/24" + }, + { + name = "clustermesh-12-pod" + address_prefix = "10.12.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-13" + vnet_name = "clustermesh-13-vnet" + vnet_address_space = "10.13.0.0/16" + subnet = [ + { + name = "clustermesh-13-node" + address_prefix = "10.13.0.0/24" + }, + { + name = "clustermesh-13-pod" + address_prefix = "10.13.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-14" + vnet_name = "clustermesh-14-vnet" + vnet_address_space = "10.14.0.0/16" + subnet = [ + { + name = "clustermesh-14-node" + address_prefix = "10.14.0.0/24" + }, + { + name = "clustermesh-14-pod" + address_prefix = "10.14.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-15" + vnet_name = "clustermesh-15-vnet" + vnet_address_space = "10.15.0.0/16" + subnet = [ + { + name = "clustermesh-15-node" + address_prefix = "10.15.0.0/24" + }, + { + name = "clustermesh-15-pod" + address_prefix = "10.15.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-16" + vnet_name = "clustermesh-16-vnet" + vnet_address_space = "10.16.0.0/16" + subnet = [ + { + name = "clustermesh-16-node" + address_prefix = "10.16.0.0/24" + }, + { + name = "clustermesh-16-pod" + address_prefix = "10.16.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-17" + vnet_name = "clustermesh-17-vnet" + vnet_address_space = "10.17.0.0/16" + subnet = [ + { + name = "clustermesh-17-node" + address_prefix = "10.17.0.0/24" + }, + { + name = "clustermesh-17-pod" + address_prefix = "10.17.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-18" + vnet_name = "clustermesh-18-vnet" + vnet_address_space = "10.18.0.0/16" + subnet = [ + { + name = "clustermesh-18-node" + address_prefix = "10.18.0.0/24" + }, + { + name = "clustermesh-18-pod" + address_prefix = "10.18.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-19" + vnet_name = "clustermesh-19-vnet" + vnet_address_space = "10.19.0.0/16" + subnet = [ + { + name = "clustermesh-19-node" + address_prefix = "10.19.0.0/24" + }, + { + name = "clustermesh-19-pod" + address_prefix = "10.19.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-20" + vnet_name = "clustermesh-20-vnet" + vnet_address_space = "10.20.0.0/16" + subnet = [ + { + name = "clustermesh-20-node" + address_prefix = "10.20.0.0/24" + }, + { + name = "clustermesh-20-pod" + address_prefix = "10.20.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_cli_config_list = [ + { + role = 
"mesh-1" + aks_name = "clustermesh-1" + sku_tier = "Standard" + subnet_name = "clustermesh-1-node" + pod_subnet_name = "clustermesh-1-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-2" + aks_name = "clustermesh-2" + sku_tier = "Standard" + subnet_name = "clustermesh-2-node" + pod_subnet_name = "clustermesh-2-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-3" + aks_name = "clustermesh-3" + sku_tier = "Standard" + subnet_name = "clustermesh-3-node" + pod_subnet_name = "clustermesh-3-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-4" + aks_name = "clustermesh-4" + sku_tier = "Standard" + subnet_name = "clustermesh-4-node" + pod_subnet_name = "clustermesh-4-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-5" + aks_name = "clustermesh-5" + sku_tier = "Standard" + subnet_name = "clustermesh-5-node" + pod_subnet_name = "clustermesh-5-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = 
"enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-6" + aks_name = "clustermesh-6" + sku_tier = "Standard" + subnet_name = "clustermesh-6-node" + pod_subnet_name = "clustermesh-6-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-7" + aks_name = "clustermesh-7" + sku_tier = "Standard" + subnet_name = "clustermesh-7-node" + pod_subnet_name = "clustermesh-7-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-8" + aks_name = "clustermesh-8" + sku_tier = "Standard" + subnet_name = "clustermesh-8-node" + pod_subnet_name = "clustermesh-8-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-9" + aks_name = "clustermesh-9" + sku_tier = "Standard" + subnet_name = "clustermesh-9-node" + pod_subnet_name = "clustermesh-9-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", 
value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-10" + aks_name = "clustermesh-10" + sku_tier = "Standard" + subnet_name = "clustermesh-10-node" + pod_subnet_name = "clustermesh-10-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-11" + aks_name = "clustermesh-11" + sku_tier = "Standard" + subnet_name = "clustermesh-11-node" + pod_subnet_name = "clustermesh-11-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-12" + aks_name = "clustermesh-12" + sku_tier = "Standard" + subnet_name = "clustermesh-12-node" + pod_subnet_name = "clustermesh-12-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-13" + aks_name = "clustermesh-13" + sku_tier = "Standard" + subnet_name = "clustermesh-13-node" + pod_subnet_name = "clustermesh-13-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-14" + aks_name = "clustermesh-14" + sku_tier = "Standard" + subnet_name = "clustermesh-14-node" + pod_subnet_name = "clustermesh-14-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = 
"azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-15" + aks_name = "clustermesh-15" + sku_tier = "Standard" + subnet_name = "clustermesh-15-node" + pod_subnet_name = "clustermesh-15-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-16" + aks_name = "clustermesh-16" + sku_tier = "Standard" + subnet_name = "clustermesh-16-node" + pod_subnet_name = "clustermesh-16-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-17" + aks_name = "clustermesh-17" + sku_tier = "Standard" + subnet_name = "clustermesh-17-node" + pod_subnet_name = "clustermesh-17-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-18" + aks_name = "clustermesh-18" + sku_tier = "Standard" + subnet_name = "clustermesh-18-node" + pod_subnet_name = "clustermesh-18-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + 
auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-19" + aks_name = "clustermesh-19" + sku_tier = "Standard" + subnet_name = "clustermesh-19-node" + pod_subnet_name = "clustermesh-19-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-20" + aks_name = "clustermesh-20" + sku_tier = "Standard" + subnet_name = "clustermesh-20-node" + pod_subnet_name = "clustermesh-20-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + } +] + +# ============================================================================= +# Fleet + ClusterMesh +# ============================================================================= +vnet_peering_config = { + enabled = true +} + +fleet_config = { + enabled = true + fleet_name = "clustermesh-flt" + cmp_name = "clustermesh-cmp" + member_label_key = "mesh" + member_label_value = "true" + members = [ + { member_name = "mesh-1", aks_role = "mesh-1" }, + { member_name = "mesh-2", aks_role = "mesh-2" }, + { member_name = "mesh-3", aks_role = "mesh-3" }, + { member_name = "mesh-4", aks_role = "mesh-4" }, + { member_name = "mesh-5", aks_role = "mesh-5" }, + { member_name = "mesh-6", aks_role = "mesh-6" }, + { member_name = "mesh-7", aks_role = "mesh-7" }, + { member_name = "mesh-8", aks_role = "mesh-8" }, + { member_name = "mesh-9", aks_role = "mesh-9" }, + { member_name = "mesh-10", aks_role = "mesh-10" }, + { member_name = "mesh-11", aks_role = "mesh-11" }, + { member_name = "mesh-12", aks_role = "mesh-12" }, + { member_name = "mesh-13", aks_role = "mesh-13" }, + { member_name = "mesh-14", aks_role = "mesh-14" }, + { member_name = "mesh-15", aks_role = "mesh-15" }, + { member_name = "mesh-16", aks_role = "mesh-16" }, + { member_name = "mesh-17", aks_role = "mesh-17" }, + { member_name = "mesh-18", aks_role = "mesh-18" }, + { member_name = "mesh-19", aks_role = "mesh-19" }, + { member_name = "mesh-20", aks_role = "mesh-20" } + ] +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars new file mode 100644 index 0000000000..d36788938a --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars @@ -0,0 +1,314 @@ +scenario_type = "perf-eval" 
+scenario_name = "clustermesh-scale" +deletion_delay = "4h" +owner = "aks" + +# ============================================================================= +# ClusterMesh Scale Test — 5 cluster tier +# +# Same shape as azure-2.tfvars (see that file for full sizing rationale on +# pod CIDR, max-pods, prompool, etc.). This file scales the cluster count +# only; per-cluster sizing is identical to the n2 tier so cluster-count is +# the only variable when comparing tier results. +# +# Generated topology: +# - 5 VNets (one per cluster) at 10..0.0/16, id=1..5 +# - 5 AKS clusters (Cilium+ACNS, Azure CNI w/ pod subnet) +# - 20 VNet peering links (N*(N-1) at separate-VNet mode) +# - 5 Fleet members (label mesh=true) + 1 clustermeshprofile +# +# Subscription footprint per run (20-node baseline per spec line 24): +# - default pool: 5 clusters x 20 nodes x D4s_v3 (4 vCPU) = 400 vCPU (DSv3 family) +# - prompool: 5 clusters x 1 node x D8s_v3 (8 vCPU) = 40 vCPU (DSv3 family) +# - total DSv3 compute: 440 vCPU +# Verify region quota before first run (DSv3 limit is typically 5000 vCPU +# in eastus2euap; check `az vm list-usage --location eastus2euap`). +# ============================================================================= + +network_config_list = [ + { + role = "mesh-1" + vnet_name = "clustermesh-1-vnet" + vnet_address_space = "10.1.0.0/16" + subnet = [ + { + name = "clustermesh-1-node" + address_prefix = "10.1.0.0/24" + }, + { + name = "clustermesh-1-pod" + address_prefix = "10.1.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-2" + vnet_name = "clustermesh-2-vnet" + vnet_address_space = "10.2.0.0/16" + subnet = [ + { + name = "clustermesh-2-node" + address_prefix = "10.2.0.0/24" + }, + { + name = "clustermesh-2-pod" + address_prefix = "10.2.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-3" + vnet_name = "clustermesh-3-vnet" + vnet_address_space = "10.3.0.0/16" + subnet = [ + { + name = "clustermesh-3-node" + address_prefix = "10.3.0.0/24" + }, + { + name = "clustermesh-3-pod" + address_prefix = "10.3.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-4" + vnet_name = "clustermesh-4-vnet" + vnet_address_space = "10.4.0.0/16" + subnet = [ + { + name = "clustermesh-4-node" + address_prefix = "10.4.0.0/24" + }, + { + name = "clustermesh-4-pod" + address_prefix = "10.4.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-5" + vnet_name = "clustermesh-5-vnet" + vnet_address_space = "10.5.0.0/16" + subnet = [ + { + name = "clustermesh-5-node" + address_prefix = "10.5.0.0/24" + }, + { + name = "clustermesh-5-pod" + address_prefix = "10.5.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_cli_config_list = [ + { + role = "mesh-1" + aks_name = "clustermesh-1" + sku_tier = "Standard" + subnet_name = "clustermesh-1-node" + pod_subnet_name = "clustermesh-1-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false 
+ vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-2" + aks_name = "clustermesh-2" + sku_tier = "Standard" + subnet_name = "clustermesh-2-node" + pod_subnet_name = "clustermesh-2-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-3" + aks_name = "clustermesh-3" + sku_tier = "Standard" + subnet_name = "clustermesh-3-node" + pod_subnet_name = "clustermesh-3-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-4" + aks_name = "clustermesh-4" + sku_tier = "Standard" + subnet_name = "clustermesh-4-node" + pod_subnet_name = "clustermesh-4-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-5" + aks_name = "clustermesh-5" + sku_tier = "Standard" + subnet_name = "clustermesh-5-node" + pod_subnet_name = "clustermesh-5-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + } +] + +# ============================================================================= +# Fleet + ClusterMesh +# 
============================================================================= +vnet_peering_config = { + enabled = true +} + +fleet_config = { + enabled = true + fleet_name = "clustermesh-flt" + cmp_name = "clustermesh-cmp" + member_label_key = "mesh" + member_label_value = "true" + members = [ + { member_name = "mesh-1", aks_role = "mesh-1" }, + { member_name = "mesh-2", aks_role = "mesh-2" }, + { member_name = "mesh-3", aks_role = "mesh-3" }, + { member_name = "mesh-4", aks_role = "mesh-4" }, + { member_name = "mesh-5", aks_role = "mesh-5" } + ] +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-10.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-10.json new file mode 100644 index 0000000000..0e2fd02aef --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-10.json @@ -0,0 +1,4 @@ +{ + "run_id": "cmesh10test", + "region": "westus2" +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-20.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-20.json new file mode 100644 index 0000000000..fab49e54a0 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-20.json @@ -0,0 +1,4 @@ +{ + "run_id": "cmesh20test", + "region": "westus2" +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-5.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-5.json new file mode 100644 index 0000000000..6604113763 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-5.json @@ -0,0 +1,4 @@ +{ + "run_id": "cmesh5test", + "region": "westus2" +} diff --git a/steps/engine/clusterloader2/clustermesh-scale/collect.yml b/steps/engine/clusterloader2/clustermesh-scale/collect.yml index 6a879a2c58..f6684d297c 100644 --- a/steps/engine/clusterloader2/clustermesh-scale/collect.yml +++ b/steps/engine/clusterloader2/clustermesh-scale/collect.yml @@ -26,55 +26,213 @@ steps: export MESH_SIZE="${MESH_SIZE:-$CLUSTERMESH_COUNT}" export TEST_TYPE="${TEST_TYPE:-default-config}" export TRIGGER_REASON="${TRIGGER_REASON:-$BUILD_REASON}" + # Phase 4a — pod-churn knobs recorded in each JSONL row so Kusto can + # filter/group on the exact stressor parameters. Non-churn matrix + # entries leave these unset → fall back to 0/"" defaults that + # scale.py collect treats as "not a churn run". + export CL2_CHURN_CYCLES="${CHURN_CYCLES:-0}" + export CL2_CHURN_UP_DURATION="${CHURN_UP_DURATION:-}" + export CL2_CHURN_DOWN_DURATION="${CHURN_DOWN_DURATION:-}" + export CL2_KILL_DURATION_SECONDS="${KILL_DURATION_SECONDS:-0}" + export CL2_KILL_INTERVAL_SECONDS="${KILL_INTERVAL_SECONDS:-0}" + export CL2_KILL_BATCH="${KILL_BATCH:-0}" + # Phase 4b — Scenario #5 (Multi-Cluster Failure Isolation) target context. + # Reused from scenario #4 by convention; used here to special-case the + # per-cluster churn knobs (only the target row carries non-zero kill + # values; peer rows carry zeros even though the share-infra scenario + # was configured with churn knobs). + export CL2_APISERVER_KILL_TARGET_CONTEXT="${APISERVER_KILL_TARGET_CONTEXT:-clustermesh-1}" + # Phase 4b — Scenario #6 (Upper Bound / Saturation) collect knobs. + # Default to empty string so non-saturation test_types skip the + # classifier entirely (zero overhead). For upper-bound test_types, + # the matrix sets these → scale.py collect emits SaturationRung + + # SaturationSummary rows tagging which signal tripped per rung. 
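+          # (Hypothetical shape, for orientation only; the real values come
+          # from the matrix. Something like SATURATION_QPS_LIST="5,10,20,40"
+          # paired with SATURATION_RESTARTS_LIST="0,0,1,3" would describe
+          # four rungs, with the restart signal tripping from rung 3 on.)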
+ export CL2_SATURATION_QPS_LIST="${SATURATION_QPS_LIST:-}" + export CL2_SATURATION_RESTARTS_LIST="${SATURATION_RESTARTS_LIST:-}" clusters=$(cat "$HOME/.kube/clustermesh-clusters.json") cluster_count=$(echo "$clusters" | jq 'length') # Aggregate every per-cluster JSONL into a single TEST_RESULTS_FILE. - # Each line carries `cluster: ` so downstream Kusto queries can - # group/filter by cluster across the mesh. + # Each line carries `cluster: ` and `test_type: ` so + # downstream Kusto queries can group/filter by cluster AND scenario + # across the mesh. mkdir -p "$(dirname "$TEST_RESULTS_FILE")" : > "$TEST_RESULTS_FILE" - for row in $(echo "$clusters" | jq -c '.[]'); do - role=$(echo "$row" | jq -r '.role') - report_dir="${CL2_REPORT_DIR}/${role}" - - if [ ! -d "$report_dir" ]; then - echo "##vso[task.logissue type=warning;] $role: missing report dir $report_dir, skipping" - continue + # Helper: collect one (scenario, cluster) pair. Args: + # $1 scenario name (also used as test_type) + # $2 cluster role + # $3 per-cluster CL2 report dir (already includes scenario subdir + # in share-infra mode; just / in single + # scenario mode) + # $4 result file path + # $5 churn_cycles value (0 to record "not a churn scenario") + # $6 churn_up_duration value ("" to record "not a churn scenario") + # $7 churn_down_duration value + # $8 kill_duration_seconds value + # $9 kill_interval_seconds value + # $10 kill_batch value + # $11 scenario_start_timestamp value + # $12 saturation_qps_list value ("" for non-saturation scenarios) + # $13 saturation_restarts_list value ("" for non-saturation scenarios) + collect_one() { + local _scen="$1" _role="$2" _report="$3" _out="$4" + local _cc="$5" _cu="$6" _cd="$7" _kds="$8" _kis="$9" _kb="${10}" _st="${11}" + local _sqps="${12:-}" _sres="${13:-}" + if [ ! -d "$_report" ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: missing report dir $_report, skipping" + return 1 fi - - # If CL2 errored out before producing junit.xml (e.g. prometheus stack - # setup timeout), skip aggregation for this cluster — scale.py collect - # would crash on the missing file. The execute step already logged a - # warning per-cluster; we don't want to also abort the whole pipeline - # at collect time when partial data may be useful. - if [ ! -f "$report_dir/junit.xml" ]; then - echo "##vso[task.logissue type=warning;] $role: $report_dir/junit.xml not found (CL2 likely failed); skipping collect for this cluster" - continue + if [ ! 
-f "$_report/junit.xml" ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: $_report/junit.xml not found (CL2 likely failed); skipping collect" + return 1 fi - - per_cluster_result="${TEST_RESULTS_FILE%.*}.${role}.${TEST_RESULTS_FILE##*.}" - + local _rc=0 PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ - --cl2_report_dir "$report_dir" \ + --cl2_report_dir "$_report" \ --cloud_info "${CLOUD_INFO:-}" \ --run_id "$RUN_ID" \ --run_url "$RUN_URL" \ - --result_file "$per_cluster_result" \ - --start_timestamp "$START_TIME" \ - --cluster-name "$role" \ + --result_file "$_out" \ + --start_timestamp "$_st" \ + --cluster-name "$_role" \ --cluster-count "$cluster_count" \ --mesh-size "$MESH_SIZE" \ - --test_type "$TEST_TYPE" \ + --test_type "$_scen" \ --namespaces "$CL2_NAMESPACES" \ --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \ --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \ - --trigger_reason "${TRIGGER_REASON:-}" + --churn-cycles "$_cc" \ + --churn-up-duration "$_cu" \ + --churn-down-duration "$_cd" \ + --kill-duration-seconds "$_kds" \ + --kill-interval-seconds "$_kis" \ + --kill-batch "$_kb" \ + --saturation-qps-list "$_sqps" \ + --saturation-restarts-list "$_sres" \ + --trigger_reason "${TRIGGER_REASON:-}" || _rc=$? + if [ "$_rc" -ne 0 ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: scale.py collect exited $_rc; skipping aggregation" + return 1 + fi + if [ ! -f "$_out" ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: per-cluster result file $_out missing after collect; skipping" + return 1 + fi + return 0 + } + + # Helper: set the 7 collect arg vars (cc/cu/cd/kds/kis/kb/st) for a + # given scenario name. For pod-churn-* scenarios, use the matrix-exported + # CL2_CHURN_* / CL2_KILL_* values directly. For non-churn scenarios + # (event-throughput, default-config), emit zeros/empties so the JSONL + # doesn't mis-tag those rows. + # + # Implementation note: an earlier version used `IFS=$'\t' read` to parse + # tab-separated values from a printf string. That was buggy because tab + # is whitespace-IFS and bash collapses consecutive tabs into a single + # delimiter — non-churn scenarios (which had empty cu/cd fields) ended + # up with shifted values. Direct assignment avoids that pitfall. + # + # Also sets sqps/sres for upper-bound (Scenario #6). These vars are + # passed to collect_one as $12/$13; saturation classifier in scale.py + # collect skips when sqps is empty (non-upper-bound scenarios). + set_churn_args_for_scenario() { + local _scen="$1" _st="$2" + case "$_scen" in + pod-churn-*) + cc="$CL2_CHURN_CYCLES" + cu="$CL2_CHURN_UP_DURATION" + cd_v="$CL2_CHURN_DOWN_DURATION" + kds="$CL2_KILL_DURATION_SECONDS" + kis="$CL2_KILL_INTERVAL_SECONDS" + kb="$CL2_KILL_BATCH" + sqps="" + sres="" + ;; + upper-bound) + cc=0 + cu="" + cd_v="" + kds=0 + kis=0 + kb=0 + sqps="$CL2_SATURATION_QPS_LIST" + sres="$CL2_SATURATION_RESTARTS_LIST" + ;; + *) + cc=0 + cu="" + cd_v="" + kds=0 + kis=0 + kb=0 + sqps="" + sres="" + ;; + esac + st="$_st" + } - cat "$per_cluster_result" >> "$TEST_RESULTS_FILE" - done + # Share-infra mode: SHARE_INFRA_META is a JSON array of + # {scenario, start_timestamp} produced by execute.yml. Iterate + # per-scenario × per-cluster, aggregating ALL rows into one blob with + # per-row test_type attribution. 
+ if [ -n "${SHARE_INFRA_META:-}" ] && [ -f "$SHARE_INFRA_META" ]; then + echo "Share-infra collect: reading scenarios from $SHARE_INFRA_META" + scenarios_json=$(cat "$SHARE_INFRA_META") + for sn in $(echo "$scenarios_json" | jq -c '.[]'); do + SCENARIO=$(echo "$sn" | jq -r '.scenario') + SCENARIO_START=$(echo "$sn" | jq -r '.start_timestamp') + echo "----- collecting scenario: $SCENARIO (start=$SCENARIO_START) -----" + set_churn_args_for_scenario "$SCENARIO" "$SCENARIO_START" + for row in $(echo "$clusters" | jq -c '.[]'); do + role=$(echo "$row" | jq -r '.role') + name=$(echo "$row" | jq -r '.name') + report_dir="${CL2_REPORT_DIR}/${SCENARIO}/${role}" + per_cluster_result="${TEST_RESULTS_FILE%.*}.${SCENARIO}.${role}.${TEST_RESULTS_FILE##*.}" + # Phase 4b — Scenario #5 (Isolation) per-cluster churn-knob + # override: only the TARGET cluster's row gets actual kill knobs; + # peer rows stay at zeros (default). This honestly represents + # "kill duration/interval/batch describe what THIS cluster did", + # not "what the scenario was configured to do globally". + # + # The matrix-exported APISERVER_KILL_TARGET_CONTEXT (default + # clustermesh-1) is compared against the cluster's `name` field + # from the discovered-clusters JSON (AKS resource name = kubectl + # context name set by `az aks get-credentials`). + cc_row="$cc"; cu_row="$cu"; cd_row="$cd_v" + kds_row="$kds"; kis_row="$kis"; kb_row="$kb" + if [ "$SCENARIO" = "isolation" ] && [ "$name" = "$CL2_APISERVER_KILL_TARGET_CONTEXT" ]; then + cc_row=0 + cu_row="" + cd_row="" + kds_row="$CL2_KILL_DURATION_SECONDS" + kis_row="$CL2_KILL_INTERVAL_SECONDS" + kb_row="$CL2_KILL_BATCH" + fi + if collect_one "$SCENARIO" "$role" "$report_dir" "$per_cluster_result" \ + "$cc_row" "$cu_row" "$cd_row" "$kds_row" "$kis_row" "$kb_row" "$st" \ + "$sqps" "$sres"; then + cat "$per_cluster_result" >> "$TEST_RESULTS_FILE" + fi + done + done + else + # Single-scenario mode (prod path — unchanged behavior). + set_churn_args_for_scenario "$TEST_TYPE" "$START_TIME" + for row in $(echo "$clusters" | jq -c '.[]'); do + role=$(echo "$row" | jq -r '.role') + report_dir="${CL2_REPORT_DIR}/${role}" + per_cluster_result="${TEST_RESULTS_FILE%.*}.${role}.${TEST_RESULTS_FILE##*.}" + if collect_one "$TEST_TYPE" "$role" "$report_dir" "$per_cluster_result" \ + "$cc" "$cu" "$cd_v" "$kds" "$kis" "$kb" "$st" \ + "$sqps" "$sres"; then + cat "$per_cluster_result" >> "$TEST_RESULTS_FILE" + fi + done + fi echo "Aggregated results from $cluster_count clusters into $TEST_RESULTS_FILE" wc -l "$TEST_RESULTS_FILE" || true diff --git a/steps/engine/clusterloader2/clustermesh-scale/execute.yml b/steps/engine/clusterloader2/clustermesh-scale/execute.yml index cd82bc2d70..fc99f552aa 100644 --- a/steps/engine/clusterloader2/clustermesh-scale/execute.yml +++ b/steps/engine/clusterloader2/clustermesh-scale/execute.yml @@ -40,6 +40,48 @@ steps: export CL2_HOLD_DURATION="$HOLD_DURATION" export CL2_WARMUP_DURATION="$WARMUP_DURATION" export CL2_RESTART_GENERATION="$RESTART_COUNT" + # Phase 4a — Scenario #2 (Pod Churn Stress) knobs. Shell defaults so + # matrix entries that don't set these (event-throughput, default-config) + # silently fall back to the documented Phase 4a defaults rather than + # passing empty strings to argparse type=int. Pod-churn matrix entries + # set these explicitly via auto-exported uppercase matrix vars. 
+ export CL2_CHURN_CYCLES="${CHURN_CYCLES:-5}" + export CL2_CHURN_UP_DURATION="${CHURN_UP_DURATION:-60s}" + export CL2_CHURN_DOWN_DURATION="${CHURN_DOWN_DURATION:-60s}" + export CL2_KILL_DURATION="${KILL_DURATION:-10m}" + export CL2_KILL_INTERVAL_SECONDS="${KILL_INTERVAL_SECONDS:-10}" + export CL2_KILL_BATCH="${KILL_BATCH:-5}" + export CL2_KILL_DURATION_SECONDS="${KILL_DURATION_SECONDS:-600}" + export CL2_KILL_JOB_DEADLINE_SECONDS="${KILL_JOB_DEADLINE_SECONDS:-660}" + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + export CL2_APISERVER_KILL_TARGET_CONTEXT="${APISERVER_KILL_TARGET_CONTEXT:-clustermesh-1}" + export CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS="${APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS:-240}" + export CL2_APISERVER_KILL_OBSERVATION_SECONDS="${APISERVER_KILL_OBSERVATION_SECONDS:-60}" + # Phase 4b — Scenario #7 (HA Configuration Validation) knob. + export CL2_HA_CONFIG_REPLICAS="${HA_CONFIG_REPLICAS:-3}" + # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # node-churner.sh (driven from this script, NOT Method:Exec — see + # config/node-churner.sh header for the design rationale) consumes + # these directly. scale.py configure also writes them into overrides.yaml + # so CL2 templates that reference CL2_NODE_CHURN_* can use them. + export CL2_NODE_CHURN_TARGET_CONTEXT="${NODE_CHURN_TARGET_CONTEXT:-${APISERVER_KILL_TARGET_CONTEXT:-clustermesh-1}}" + export CL2_NODE_CHURN_CYCLES="${NODE_CHURN_CYCLES:-3}" + export CL2_NODE_CHURN_DELTA="${NODE_CHURN_DELTA:-5}" + export CL2_NODE_CHURN_SETTLE_SECONDS="${NODE_CHURN_SETTLE_SECONDS:-60}" + export CL2_NODE_CHURN_SCALE_DURATION_SECONDS="${NODE_CHURN_SCALE_DURATION_SECONDS:-1800}" + export CL2_NODE_CHURN_REPLACE_DURATION_SECONDS="${NODE_CHURN_REPLACE_DURATION_SECONDS:-1500}" + export CL2_NODE_CHURN_COMBINED_DURATION_SECONDS="${NODE_CHURN_COMBINED_DURATION_SECONDS:-3300}" + export CL2_NODE_REPLACE_BATCH_SIZE="${NODE_REPLACE_BATCH_SIZE:-10}" + export CL2_NODE_CHURN_READY_TIMEOUT_SECONDS="${NODE_CHURN_READY_TIMEOUT_SECONDS:-300}" + # Phase 4b — Scenario #6 (Upper Bound / Saturation) knobs. + # upper-bound.yaml consumes these via CL2's DefaultParam template + # func; non-saturation scenarios ignore them. Defaults mirror + # scale.py configure's defaults so a forgotten matrix var falls + # through to the documented 5-rung sweep at 100/500/1500/4000/10000 QPS. + export CL2_SATURATION_QPS_LIST="${SATURATION_QPS_LIST:-100,500,1500,4000,10000}" + export CL2_SATURATION_RESTARTS_LIST="${SATURATION_RESTARTS_LIST:-2,4,8,15,25}" + export CL2_SATURATION_RUNG_DURATION_SECONDS="${SATURATION_RUNG_DURATION_SECONDS:-240}" + export CL2_SATURATION_SETTLE_SECONDS="${SATURATION_SETTLE_SECONDS:-90}" # Same discovery pattern as topology/clustermesh-scale/validate-resources.yml. # We re-run it here rather than relying on a step variable so this engine @@ -58,7 +100,25 @@ steps: echo "Running CL2 across $cluster_count clusters" mkdir -p "$HOME/.kube" - echo "$clusters" > "$HOME/.kube/clustermesh-clusters.json" + # Pre-fetch all kubeconfigs sequentially. This is fast (<5s/cluster) and + # keeps the parallel CL2 fan-out below from racing on `az aks + # get-credentials` writes to ~/.azure (MSAL token cache shared across + # all subsequent CL2 docker containers). 
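+      # Shape of each discovered-cluster row consumed below (role/name pairing
+      # per the tfvars; the rg value comes from discovery and is elided here):
+      #   {"role": "mesh-1", "name": "clustermesh-1", "rg": "<resource-group>"}
+      # After the jq augmentation below, each row additionally carries
+      #   "kubeconfig": "$HOME/.kube/mesh-1.config"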
+ for row in $(echo "$clusters" | jq -c '.[]'); do + name=$(echo "$row" | jq -r '.name') + rg=$(echo "$row" | jq -r '.rg') + role=$(echo "$row" | jq -r '.role') + kubeconfig="$HOME/.kube/$role.config" + KUBECONFIG="$kubeconfig" az aks get-credentials \ + --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors + done + + # Augment clusters JSON with the per-cluster kubeconfig path, then write + # the file consumed by both this step (for parallel fan-out) and + # collect.yml (which only reads role/name/rg and ignores extra fields). + clusters_with_kubeconfig=$(echo "$clusters" | jq --arg home "$HOME" \ + '[.[] | . + {kubeconfig: ($home + "/.kube/" + .role + ".config")}]') + echo "$clusters_with_kubeconfig" > "$HOME/.kube/clustermesh-clusters.json" echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$cluster_count" # CL2 overrides are written once — params are identical for every cluster @@ -68,128 +128,609 @@ steps: --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \ --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \ --operation-timeout "${CL2_OPERATION_TIMEOUT:-15m}" \ + --churn-cycles "$CL2_CHURN_CYCLES" \ + --churn-up-duration "$CL2_CHURN_UP_DURATION" \ + --churn-down-duration "$CL2_CHURN_DOWN_DURATION" \ + --kill-duration "$CL2_KILL_DURATION" \ + --kill-interval-seconds "$CL2_KILL_INTERVAL_SECONDS" \ + --kill-batch "$CL2_KILL_BATCH" \ + --kill-duration-seconds "$CL2_KILL_DURATION_SECONDS" \ + --kill-job-deadline-seconds "$CL2_KILL_JOB_DEADLINE_SECONDS" \ + --apiserver-kill-target-context "$CL2_APISERVER_KILL_TARGET_CONTEXT" \ + --apiserver-kill-recovery-timeout-seconds "$CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS" \ + --apiserver-kill-observation-seconds "$CL2_APISERVER_KILL_OBSERVATION_SECONDS" \ + --ha-config-replicas "$CL2_HA_CONFIG_REPLICAS" \ + --node-churn-target-context "$CL2_NODE_CHURN_TARGET_CONTEXT" \ + --node-churn-cycles "$CL2_NODE_CHURN_CYCLES" \ + --node-churn-delta "$CL2_NODE_CHURN_DELTA" \ + --node-churn-settle-seconds "$CL2_NODE_CHURN_SETTLE_SECONDS" \ + --node-churn-scale-duration-seconds "$CL2_NODE_CHURN_SCALE_DURATION_SECONDS" \ + --node-churn-replace-duration-seconds "$CL2_NODE_CHURN_REPLACE_DURATION_SECONDS" \ + --node-churn-combined-duration-seconds "$CL2_NODE_CHURN_COMBINED_DURATION_SECONDS" \ + --node-replace-batch-size "$CL2_NODE_REPLACE_BATCH_SIZE" \ + --node-churn-ready-timeout-seconds "$CL2_NODE_CHURN_READY_TIMEOUT_SECONDS" \ + --saturation-qps-list "$CL2_SATURATION_QPS_LIST" \ + --saturation-restarts-list "$CL2_SATURATION_RESTARTS_LIST" \ + --saturation-rung-duration-seconds "$CL2_SATURATION_RUNG_DURATION_SECONDS" \ + --saturation-settle-seconds "$CL2_SATURATION_SETTLE_SECONDS" \ --cl2_override_file "${CL2_CONFIG_DIR}/overrides.yaml" - # Per-cluster CL2 fan-out — sequential. Each invocation writes its own - # report dir at ${CL2_REPORT_DIR}//, so collect.yml can iterate the - # same way and tag results with --cluster-name. - failures=0 - for row in $(echo "$clusters" | jq -c '.[]'); do - name=$(echo "$row" | jq -r '.name') - rg=$(echo "$row" | jq -r '.rg') - role=$(echo "$row" | jq -r '.role') + # Phase 4a — pre-stage kubectl into the CL2 config dir so the + # pod-churn-killer.sh script (invoked via Method: Exec from inside + # the CL2 docker container) has a working kubectl binary regardless + # of whether the CL2 image bundles one. 
The cl2_config_dir is + # bind-mounted by run_cl2_command at /root/perf-tests/clusterloader2/config, + # so $CL2_CONFIG_DIR/kubectl on the host becomes accessible at + # /root/perf-tests/clusterloader2/config/kubectl inside the container. + # + # Why this lives in execute.yml rather than the Dockerfile: we don't + # control the CL2 image build (ghcr.io/azure/clusterloader2). Method: + # Exec is the only host-side hook CL2 exposes inside a test run. + # AzDO agents have curl + internet egress to dl.k8s.io (Kubernetes' + # canonical release host). + # + # Non-fatal: a curl failure here logs a warning but does NOT abort + # the step. pod-churn-killer.sh's preflight check exits 127 if the + # binary is missing, which CL2 records as a single measurement + # failure — scale-cycle data still lands cleanly. + if [ ! -x "${CL2_CONFIG_DIR}/kubectl" ]; then + KUBECTL_VERSION="${KUBECTL_VERSION:-v1.30.0}" + echo "Pre-staging kubectl ${KUBECTL_VERSION} for in-container use by Method: Exec scripts" + if curl -sfL -o "${CL2_CONFIG_DIR}/kubectl" \ + "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"; then + chmod 0755 "${CL2_CONFIG_DIR}/kubectl" + "${CL2_CONFIG_DIR}/kubectl" version --client=true --output=yaml | head -3 || true + else + echo "##vso[task.logissue type=warning;] kubectl pre-stage download failed; pod-churn kill phase will fail-soft (script's fallback path)" + rm -f "${CL2_CONFIG_DIR}/kubectl" + fi + fi - echo "====================================================================" - echo " Running CL2 on $role ($name)" - echo "====================================================================" + # Phase 4b — pre-pull CL2 docker image ONCE on the agent before + # parallel fan-out. Without this, scale.py execute-parallel spawns up + # to CL2_MAX_CONCURRENT (default 4, dev pipeline 8) `docker run` + # commands simultaneously, each of which independently pulls + # ghcr.io/azure/clusterloader2:. The parallel pull race against + # ghcr.io's anonymous-rate limit caused mesh-13's CL2 step to fail + # in build 67013 with `context deadline exceeded` on the token + # endpoint. Pre-pulling once means the parallel `docker run`s see + # the image cached locally and skip the pull entirely. + # + # Best-effort: `docker pull` failure here triggers a warning + lets + # the parallel-fanout retry on its own. Most runs will benefit from + # the cache hit; failures behave no worse than before. + echo "Pre-pulling CL2 image ${CL2_IMAGE} on the AzDO agent (sidesteps ghcr.io rate-limit race during parallel fanout)..." + if docker pull "${CL2_IMAGE}" 2>&1 | tail -5; then + echo "Pre-pull succeeded; subsequent docker runs will hit local cache" + else + echo "##vso[task.logissue type=warning;] CL2 image pre-pull failed; per-cluster CL2 invocations will each attempt their own pull (ghcr.io rate-limit risk persists)" + fi - kubeconfig="$HOME/.kube/$role.config" - KUBECONFIG="$kubeconfig" az aks get-credentials \ - --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors + # CL2 execution: single-scenario (default, prod path) or share-infra + # multi-scenario loop (dev pipeline iteration). See plan.md Phase 4b + # section for the design rationale. + # + # Gating env var SHARE_INFRA_SCENARIOS — comma-separated list of CL2 + # config basenames (e.g. "event-throughput,pod-churn-combined"). When + # set, each entry runs sequentially against the same provisioned + # clusters with a 60s settle between scenarios. test_type per row in + # the JSONL is each scenario's own basename. 
When unset, fall through + # to the single-scenario invocation that prod pipeline expects. + overall_rc=0 - report_dir="${CL2_REPORT_DIR}/${role}" - mkdir -p "$report_dir" - - cl2_passed=0 - # Run CL2; collect outcome WITHOUT failing the bash script (so we can - # also inspect junit.xml for internal test failures even when CL2 exits - # 0). Treat as "passed" only if BOTH: - # (a) junit.xml exists (CL2 actually completed and wrote a report) - # (b) junit.xml has zero / elements - # Without (b) we'd silently green-light runs where measurements failed - # — e.g. PodMonitor template substitution producing "", which - # k8s admission rejects but CL2 still writes junit with tags. - PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ - --cl2-image "${CL2_IMAGE}" \ - --cl2-config-dir "${CL2_CONFIG_DIR}" \ - --cl2-report-dir "$report_dir" \ - --cl2-config-file "${CL2_CONFIG_FILE}" \ - --kubeconfig "$kubeconfig" \ - --provider "${CLOUD}" \ - || true - if [ -f "$report_dir/junit.xml" ]; then - # Count failure/error attrs from . - junit_failures=$(grep -oE 'failures="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) - junit_errors=$(grep -oE 'errors="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) - junit_failures=${junit_failures:-0} - junit_errors=${junit_errors:-0} - if [ "$junit_failures" -eq 0 ] && [ "$junit_errors" -eq 0 ]; then - cl2_passed=1 - else - echo "##vso[task.logissue type=warning;] $role: junit.xml reports failures=$junit_failures errors=$junit_errors" - fi + # Scenarios that REQUIRE every cluster's CL2 (and its Prometheus + # scrape window) to overlap the target's stimulus window — bumping + # max_concurrent to mesh_size means all clusters start CL2 + # simultaneously. Used for: + # - isolation: target's pod-churn kill loop runs ON target; + # peer Prometheus must scrape concurrently to + # prove peers stay flat. + # - node-churn-*: stimulus is OUTSIDE CL2 (host-side az aks + # nodepool scale / vmss delete-instances). The + # readiness barrier in node-churner.sh requires + # all clusters' CL2 sentinels to land before + # node ops start — that's only possible if all + # CL2's are running concurrently. + # - upper-bound: saturation testing measures per-cluster + # failure point under aggregate mesh load. If + # peers don't load concurrently, each cluster's + # reading understates the real saturation curve + # (mesh-wide propagation is a function of N×load, + # not load×1). Plus: the in-run rung loop is + # not coordinated across clusters — we accept + # that rung-r on cluster A may overlap rung-(r±1) + # on cluster B in wall-time; the per-rung + # suffix in measurement filenames keeps the + # data attribution clean. + needs_mesh_wide_concurrency() { + local _scen="$1" + case "$_scen" in + isolation|node-churn-scale|node-churn-replace|node-churn-combined|upper-bound) + return 0 + ;; + esac + return 1 + } + + # Scenarios that drive their stimulus via node-churner.sh on the AzDO + # agent (NOT Method:Exec). The launcher returns the PID; the caller + # `wait`s after execute-parallel completes so the timing file is + # finalized before collect runs. + is_node_churn_scenario() { + case "$1" in + node-churn-scale|node-churn-replace|node-churn-combined) return 0 ;; + esac + return 1 + } + + # Scenario #6 (Upper Bound / Saturation) predicate. Used to gate the + # proactive failure-diag dump (runs unconditionally for upper-bound + # like for node-churn, NOT just on rc!=0). 
User direction 2026-05-14: + # be proactive about debug dumps until scenario is end-to-end green; + # remove the unconditional gate once the first n=2 + n=20 are clean. + is_upper_bound_scenario() { + case "$1" in + upper-bound) return 0 ;; + esac + return 1 + } + + # Sentinel dir bind-mounted into every CL2 container at + # /root/perf-tests/clusterloader2/config/sentinels (CL2_CONFIG_DIR is + # bind-mounted at /root/perf-tests/clusterloader2/config). Per-cluster + # CL2 writes ready- when it enters the measurement + # window; node-churner.sh polls for $cluster_count sentinel files + # before starting the first nodepool op. Cleared per scenario so + # stale sentinels from a previous scenario don't fool the barrier. + SENTINEL_DIR="${CL2_CONFIG_DIR}/sentinels" + mkdir -p "$SENTINEL_DIR" + + # Launch node-churner.sh for the named scenario; populates + # NODE_CHURNER_PID. Caller must: + # - mkdir -p the per-cluster target report dir BEFORE calling so + # the churner has a writable place for NodeChurnTimings_*.json + # - call `wait $NODE_CHURNER_PID` after execute-parallel returns + # - unset NODE_CHURNER_PID after wait + launch_node_churner() { + local _scen="$1" _report_dir_base="$2" + # Discover target cluster + kubeconfig from the augmented clusters + # JSON written to $HOME/.kube/clustermesh-clusters.json. The shell + # `$clusters` var in this script is the EARLY discovery output + # WITHOUT the kubeconfig field; using it here gave node-churner an + # empty TARGET_KUBECONFIG arg in build 67126. + local _all _target_role _target_row + _all=$(cat "$HOME/.kube/clustermesh-clusters.json" 2>/dev/null || echo "[]") + _target_role="${CL2_NODE_CHURN_TARGET_CONTEXT}" + # Map role → AKS name + RG. Our tfvars set aks_name == role-derived + # name (e.g., role=mesh-1 → name=clustermesh-1), and `az aks + # get-credentials` writes kubectl context = AKS name. So + # CL2_NODE_CHURN_TARGET_CONTEXT is the AKS cluster name. + _target_row=$(echo "$_all" | jq -c --arg n "$_target_role" '.[] | select(.name==$n)') + if [ -z "$_target_row" ]; then + # Fallback: maybe the user set NODE_CHURN_TARGET_CONTEXT to a role. + _target_row=$(echo "$_all" | jq -c --arg r "$_target_role" '.[] | select(.role==$r)') + fi + if [ -z "$_target_row" ]; then + echo "##vso[task.logissue type=warning;] node-churner: target cluster '${_target_role}' not found in discovered clusters; skipping scenario stimulus" + NODE_CHURNER_PID="" + return 0 fi + local _target_name _target_rg _target_role_field _target_kubeconfig + _target_name=$(echo "$_target_row" | jq -r '.name') + _target_rg=$(echo "$_target_row" | jq -r '.rg') + _target_role_field=$(echo "$_target_row" | jq -r '.role') + _target_kubeconfig=$(echo "$_target_row" | jq -r '.kubeconfig // ""') + + # Per-scenario expected duration (matches the CL2 sleep window). + local _expected_dur + case "$_scen" in + node-churn-scale) _expected_dur="$CL2_NODE_CHURN_SCALE_DURATION_SECONDS" ;; + node-churn-replace) _expected_dur="$CL2_NODE_CHURN_REPLACE_DURATION_SECONDS" ;; + node-churn-combined) _expected_dur="$CL2_NODE_CHURN_COMBINED_DURATION_SECONDS" ;; + *) _expected_dur=1500 ;; + esac + + # Clear sentinels for THIS scenario so the prior scenario's + # leftovers (if any) don't pre-trigger the barrier. + rm -f "$SENTINEL_DIR"/ready-* 2>/dev/null || true - if [ "$cl2_passed" -eq 1 ]; then - echo " $role: CL2 run succeeded" + # Target report dir for NodeChurnTimings_*.json. Pre-create so + # node-churner.sh can write even before CL2 finishes for that + # cluster (CL2 lazy-creates report dirs). 
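+    # Concrete example of the resolved path (follows from the tfvars role/name
+    # mapping): scenario=node-churn-scale with the default target context
+    # clustermesh-1 resolves to role mesh-1, so in share-infra mode the timing
+    # JSON lands at ${CL2_REPORT_DIR}/node-churn-scale/mesh-1/NodeChurnTimings_*.json.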
+ local _target_report_dir="${_report_dir_base}/${_target_role_field}" + mkdir -p "$_target_report_dir" + + local _churner_log="${_target_report_dir}/node-churner.log" + echo "===== node-churner launch: scenario=${_scen} target=${_target_name} rg=${_target_rg} =====" | tee -a "$_churner_log" + + # Background subshell. The churner's EXIT trap restores the pool to + # original count regardless of how the script exits; finalizer + # outcome (cleanup_failed) lands in the timing JSON. + ( + bash "$NODE_CHURNER_SCRIPT" \ + "$_scen" \ + "$_target_name" \ + "$_target_rg" \ + "default" \ + "$_target_report_dir" \ + "$SENTINEL_DIR" \ + "$cluster_count" \ + "$CL2_NODE_CHURN_CYCLES" \ + "$CL2_NODE_CHURN_DELTA" \ + "$CL2_NODE_CHURN_SETTLE_SECONDS" \ + "$CL2_NODE_REPLACE_BATCH_SIZE" \ + "$CL2_NODE_CHURN_READY_TIMEOUT_SECONDS" \ + "$_expected_dur" \ + "$_target_kubeconfig" 2>&1 | tee -a "$_churner_log" + ) & + NODE_CHURNER_PID=$! + echo "node-churner: launched PID=$NODE_CHURNER_PID for scenario=${_scen}; log=${_churner_log}" + } + + # Wait helper — caller invokes after execute-parallel returns. + wait_node_churner() { + local _scen="$1" + if [ -z "${NODE_CHURNER_PID:-}" ]; then + return 0 fi + echo "node-churner: waiting on PID=$NODE_CHURNER_PID for scenario=${_scen}" + local _rc=0 + wait "$NODE_CHURNER_PID" || _rc=$? + if [ "$_rc" -ne 0 ]; then + echo "##vso[task.logissue type=warning;] node-churner: scenario=${_scen} exited rc=${_rc}; check NodeChurnTimings_*.json for scenario_valid / cleanup_failed flags" + fi + NODE_CHURNER_PID="" + return 0 + } + + # Proactive failure-time debug dump — runs after every scenario + # (always for node-churn; on rc!=0 for others). Writes diagnostic + # state to /_debug/scenario-diag-.log so + # postmortem doesn't depend on AzDO retaining stdout. Captures: + # - per-cluster `kubectl get nodes` (Ready state, IPs) + # - per-cluster `kubectl -n kube-system get pods` (mesh + workload pods) + # - per-cluster `cilium clustermesh status` (mesh health) + # - clusters JSON snapshot + # - share-infra meta snapshot + # - node-churner.log + NodeChurnTimings_*.json contents (for node-churn) + # User direction 2026-05-14: assume failure; keep this dump baked + # in until end-to-end node-churn is green. + scenario_failure_diag() { + local _scen="$1" _rc="${2:-0}" + local _diag_dir="${CL2_REPORT_DIR}/_debug" + mkdir -p "$_diag_dir" + local _diag_log="${_diag_dir}/scenario-diag-${_scen}.log" + # Read augmented clusters JSON (has kubeconfig field) — the shell + # `$clusters` var earlier in this script is the EARLY discovery + # output WITHOUT kubeconfig. Build 67126 regression: using + # `$clusters` here caused _kc=null → kubectl context errors. + local _clusters_with_kc + _clusters_with_kc=$(cat "$HOME/.kube/clustermesh-clusters.json" 2>/dev/null || echo "[]") + { + echo "================================================================" + echo "=== scenario-failure-diag: scenario=${_scen} rc=${_rc}" + echo "=== timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" + echo "================================================================" + echo "" + echo "-- clusters JSON (kubeconfig-augmented) --" + echo "$_clusters_with_kc" | jq . 2>&1 || echo "$_clusters_with_kc" + echo "" + if [ -f "${SHARE_INFRA_META:-/nonexistent}" ]; then + echo "-- share-infra meta --" + jq . 
"$SHARE_INFRA_META" 2>&1 || cat "$SHARE_INFRA_META" + echo "" + fi + echo "-- per-cluster state --" + for _row in $(echo "$_clusters_with_kc" | jq -c '.[]'); do + local _role _name _kc + _role=$(echo "$_row" | jq -r '.role') + _name=$(echo "$_row" | jq -r '.name') + _kc=$(echo "$_row" | jq -r '.kubeconfig') + echo "--- cluster ${_role} (${_name}, kubeconfig=${_kc}) ---" + if [ ! -f "$_kc" ]; then + echo "(kubeconfig file missing: ${_kc})" + continue + fi + echo "-- nodes --" + KUBECONFIG="$_kc" kubectl --context "$_name" get nodes -o wide 2>&1 | head -40 || echo "(kubectl get nodes failed)" + echo "-- nodes providerID --" + KUBECONFIG="$_kc" kubectl --context "$_name" get nodes \ + -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.spec.providerID}{"\n"}{end}' 2>&1 | head -40 || true + echo "-- kube-system pods (clustermesh/cilium) --" + KUBECONFIG="$_kc" kubectl --context "$_name" -n kube-system get pods \ + -l 'k8s-app in (clustermesh-apiserver,cilium)' -o wide 2>&1 | head -20 || true + echo "-- recent kube-system events --" + KUBECONFIG="$_kc" kubectl --context "$_name" -n kube-system get events \ + --sort-by=.lastTimestamp 2>&1 | tail -20 || true + echo "" + done + echo "-- sentinel dir contents (${SENTINEL_DIR:-unset}) --" + ls -la "${SENTINEL_DIR:-/nonexistent}" 2>&1 || echo "(sentinel dir missing)" + echo "" + if is_node_churn_scenario "$_scen"; then + echo "-- node-churn timing files + logs --" + find "${CL2_REPORT_DIR}/${_scen}" -name 'NodeChurnTimings_*.json' \ + -o -name 'node-churner*.log' 2>/dev/null | while IFS= read -r _f; do + echo "--- ${_f} ---" + cat "$_f" 2>&1 || true + echo "" + done + fi + if is_upper_bound_scenario "$_scen"; then + echo "-- upper-bound scenario state --" + echo "-- CL2_SATURATION_* env (as passed into CL2) --" + env | grep -E '^CL2_SATURATION_' 2>&1 || echo "(no CL2_SATURATION_* env vars)" + echo "" + echo "-- rendered overrides.yaml (CL2 sees this — verifies scale.py configure landed the saturation knobs) --" + if [ -f "${CL2_CONFIG_DIR}/overrides.yaml" ]; then + grep -E '^CL2_(SATURATION|NAMESPACES|DEPLOYMENTS|REPLICAS)' "${CL2_CONFIG_DIR}/overrides.yaml" 2>&1 || true + else + echo "(${CL2_CONFIG_DIR}/overrides.yaml does not exist)" + fi + echo "" + # Per-cluster: which rung measurement files made it to disk? + # If a rung is missing entirely, the classifier flags rung_completed=false; + # this dump tells postmortem WHY (e.g. CL2 timed out mid-rung, + # Prometheus pod was Pending, restart-burst hung). + for _row in $(echo "$_clusters_with_kc" | jq -c '.[]'); do + local _role _name _kc + _role=$(echo "$_row" | jq -r '.role') + _name=$(echo "$_row" | jq -r '.name') + _kc=$(echo "$_row" | jq -r '.kubeconfig') + # Single-scenario mode: report dir is /. + # Share-infra mode: //. Try both. + local _report_dir="${CL2_REPORT_DIR}/${_scen}/${_role}" + if [ ! -d "$_report_dir" ]; then + _report_dir="${CL2_REPORT_DIR}/${_role}" + fi + echo "--- cluster ${_role} (${_name}) report dir: ${_report_dir} ---" + echo "-- per-rung measurement file counts --" + for _rung in 0 1 2 3 4 5 6 7; do + # CL2 emits filenames like "GenericPrometheusQuery Rung__.json" + # with a SPACE between method and metric name (build 67211 verified). + # Match both space and legacy underscore conventions via "GenericPrometheusQuery*". 
+ local _count + _count=$(find "${_report_dir}" -maxdepth 1 -name "GenericPrometheusQuery*Rung${_rung}_*.json" 2>/dev/null | wc -l) + if [ "$_count" -gt 0 ]; then + echo " Rung${_rung}: ${_count} measurement files" + fi + done + echo "-- junit.xml (CL2 phase pass/fail per rung) --" + if [ -f "${_report_dir}/junit.xml" ]; then + head -200 "${_report_dir}/junit.xml" 2>&1 || true + else + echo "(no junit.xml — CL2 likely failed before gathering measurements)" + fi + echo "-- monitoring/prometheus pod status (saturation can OOM Prom) --" + if [ -f "$_kc" ]; then + KUBECONFIG="$_kc" kubectl --context "$_name" -n monitoring get pods \ + -o wide 2>&1 | head -20 || echo "(kubectl get pods -n monitoring failed)" + echo "-- clustermesh-apiserver pod resource state (OOM/Restart signals) --" + KUBECONFIG="$_kc" kubectl --context "$_name" -n kube-system describe pod \ + -l 'k8s-app=clustermesh-apiserver' 2>&1 \ + | grep -E 'OOMKilled|Last State|Restart Count|Ready:' \ + | head -30 || true + else + echo "(kubeconfig missing: ${_kc})" + fi + echo "" + done + fi + echo "=== end scenario-failure-diag ===" + } 2>&1 | tee -a "$_diag_log" + echo "scenario-failure-diag: wrote ${_diag_log}" + } + + if [ -n "${SHARE_INFRA_SCENARIOS:-}" ]; then + # Trim whitespace from each entry, split on comma. + IFS=',' read -ra SCENARIO_LIST <<< "$SHARE_INFRA_SCENARIOS" + for i in "${!SCENARIO_LIST[@]}"; do + SCENARIO_LIST[$i]="$(echo "${SCENARIO_LIST[$i]}" | xargs)" + done - # Always-on log capture (spec line 35: "Logs: clustermesh-apiserver, - # agent watchers"). Files land in $report_dir/logs/ so they are - # uploaded alongside junit.xml + measurement results when the - # publish step runs. The same files double as immediate - # diagnostics for failed runs (see FAILURE DIAG block below). - log_dir="$report_dir/logs" - mkdir -p "$log_dir" - echo "------- $role: capturing pod logs to $log_dir -------" - # clustermesh-apiserver: all three containers (apiserver / etcd / - # kvstoremesh) — bounded tail, single pod expected. - for c in apiserver etcd kvstoremesh; do - KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ - -l k8s-app=clustermesh-apiserver -c "$c" --tail=4000 \ - > "$log_dir/clustermesh-apiserver-$c.log" 2>&1 || true + # Pre-validate: non-empty, all referenced config files exist, no + # blanks (catches trailing commas, whitespace-only entries). + if [ "${#SCENARIO_LIST[@]}" -eq 0 ]; then + echo "##vso[task.logissue type=error;] SHARE_INFRA_SCENARIOS is set but empty after split" + exit 1 + fi + for s in "${SCENARIO_LIST[@]}"; do + if [ -z "$s" ]; then + echo "##vso[task.logissue type=error;] SHARE_INFRA_SCENARIOS contains empty entry; got: '$SHARE_INFRA_SCENARIOS'" + exit 1 + fi + if [ ! -f "${CL2_CONFIG_DIR}/${s}.yaml" ]; then + echo "##vso[task.logissue type=error;] CL2 config file not found: ${CL2_CONFIG_DIR}/${s}.yaml (from SHARE_INFRA_SCENARIOS=$SHARE_INFRA_SCENARIOS)" + exit 1 + fi done - # cilium-agent: one pod per node — keep tail small to bound size. - KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ - -l k8s-app=cilium --tail=1000 --prefix=true \ - > "$log_dir/cilium-agent.log" 2>&1 || true - # cilium-operator: low-volume control plane. - KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ - -l io.cilium/app=operator --tail=2000 --prefix=true \ - > "$log_dir/cilium-operator.log" 2>&1 || true - - if [ "$cl2_passed" -ne 1 ]; then - # Dump enough state to distinguish prometheus-stack scheduling - # failures from CL2 logic failures. 
Prometheus is the most common - # culprit here — its pod requests 10Gi by default, doesn't fit on - # Standard_D4s_v4. If the pod is Pending with FailedScheduling, the - # describe events make that obvious. + + # Persist the validated scenario list + per-scenario start timestamps + # for downstream collect.yml. Written to the kubeconfig dir alongside + # clustermesh-clusters.json so it's deterministically discoverable. + SHARE_INFRA_META="$HOME/.kube/share-infra-meta.json" + echo "[]" > "$SHARE_INFRA_META" + + echo "=============================================" + echo "Share-infra mode: ${#SCENARIO_LIST[@]} scenarios in this lifecycle: ${SCENARIO_LIST[*]}" + echo "=============================================" + + for i in "${!SCENARIO_LIST[@]}"; do + SCENARIO="${SCENARIO_LIST[$i]}" + scenario_idx=$((i + 1)) + echo "=============================================" + echo "Scenario [${scenario_idx}/${#SCENARIO_LIST[@]}]: ${SCENARIO}" + echo "=============================================" + scenario_start=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + jq --arg name "$SCENARIO" --arg start "$scenario_start" \ + '. += [{"scenario": $name, "start_timestamp": $start}]' \ + "$SHARE_INFRA_META" > "${SHARE_INFRA_META}.tmp" && mv "${SHARE_INFRA_META}.tmp" "$SHARE_INFRA_META" + + # Per-scenario report dir so collect.yml can iterate per-scenario. + # tear_down_prometheus=True so each scenario gets a clean Prom deploy + # (rather than colliding with the previous scenario's leftover + # PodMonitor + scrape config). # - # Note: scale.py passes tear_down_prometheus=False so the stack - # survives this dump (otherwise CL2 would clean up before we look). - echo "------- $role: CL2 FAILURE DIAG -------" - echo "------- node allocatable / requested capacity -------" - KUBECONFIG="$kubeconfig" kubectl get nodes -o wide 2>&1 || true - KUBECONFIG="$kubeconfig" kubectl describe nodes 2>&1 | grep -A 4 "Allocatable\|Allocated resources" | head -40 || true + # Per-scenario max_concurrent override: + # The isolation scenario REQUIRES every peer cluster's Prometheus + # window to overlap the target's 10min churn window — otherwise + # peers in later batches start CL2 AFTER target's churn has + # ended and produce useless rows for the A/B. Bump concurrency to + # mesh_size (== cluster_count) for isolation. Safe at n=20 because + # peers SLEEP during the kill window — 1 heavy container + 19 + # idle ones easily fits the agent. Same override applies to + # node-churn-* scenarios: node-churner.sh's ready-sentinel + # barrier requires every cluster's CL2 to be running before the + # first nodepool op fires. + if needs_mesh_wide_concurrency "$SCENARIO"; then + EFFECTIVE_MAX_CONCURRENT="${cluster_count}" + echo "Scenario ${SCENARIO}: overriding max_concurrent ${CL2_MAX_CONCURRENT:-4} → ${EFFECTIVE_MAX_CONCURRENT} (mesh-wide observation required)" + else + EFFECTIVE_MAX_CONCURRENT="${CL2_MAX_CONCURRENT:-4}" + fi + # Launch the host-side stimulus driver for node-churn-* scenarios + # BEFORE execute-parallel so the churner is ready to consume CL2 + # sentinels as soon as the per-cluster CL2 containers start. 
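+          # node-churner.sh itself is outside this hunk; a minimal sketch of
+          # the ready-sentinel barrier it is assumed to implement before its
+          # first nodepool op (variable names illustrative — see
+          # config/node-churner.sh for the authoritative logic):
+          #
+          #   deadline=$(( $(date +%s) + READY_TIMEOUT_SECONDS ))
+          #   while [ "$(find "$SENTINEL_DIR" -name 'ready-*' 2>/dev/null | wc -l)" -lt "$EXPECTED_CLUSTERS" ]; do
+          #     if [ "$(date +%s)" -ge "$deadline" ]; then
+          #       exit 1   # barrier timeout → stimulus never fires, scenario_valid=false
+          #     fi
+          #     sleep 5
+          #   done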
+          NODE_CHURNER_PID=""
+          if is_node_churn_scenario "$SCENARIO"; then
+            launch_node_churner "$SCENARIO" "${CL2_REPORT_DIR}/${SCENARIO}"
+          fi
+          scenario_rc=0
+          PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute-parallel \
+            --clusters "$HOME/.kube/clustermesh-clusters.json" \
+            --max-concurrent "${EFFECTIVE_MAX_CONCURRENT}" \
+            --worker-script "$WORKER_SCRIPT" \
+            --cl2-image "${CL2_IMAGE}" \
+            --cl2-config-dir "${CL2_CONFIG_DIR}" \
+            --cl2-config-file "${SCENARIO}.yaml" \
+            --cl2-report-dir-base "${CL2_REPORT_DIR}/${SCENARIO}" \
+            --provider "${CLOUD}" \
+            --python-script-file "$PYTHON_SCRIPT_FILE" \
+            --python-workdir "$(pwd)" \
+            --tear-down-prometheus || scenario_rc=$?
-        echo "------- monitoring/* pods -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring get pods -o wide 2>&1 || true
+
+          # Join node-churner BEFORE finalizing scenario_rc — the churner's
+          # finalizer must complete (pool restored to original count) before
+          # the next scenario starts, otherwise the next CL2 invocation
+          # could run against an in-flux topology.
+          wait_node_churner "$SCENARIO"
-        echo "------- monitoring statefulsets -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring get statefulset -o wide 2>&1 || true
+
+          # Proactive failure debug dump (added 2026-05-14 after build 67114).
+          # User direction: assume failure, keep debug logs persistent across
+          # runs; remove only after green. Runs unconditionally for node-churn
+          # AND upper-bound scenarios (both have rich state worth dumping
+          # whether or not CL2 succeeded); runs only on rc!=0 for others.
+          if is_node_churn_scenario "$SCENARIO" || is_upper_bound_scenario "$SCENARIO" || [ "$scenario_rc" -ne 0 ]; then
+            scenario_failure_diag "$SCENARIO" "$scenario_rc"
+          fi
-        echo "------- Prometheus CR (operator input) -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring get prometheus -o yaml 2>&1 | head -80 || true
+
+          # Treat finalizer cleanup_failed as a hard fail of the share-infra
+          # loop — running additional scenarios against a half-scaled cluster
+          # would contaminate their data.
+          if is_node_churn_scenario "$SCENARIO"; then
+            # launch_node_churner writes NodeChurnTimings_*.json under the
+            # target's ROLE-named report dir (e.g. .../node-churn-scale/mesh-1/),
+            # while CL2_NODE_CHURN_TARGET_CONTEXT holds the AKS name
+            # (clustermesh-1) — so glob for the file instead of hard-coding a
+            # context-named path that never exists when role != AKS name.
+            _churner_timing_file=$(find "${CL2_REPORT_DIR}/${SCENARIO}" -name 'NodeChurnTimings_*.json' 2>/dev/null | head -1)
+            if [ -n "$_churner_timing_file" ] && [ -f "$_churner_timing_file" ]; then
+              _cleanup_failed=$(jq -r '.cleanup_failed // false' "$_churner_timing_file")
+              if [ "$_cleanup_failed" = "true" ]; then
+                echo "##vso[task.logissue type=error;] node-churner finalizer FAILED for ${SCENARIO}; aborting remaining share-infra scenarios to avoid contaminating their data on a half-scaled cluster"
+                overall_rc=1
+                break
+              fi
+            fi
+          fi
-        echo "------- prometheus-k8s pod describe -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring describe pod -l app.kubernetes.io/name=prometheus 2>&1 | tail -60 || true
+
+          if [ "$scenario_rc" -ne 0 ]; then
+            echo "##vso[task.logissue type=warning;] Scenario ${SCENARIO} exited rc=${scenario_rc}; subsequent scenarios will continue but the step's final exit reflects this failure"
+            overall_rc=$scenario_rc
+          fi
-        echo "------- prometheus-operator logs (tail 60) -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring logs -l app.kubernetes.io/name=prometheus-operator --tail=60 2>&1 || true
+
+          # Settle between scenarios — gives Cilium time to GC stale
+          # identities/endpoints/services from the previous scenario before
+          # the next scenario's measurement window begins. Last scenario
+          # skips the settle.
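+          # Illustrative spot-check during the settle (not executed by the
+          # pipeline; cilium-dbg identity list assumed available in-pod, like
+          # the cilium-dbg status probe used at validate time):
+          #   kubectl -n kube-system exec ds/cilium -- \
+          #     cilium-dbg identity list | wc -l   # should drain between scenarios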
+ if [ "$scenario_idx" -lt "${#SCENARIO_LIST[@]}" ]; then + echo "Settle 60s between scenarios (kvstore GC + identity slot cooldown)..." + sleep 60 + fi + done - echo "------- monitoring namespace events (recent) -------" - KUBECONFIG="$kubeconfig" kubectl -n monitoring get events --sort-by='.lastTimestamp' 2>&1 | tail -30 || true - echo "------- end CL2 FAILURE DIAG -------" + # Make the meta file available to collect.yml via a step variable — + # written as task.setvariable so the next step in the same job picks it up. + echo "##vso[task.setvariable variable=SHARE_INFRA_META]$SHARE_INFRA_META" - echo "##vso[task.logissue type=warning;] $role: CL2 run failed (junit missing or has failures/errors at $report_dir/junit.xml; continuing other clusters)" - failures=$((failures + 1)) + echo "=============================================" + echo "Share-infra summary: ${#SCENARIO_LIST[@]} scenarios processed, overall_rc=${overall_rc}" + echo "=============================================" + # Phase 4b: do NOT exit with non-zero on per-scenario failure. + # If we did, AzDO's default succeeded() gate on subsequent steps + # (collect + upload + destroy) would SKIP them and we'd lose ALL + # data even when most scenarios succeeded. Instead, emit + # `task.complete result=SucceededWithIssues` so the step shows + # orange in the AzDO UI (not green, not red) while still allowing + # downstream steps to run. Per-scenario failures remain visible + # via the ##vso[task.logissue type=warning] lines emitted in the + # loop above; per-row failures are also queryable in Kusto via + # the status column. + # + # Genuinely catastrophic failures (validation errors above this + # block) still exit 1 — those happen BEFORE any data is gathered + # so skipping downstream is the right call. + if [ "$overall_rc" -ne 0 ]; then + echo "##vso[task.complete result=SucceededWithIssues;]" fi - done + exit 0 + fi - if [ "$failures" -gt 0 ]; then - echo "##vso[task.logissue type=error;] CL2 failed on $failures cluster(s)" - exit 1 + # Single-scenario path (default, unchanged from Phase 4a — prod pipeline + # relies on this). + # + # Bounded-parallel CL2 fan-out across clusters. Each worker invokes + # run-cl2-on-cluster.sh — same per-cluster body the bash for-loop used + # to run sequentially (CL2 invoke + junit gate + log capture + failure + # diag), now with bounded concurrency. CL2_MAX_CONCURRENT defaults to 4 + # at the matrix level (event-throughput.yaml); smaller tiers can lower + # it to 1 to recover sequential behavior if needed. + # + # Same per-scenario override as the share-infra loop above: isolation + # and node-churn-* need mesh-wide concurrent observation. + SINGLE_SCENARIO_BASENAME="${CL2_CONFIG_FILE%.yaml}" + if needs_mesh_wide_concurrency "$SINGLE_SCENARIO_BASENAME"; then + EFFECTIVE_MAX_CONCURRENT="${cluster_count}" + echo "Single-scenario ${SINGLE_SCENARIO_BASENAME}: overriding max_concurrent ${CL2_MAX_CONCURRENT:-4} → ${EFFECTIVE_MAX_CONCURRENT} (mesh-wide observation required)" + else + EFFECTIVE_MAX_CONCURRENT="${CL2_MAX_CONCURRENT:-4}" + fi + # Launch host-side stimulus for node-churn-* in single-scenario mode. 
+ NODE_CHURNER_PID="" + if is_node_churn_scenario "$SINGLE_SCENARIO_BASENAME"; then + launch_node_churner "$SINGLE_SCENARIO_BASENAME" "${CL2_REPORT_DIR}" + fi + single_scenario_rc=0 + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute-parallel \ + --clusters "$HOME/.kube/clustermesh-clusters.json" \ + --max-concurrent "${EFFECTIVE_MAX_CONCURRENT}" \ + --worker-script "$WORKER_SCRIPT" \ + --cl2-image "${CL2_IMAGE}" \ + --cl2-config-dir "${CL2_CONFIG_DIR}" \ + --cl2-config-file "${CL2_CONFIG_FILE}" \ + --cl2-report-dir-base "${CL2_REPORT_DIR}" \ + --provider "${CLOUD}" \ + --python-script-file "$PYTHON_SCRIPT_FILE" \ + --python-workdir "$(pwd)" || single_scenario_rc=$? + wait_node_churner "$SINGLE_SCENARIO_BASENAME" + # Proactive failure debug dump for single-scenario mode too. Run + # unconditionally for node-churn AND upper-bound (rich state worth + # dumping regardless of success); rc!=0 for everything else. + if is_node_churn_scenario "$SINGLE_SCENARIO_BASENAME" || is_upper_bound_scenario "$SINGLE_SCENARIO_BASENAME" || [ "$single_scenario_rc" -ne 0 ]; then + scenario_failure_diag "$SINGLE_SCENARIO_BASENAME" "$single_scenario_rc" fi + # In single-scenario prod mode we DON'T have a share-infra loop to + # break out of, but we still want the AzDO step to surface non-zero + # rc on CL2 failure (prod's existing contract). The churner-finalizer + # cleanup_failed state is logged via the timing JSON (Kusto-visible); + # we don't promote it to step failure here because terraform destroy + # will tear down the cluster regardless. + exit $single_scenario_rc workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: @@ -198,9 +739,11 @@ steps: CLOUD: ${{ parameters.cloud }} REGION: ${{ parameters.region }} PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/scale.py + WORKER_SCRIPT: $(Pipeline.Workspace)/s/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh CL2_IMAGE: ${{ parameters.engine_input.image }} CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/config CL2_CONFIG_FILE: $(cl2_config_file) CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/results CL2_OPERATION_TIMEOUT: ${{ parameters.engine_input.operation_timeout }} + NODE_CHURNER_SCRIPT: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh displayName: "Run CL2 across all clustermesh clusters" diff --git a/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh b/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh new file mode 100755 index 0000000000..c20a66f0f6 --- /dev/null +++ b/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +# Per-cluster CL2 worker for the clustermesh-scale scenario. +# +# Extracted from steps/engine/clusterloader2/clustermesh-scale/execute.yml +# so that scale.py execute-parallel can fan out N copies of this script with +# bounded concurrency. The body MUST stay equivalent to the original +# per-iteration bash for-loop body (CL2 invoke + junit check + log capture + +# failure diag) — see PR #1157 phase 3 for the parallelization rationale. +# +# Per-cluster log capture + failure diag happen IMMEDIATELY when this +# cluster's CL2 finishes — before peer clusters complete — so that +# `kubectl --tail` log windows and `kubectl get events` recency don't age out +# while peers are still running. 
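+#
+# Example invocation (arguments shown symbolically — execute.yml's
+# execute-parallel supplies the real values via scale.py):
+#   ./run-cl2-on-cluster.sh mesh-1 "$HOME/.kube/mesh-1.config" \
+#     "$CL2_REPORT_DIR/mesh-1" "$CL2_IMAGE" "$CL2_CONFIG_DIR" \
+#     event-throughput.yaml "$CLOUD" "$PYTHON_SCRIPT_FILE" "$(pwd)" 0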
+# +# Exit code: +# 0 — CL2 ran AND junit.xml reports failures=0 errors=0 +# 1 — anything else (CL2 didn't write junit, or junit has failures/errors) +# This is the authoritative per-cluster pass/fail signal that +# scale.py execute-parallel aggregates into the step's exit code. +# +# Usage: +# run-cl2-on-cluster.sh \ +# \ +# \ +# \ +# [tear_down_prometheus_flag] +# +# tear_down_prometheus_flag: "1" → pass --tear-down-prometheus to scale.py +# execute. Used by share-infra mode so each scenario's CL2 deploys a fresh +# Prom. "0" or unset → preserve Prom for failure-diagnostic dump (default +# single-scenario behavior). + +set -uo pipefail + +if [ "$#" -lt 9 ] || [ "$#" -gt 10 ]; then + echo "Usage: $0 [tear_down_prometheus_flag]" >&2 + exit 2 +fi + +role="$1" +kubeconfig="$2" +report_dir="$3" +cl2_image="$4" +cl2_config_dir="$5" +cl2_config_file="$6" +provider="$7" +python_script_file="$8" +python_workdir="$9" +tear_down_prometheus_flag="${10:-0}" + +mkdir -p "$report_dir" + +echo "====================================================================" +echo " Running CL2 on $role" +echo "====================================================================" + +cl2_passed=0 +# Run CL2; collect outcome WITHOUT failing on a non-zero exit (so we can +# also inspect junit.xml for internal test failures even when CL2 exits +# 0). Treat as "passed" only if BOTH: +# (a) junit.xml exists (CL2 actually completed and wrote a report) +# (b) junit.xml has zero / elements +# Without (b) we'd silently green-light runs where measurements failed +# — e.g. PodMonitor template substitution producing "", which +# k8s admission rejects but CL2 still writes junit with tags. +exec_extra_args=() +if [ "$tear_down_prometheus_flag" = "1" ]; then + exec_extra_args+=(--tear-down-prometheus) +fi +( + cd "$python_workdir" || exit 1 + PYTHONPATH="${PYTHONPATH:-}:$python_workdir" python3 -u "$python_script_file" execute \ + --cl2-image "$cl2_image" \ + --cl2-config-dir "$cl2_config_dir" \ + --cl2-report-dir "$report_dir" \ + --cl2-config-file "$cl2_config_file" \ + --kubeconfig "$kubeconfig" \ + --provider "$provider" \ + "${exec_extra_args[@]}" +) || true + +if [ -f "$report_dir/junit.xml" ]; then + # Count failure/error attrs from . + junit_failures=$(grep -oE 'failures="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) + junit_errors=$(grep -oE 'errors="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) + junit_failures=${junit_failures:-0} + junit_errors=${junit_errors:-0} + if [ "$junit_failures" -eq 0 ] && [ "$junit_errors" -eq 0 ]; then + cl2_passed=1 + else + echo "##vso[task.logissue type=warning;] $role: junit.xml reports failures=$junit_failures errors=$junit_errors" + fi +fi + +if [ "$cl2_passed" -eq 1 ]; then + echo " $role: CL2 run succeeded" +fi + +# Always-on log capture (spec line 35: "Logs: clustermesh-apiserver, +# agent watchers"). Files land in $report_dir/logs/ so they are +# uploaded alongside junit.xml + measurement results when the +# publish step runs. Capturing PER CLUSTER as soon as that cluster's CL2 +# finishes is important under parallel fan-out: if we waited until all +# peers completed, --tail windows and recent-events queries would age out +# diagnostic data on the cluster that finished first. +log_dir="$report_dir/logs" +mkdir -p "$log_dir" +echo "------- $role: capturing pod logs to $log_dir -------" +# clustermesh-apiserver: all three containers (apiserver / etcd / +# kvstoremesh) — bounded tail, single pod expected. 
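+# Resulting layout under $report_dir/logs/ (filenames follow directly from
+# the commands below):
+#   clustermesh-apiserver-apiserver.log
+#   clustermesh-apiserver-etcd.log
+#   clustermesh-apiserver-kvstoremesh.log
+#   cilium-agent.log
+#   cilium-operator.log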
+for c in apiserver etcd kvstoremesh; do + KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ + -l k8s-app=clustermesh-apiserver -c "$c" --tail=4000 \ + > "$log_dir/clustermesh-apiserver-$c.log" 2>&1 || true +done +# cilium-agent: one pod per node — keep tail small to bound size. +KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ + -l k8s-app=cilium --tail=1000 --prefix=true \ + > "$log_dir/cilium-agent.log" 2>&1 || true +# cilium-operator: low-volume control plane. +KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ + -l io.cilium/app=operator --tail=2000 --prefix=true \ + > "$log_dir/cilium-operator.log" 2>&1 || true + +if [ "$cl2_passed" -ne 1 ]; then + # Dump enough state to distinguish prometheus-stack scheduling + # failures from CL2 logic failures. Prometheus is the most common + # culprit here — its pod requests 10Gi by default, doesn't fit on + # Standard_D4s_v4. If the pod is Pending with FailedScheduling, the + # describe events make that obvious. + # + # Note: scale.py passes tear_down_prometheus=False so the stack + # survives this dump (otherwise CL2 would clean up before we look). + echo "------- $role: CL2 FAILURE DIAG -------" + echo "------- node allocatable / requested capacity -------" + KUBECONFIG="$kubeconfig" kubectl get nodes -o wide 2>&1 || true + KUBECONFIG="$kubeconfig" kubectl describe nodes 2>&1 | grep -A 4 "Allocatable\|Allocated resources" | head -40 || true + + echo "------- monitoring/* pods -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get pods -o wide 2>&1 || true + + echo "------- monitoring statefulsets -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get statefulset -o wide 2>&1 || true + + echo "------- Prometheus CR (operator input) -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get prometheus -o yaml 2>&1 | head -80 || true + + echo "------- prometheus-k8s pod describe -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring describe pod -l app.kubernetes.io/name=prometheus 2>&1 | tail -60 || true + + echo "------- prometheus-operator logs (tail 60) -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring logs -l app.kubernetes.io/name=prometheus-operator --tail=60 2>&1 || true + + echo "------- monitoring namespace events (recent) -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get events --sort-by='.lastTimestamp' 2>&1 | tail -30 || true + echo "------- end CL2 FAILURE DIAG -------" + + echo "##vso[task.logissue type=warning;] $role: CL2 run failed (junit missing or has failures/errors at $report_dir/junit.xml)" + exit 1 +fi + +exit 0 diff --git a/steps/topology/clustermesh-scale/validate-resources.yml b/steps/topology/clustermesh-scale/validate-resources.yml index bfd47a11c6..6f51411cb9 100644 --- a/steps/topology/clustermesh-scale/validate-resources.yml +++ b/steps/topology/clustermesh-scale/validate-resources.yml @@ -44,6 +44,90 @@ steps: echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$count" displayName: "Enumerate clustermesh clusters" + # ---------------------------------------------------------------------------- + # Pre-gate: wait for every cluster's clustermesh-apiserver Deployment to be + # Available AND its Service to have an external LoadBalancer IP, in parallel. + # + # Why this step exists + # -------------------- + # Fleet's ClusterMeshProfile reconciler only pushes a peer's kubeconfig into + # other clusters' apiserver configs once that peer's LB has an external IP. 
+ # If we start the per-cluster peering loop below before every cluster's LB + # is up, the X/Y readout in `cilium-dbg status` stalls at "Y < N-1" — Fleet + # has only pushed the kubeconfigs for the subset of peers that ARE LB-ready, + # and bumping the retry budget in the loop doesn't help because the missing + # peer kubeconfigs will never arrive while their LBs are still pending. + # + # Empirically at N=20, ~25% of clustermesh-apiserver LBs are still pending + # IP assignment when terraform apply returns success, because Azure LB + # provisioning happens asynchronously after Service creation. Per-cluster + # budget is 30 min — longer than any LB tail we've observed. + # ---------------------------------------------------------------------------- + - script: | + set -euo pipefail + set -x + + clusters=$(cat "$HOME/.kube/clustermesh-clusters.json") + cluster_count=$(echo "$clusters" | jq 'length') + + # Sequential kubeconfig fetch — parallel `az aks get-credentials` + # writes race on the shared ~/.azure MSAL token cache (same reason + # execute.yml pre-fetches kubeconfigs sequentially). + for row in $(echo "$clusters" | jq -c '.[]'); do + name=$(echo "$row" | jq -r '.name') + rg=$(echo "$row" | jq -r '.rg') + role=$(echo "$row" | jq -r '.role') + kc="$HOME/.kube/$role.config" + KUBECONFIG="$kc" az aks get-credentials \ + --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors + done + + # Parallel poll for clustermesh-apiserver readiness on every cluster. + # Each subshell gets a 30-min budget; we collect failures rather than + # fail-fast on the first one so the operator sees the full set of + # slow LBs in one shot instead of one cluster at a time. + pids=() + roles=() + for row in $(echo "$clusters" | jq -c '.[]'); do + role=$(echo "$row" | jq -r '.role') + ( + kc="$HOME/.kube/$role.config" + deadline=$(( $(date +%s) + 1800 )) + last_state="" + while [ "$(date +%s)" -lt "$deadline" ]; do + avail=$(KUBECONFIG="$kc" kubectl -n kube-system get deployment clustermesh-apiserver \ + -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null || true) + ip=$(KUBECONFIG="$kc" kubectl -n kube-system get svc clustermesh-apiserver \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [ "$avail" = "True" ] && [ -n "$ip" ]; then + echo "[$role] OK (deployment=Available, LB IP=$ip)" + exit 0 + fi + last_state="deployment=${avail:-}, LB=${ip:-}" + sleep 15 + done + echo "[$role] FAIL: clustermesh-apiserver not ready within 30 min ($last_state)" >&2 + exit 1 + ) & + pids+=("$!") + roles+=("$role") + done + + failed=0 + for i in "${!pids[@]}"; do + if ! wait "${pids[$i]}"; then + echo "##vso[task.logissue type=error;] ${roles[$i]}: clustermesh-apiserver not ready within 30 min" + failed=$((failed + 1)) + fi + done + + if [ "$failed" -gt 0 ]; then + echo "##vso[task.logissue type=error;] $failed of $cluster_count clustermesh-apiserver(s) not ready; peering will not converge" + exit 1 + fi + echo "All $cluster_count clustermesh-apiserver Deployments+LBs ready; Fleet can now push peer configs" + displayName: "Wait for clustermesh-apiserver Deployments + LBs (parallel)" + - script: | set -euo pipefail set -x @@ -71,7 +155,39 @@ steps: echo "--- nodes ---" kubectl get nodes -o wide - kubectl wait --for=condition=Ready nodes --all --timeout=5m + # Wait until ALL nodes reach Ready. 
This was originally a single + # `kubectl wait --timeout=5m` call, but a 5-minute hard timeout is + # brittle when 1-2 of N nodes flap NotReady transiently at + # startup (kubelet image pull, CNI sandbox init). Smoke build + # 67014 hit this — 2 of 21 nodes briefly NotReady, kubectl + # wait timed out, validate step failed, CL2 skipped (~30min + # of provisioned infra wasted). + # + # New behavior: retry-with-resample loop, 15min budget, 30s + # rechecks. Exits as soon as all nodes are Ready; gives a + # final diag dump on timeout (which clusters/nodes are still + # NotReady). + node_ready_deadline=$(( $(date +%s) + 900 )) + while true; do + if kubectl wait --for=condition=Ready nodes --all --timeout=30s >/dev/null 2>&1; then + echo "All nodes Ready" + break + fi + if [ "$(date +%s)" -ge "$node_ready_deadline" ]; then + echo "##vso[task.logissue type=error;] $role: node readiness timeout after 15 min" + echo "--- final node state ---" + kubectl get nodes -o wide || true + echo "--- NotReady nodes describe ---" + for n in $(kubectl get nodes --no-headers 2>/dev/null | awk '$2 != "Ready" {print $1}'); do + echo "--- $n ---" + kubectl describe node "$n" 2>&1 | head -50 || true + done + exit 1 + fi + not_ready=$(kubectl get nodes --no-headers 2>/dev/null | awk '$2 != "Ready"' | wc -l) + echo "$(date -u +%H:%M:%S): ${not_ready} node(s) NotReady, waiting (deadline at $(date -u -d @${node_ready_deadline} +%H:%M:%S))" + sleep 30 + done echo "--- cilium agent pods ---" kubectl -n kube-system get pods -l k8s-app=cilium -o wide @@ -112,8 +228,14 @@ steps: # "configured/connected" first because it counts apiserver clients, # while the in-pod view requires the Secret to be reloaded. We gate on # the in-pod view because the data path needs the agent's local state. + # Mesh convergence retry budget. At N=20 we observed mesh-2 and + # mesh-6 take ~24 min to reach 19/19 connected (initial-sync + Fleet + # member-secret reload at scale). Budget of 120 * 15s = 30 min + # accommodates that slowest-cluster tail. Smaller N (2/5/10) finish + # in <5 min and exit the loop early via the break, so no cost on + # green runs at small N. connected=0 - for i in $(seq 1 60); do + for i in $(seq 1 120); do out=$(kubectl -n kube-system exec ds/cilium -- cilium-dbg status 2>&1 || true) echo "$out" # Parse the "X/Y remote clusters ready" count out of cilium-dbg status. ready=$(echo "$out" | grep -oE '[0-9]+/[0-9]+ remote clusters ready' | cut -d/ -f1) if [ -n "$ready" ] && [ "$ready" -eq "$expected_remote" ]; then connected=1 break fi - # ============== DEBUG-DUMP-BEGIN (REMOVE BEFORE MERGE) ============== - # Every 6 iterations dump richer state: in-pod cilium-cli view of the - # mesh, clustermesh-apiserver pod state, and Fleet-side member status. - # These help diagnose why convergence is stalling. Strip before final - # PR review. - if [ "$((i % 6))" -eq 0 ]; then - echo "------- [debug] retry $i: cilium clustermesh status (runner cli) -------" - cilium clustermesh status --context "$(kubectl config current-context)" --wait=false 2>&1 || true - - echo "------- [debug] retry $i: clustermesh-apiserver pods -------" - kubectl -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide 2>&1 || true - kubectl -n kube-system describe pods -l k8s-app=clustermesh-apiserver 2>&1 | tail -40 || true - - echo "------- [debug] retry $i: clustermesh-apiserver service -------" - # Service of type LoadBalancer for the clustermesh-apiserver. If - # EXTERNAL-IP stays "<pending>", the AKS control-plane identity is - # missing Network Contributor on the VNet (cloud-controller-manager - # cannot provision the internal LB). Look in describe events for - # AuthorizationFailed / forbidden messages.
- kubectl -n kube-system get svc clustermesh-apiserver -o wide 2>&1 || true - kubectl -n kube-system describe svc clustermesh-apiserver 2>&1 | tail -25 || true - - echo "------- [debug] retry $i: cilium agent restarts / readiness -------" - kubectl -n kube-system get pods -l k8s-app=cilium -o wide 2>&1 || true - - echo "------- [debug] retry $i: Fleet ClusterMeshProfile profile-level status -------" - # Profile-level mesh state (NotConnected/Connecting/Connected/Failed) - # plus the last operation error if any. This is the authoritative - # control-plane view of whether the mesh has converged. - az fleet clustermeshprofile show \ - --resource-group "$rg" \ - --fleet-name clustermesh-flt \ - --name clustermesh-cmp \ - --query "{state:properties.status.state, provisioningState:properties.provisioningState, lastError:properties.status.lastOperationError}" \ - -o jsonc 2>&1 || true - - echo "------- [debug] retry $i: Fleet ClusterMeshProfile members (connection state) -------" - # Per-member: provisioningState is just ARM-level (join accepted); - # meshProperties.status.state is the actual Cilium connection state. - az fleet clustermeshprofile list-members \ - --resource-group "$rg" \ - --fleet-name clustermesh-flt \ - --name clustermesh-cmp \ - --query "[].{name:name, provisioning:properties.provisioningState, mesh:properties.meshProperties.status.state, lastUpdated:properties.meshProperties.status.lastUpdatedAt, error:properties.meshProperties.status.error.message}" \ - -o table 2>&1 || true - fi - # =============== DEBUG-DUMP-END (REMOVE BEFORE MERGE) =============== - - echo " waiting for $expected_remote remote clusters to be ready (got $ready), retry $i/60..." - sleep 10 + echo " waiting for $expected_remote remote clusters to be ready (got $ready), retry $i/120..." + sleep 15 done if [ "$connected" -ne 1 ]; then @@ -337,65 +411,6 @@ steps: done if [ "$ok" -ne 1 ]; then - # ============== SMOKE-FAILURE-DEBUG-DUMP (REMOVE BEFORE MERGE) ============== - # On failure, dump enough state to distinguish Cilium global-service - # sync issues from cross-VNet pod-IP routing issues. Specifically: - # 1. cilium clustermesh status — should show "Global services: 1" if sync OK - # 2. cilium service list (in-pod) — should have an entry for cm-smoke/echo - # with remote-cluster backends in cluster 2 - # 3. kubectl describe svc / get endpoints echo — k8s view (cluster 2 should - # have NO local endpoints, that's expected) - # 4. From inside the curl pod: DNS resolve, then direct-IP curl to a - # cluster-1 echo pod IP — bypasses ClusterIP, tests raw L3 across VNets - echo - echo "================ SMOKE FAILURE DIAG (cluster $first_role -- backend) ================" - KUBECONFIG="$kc_first" cilium clustermesh status --context "$(KUBECONFIG="$kc_first" kubectl config current-context)" --wait=false 2>&1 || true - KUBECONFIG="$kc_first" kubectl -n "$ns" describe svc echo 2>&1 || true - KUBECONFIG="$kc_first" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true - KUBECONFIG="$kc_first" kubectl -n "$ns" get pods -l app=echo -o wide 2>&1 || true - echo "------- $first_role: cilium-config (clustermesh-relevant flags) -------" - # Authoritative source for whether the cilium agent is configured to - # process global services. Look for: enable-cluster-mesh, - # cluster-mesh-shared-services, clustermesh-config, identity-allocation-mode, - # enable-services. AKS/ACNS may gate global services with a feature flag. 
- KUBECONFIG="$kc_first" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \ - | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true - echo "------- $first_role: cilium service list (full, head 40) -------" - KUBECONFIG="$kc_first" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true - echo "------- $first_role: cilium-operator logs (tail 60) -------" - KUBECONFIG="$kc_first" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \ - | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true - - echo - echo "================ SMOKE FAILURE DIAG (cluster $second_role -- client) ================" - KUBECONFIG="$kc_second" cilium clustermesh status --context "$(KUBECONFIG="$kc_second" kubectl config current-context)" --wait=false 2>&1 || true - KUBECONFIG="$kc_second" kubectl -n "$ns" describe svc echo 2>&1 || true - KUBECONFIG="$kc_second" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true - echo "------- $second_role: cilium-config (clustermesh-relevant flags) -------" - KUBECONFIG="$kc_second" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \ - | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true - echo "------- $second_role: cilium service list (full, head 40) -------" - KUBECONFIG="$kc_second" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true - echo "------- $second_role: cilium-operator logs (tail 60) -------" - KUBECONFIG="$kc_second" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \ - | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true - - echo - echo "------- DNS + direct-pod-IP probe from curl pod (bypass ClusterIP) -------" - # ClusterIP plumbing is a Cilium-clustermesh concern; direct pod-IP - # connectivity is a VNet-peering concern. Hitting a backend pod IP - # directly disambiguates the two failure modes. - KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- nslookup echo.cm-smoke.svc.cluster.local 2>&1 || true - backend_ip=$(KUBECONFIG="$kc_first" kubectl -n "$ns" get pod -l app=echo -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true) - echo "first cluster's echo pod IP: ${backend_ip:-}" - if [ -n "${backend_ip:-}" ]; then - KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- \ - curl -fsS -m 5 "http://${backend_ip}:8080/hostname" 2>&1 || \ - echo " direct pod-IP curl ALSO failed → cross-VNet routing issue (peering / pod-CIDR routes)" - fi - echo "============================ END SMOKE DIAG ============================" - # =========================== END SMOKE-FAILURE-DEBUG-DUMP =========================== - echo "##vso[task.logissue type=error;] Cross-cluster data-path smoke failed: $second_role could not reach service in $first_role" exit 1 fi