diff --git a/modules/python/clusterloader2/clustermesh-scale/config/annotate-namespaces.sh b/modules/python/clusterloader2/clustermesh-scale/config/annotate-namespaces.sh
new file mode 100755
index 0000000000..9c3fb1b5f3
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/annotate-namespaces.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Annotate workload namespaces for ACNS (managed Cilium) opt-in cross-cluster sync.
+#
+# AKS-managed Cilium ships with `clustermesh-default-global-namespace=false`
+# (opt-in mode, per ACNS team confirmation 2026-05-11 from David Vadas /
+# Isaiah Raya), unlike upstream Cilium, which defaults to opt-out. Without
+# the `clustermesh.cilium.io/global: "true"` annotation on the workload
+# namespace, NONE of the namespace's resources (CiliumIdentity,
+# CiliumEndpoint, CiliumEndpointSlice, Services, ServiceExports) sync
+# across the mesh — even if the Service object itself carries
+# `service.cilium.io/global: "true"`. The namespace annotation is
+# load-bearing; once present, Cilium auto-applies the service-level
+# semantics to all services in that namespace.
+#
+# This script is invoked via `Method: Exec` from each scale-test scenario's
+# top-level CL2 config (event-throughput.yaml, pod-churn-*.yaml). It runs
+# AFTER CL2 has created the test namespaces (`<prefix>-1..N`) and BEFORE the
+# workload deploy phase, so cross-cluster sync is enabled from the first
+# resource creation.
+#
+# The pre-staged kubectl binary at /root/perf-tests/clusterloader2/config/kubectl
+# (set up by steps/engine/clusterloader2/clustermesh-scale/execute.yml) is
+# used because the CL2 image does not bundle kubectl.
+#
+# Positional args:
+#   $1 NAMESPACE_COUNT   How many namespaces (matches CL2's `namespace.number`).
+#   $2 NAMESPACE_PREFIX  Namespace prefix (matches CL2's `namespace.prefix`).
+
+set -u
+set -o pipefail
+
+NAMESPACE_COUNT="${1:-0}"
+NAMESPACE_PREFIX="${2:-}"
+
+if [ -z "${NAMESPACE_PREFIX}" ] || [ "${NAMESPACE_COUNT}" -lt 1 ]; then
+    echo "annotate-namespaces ERROR: need positional args (count, prefix); got count='${NAMESPACE_COUNT}' prefix='${NAMESPACE_PREFIX}'"
+    exit 2
+fi
+
+# Prefer PATH kubectl, fall back to the pre-staged binary the pipeline
+# downloads into the bind-mounted config dir. Mirrors pod-churn-killer.sh's
+# fallback path so both scripts behave consistently if the CL2 image
+# eventually starts bundling kubectl.
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    echo "annotate-namespaces: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "annotate-namespaces ERROR: kubectl not in PATH and pre-staged binary missing"
+    exit 127
+fi
+
+ANNOTATION="clustermesh.cilium.io/global=true"
+echo "annotate-namespaces: applying ${ANNOTATION} to ${NAMESPACE_COUNT} namespaces with prefix '${NAMESPACE_PREFIX}'"
+
+FAIL_COUNT=0
+for i in $(seq 1 "${NAMESPACE_COUNT}"); do
+    NS="${NAMESPACE_PREFIX}-${i}"
+    # --overwrite tolerates re-runs (CL2 retries, multi-step configs). The
+    # namespace MUST already exist — CL2 creates managed namespaces before
+    # the first test step runs. If it's missing here, that's a real bug
+    # worth surfacing as an error (don't --ignore-not-found).
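+    # To spot-check the annotation after a run (illustrative one-liner; the
+    # bracket/escaped-dot jsonpath form is what kubectl expects for keys
+    # containing dots):
+    #   kubectl get ns "${NAMESPACE_PREFIX}-1" \
+    #     -o jsonpath="{.metadata.annotations['clustermesh\.cilium\.io/global']}"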
+    if "${KUBECTL}" annotate namespace "${NS}" "${ANNOTATION}" --overwrite >/dev/null 2>&1; then
+        echo "annotate-namespaces: ${NS} annotated"
+    else
+        echo "annotate-namespaces ERROR: failed to annotate ${NS}"
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+    fi
+done
+
+if [ "${FAIL_COUNT}" -gt 0 ]; then
+    echo "annotate-namespaces: ${FAIL_COUNT}/${NAMESPACE_COUNT} namespaces failed annotation"
+    exit 1
+fi
+
+echo "annotate-namespaces: done, ${NAMESPACE_COUNT} namespaces annotated"
+exit 0
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure-killer.sh b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure-killer.sh
new file mode 100755
index 0000000000..363f9bbb54
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure-killer.sh
@@ -0,0 +1,253 @@
+#!/bin/bash
+# Scenario #4 (ClusterMesh APIServer Failure) — kills the clustermesh-apiserver
+# pod on the designated target cluster, then waits for the replacement pod
+# to reach Ready. Records timestamps for post-hoc recovery-time analysis.
+#
+# Per-cluster CL2 execution model: this script runs from inside EVERY
+# cluster's CL2 docker container, but no-ops on non-target clusters. The
+# target is identified by `kubectl config current-context` — `az aks
+# get-credentials` writes context = AKS cluster name (e.g. "clustermesh-1"),
+# which matches what we pass as the target arg.
+#
+# Positional args:
+#   $1 TARGET_CONTEXT            kubectl context name of the target cluster
+#                                (e.g. "clustermesh-1"). Skip if mismatched.
+#   $2 RECOVERY_TIMEOUT_SECONDS  How long to wait for replacement pod Ready.
+#   $3 REPORT_DIR                (optional) Path inside the CL2 container
+#                                where the timing JSON is written. Defaults
+#                                to /root/perf-tests/clusterloader2/results.
+#
+# Output:
+#   Writes $REPORT_DIR/ApiserverFailureTimings_<context>.json (target only).
+#   scale.py collect reads this file and emits an ApiserverFailureRecoveryTiming
+#   row into the aggregated JSONL.
+#
+# Exit codes:
+#   0   — non-target (no-op), target with verified kill + recovery, OR
+#         target recovery timeout (Phase 4b soft-fail: the timing file is
+#         written with `recovered:false` so collect can still surface that
+#         the scenario was attempted but did not recover within budget).
+#   1   — target attempt failed before/at the kill (no pod matched the
+#         label selector, kubectl delete failed).
+#   127 — kubectl unavailable (environment bug; no timing file written).
+
+set -uo pipefail
+
+TARGET_CONTEXT="${1:-clustermesh-1}"
+RECOVERY_TIMEOUT_SECONDS="${2:-120}"
+REPORT_DIR="${3:-/root/perf-tests/clusterloader2/results}"
+
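+# Quick manual inspection of the timing file after a run (illustrative;
+# assumes jq on whatever host you copy the file to — the CL2 image itself
+# may not bundle it):
+#   jq '{recovered, duration_seconds}' \
+#     /root/perf-tests/clusterloader2/results/ApiserverFailureTimings_clustermesh-1.json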
+# Same fallback pattern as pod-churn-killer.sh — prefer PATH kubectl, fall
+# back to the pre-staged binary at the bind-mounted config dir.
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    echo "apiserver-failure-killer: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "apiserver-failure-killer ERROR: kubectl not in PATH and pre-staged binary missing"
+    exit 127
+fi
+
+CURRENT_CONTEXT=$("${KUBECTL}" config current-context 2>/dev/null || echo "unknown")
+echo "apiserver-failure-killer: current=${CURRENT_CONTEXT} target=${TARGET_CONTEXT}"
+
+if [ "${CURRENT_CONTEXT}" != "${TARGET_CONTEXT}" ]; then
+    echo "apiserver-failure-killer: not target cluster, no-op"
+    exit 0
+fi
+
+# ----- Target cluster path -----
+mkdir -p "${REPORT_DIR}"
+TIMING_FILE="${REPORT_DIR}/ApiserverFailureTimings_${CURRENT_CONTEXT}.json"
+
+write_timing() {
+    # Args: t0_epoch t1_epoch_or_zero recovered_flag pod_name pod_uid_old pod_uid_new note
+    local t0="$1" t1="$2" recovered="$3" pod_name="$4" uid_old="$5" uid_new="$6" note="$7"
+    local dur=0
+    if [ "${t1}" -gt 0 ] && [ "${t0}" -gt 0 ]; then
+        dur=$((t1 - t0))
+    fi
+    # One JSON object per run; consumed by scale.py collect.
+    cat > "${TIMING_FILE}" <<EOF
+{
+  "t0_epoch": ${t0},
+  "t1_epoch": ${t1},
+  "duration_seconds": ${dur},
+  "recovered": ${recovered},
+  "pod_name": "${pod_name}",
+  "pod_uid_old": "${uid_old}",
+  "pod_uid_new": "${uid_new}",
+  "note": "${note}"
+}
+EOF
+}
+
+# 1. Snapshot the FULL pre-kill pod set (name, UID, Ready). With HA
+#    replicas>1 (scenario #7), the wait-for-new-pod loop must distinguish
+#    "new replacement pod" from "the OTHER surviving replicas that were
+#    already Ready before the kill" — a single-UID compare matches the
+#    surviving pods immediately and falsely reports recovered=0s.
+#    Rubber-duck critique blocker #2.
+PRE_KILL_PODS=$("${KUBECTL}" -n kube-system get pods \
+    -l k8s-app=clustermesh-apiserver \
+    -o 'jsonpath={range .items[*]}{.metadata.name}={.metadata.uid}={.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
+    2>/dev/null | grep -v '^$')
+
+if [ -z "${PRE_KILL_PODS}" ]; then
+    echo "apiserver-failure-killer ERROR: no clustermesh-apiserver pod matched label selector"
+    PRE_KILL_REPLICAS=0
+    READY_PODS_AT_KILL=0
+    write_timing 0 0 false "" "" "" "no pod matched label selector k8s-app=clustermesh-apiserver"
+    exit 1
+fi
+
+PRE_KILL_REPLICAS=$(echo "${PRE_KILL_PODS}" | wc -l | tr -d ' ')
+READY_PODS_AT_KILL=$(echo "${PRE_KILL_PODS}" | awk -F'=' '$3=="True"{c++} END{print c+0}')
+# Newline-separated list of pre-kill UIDs — used to filter the recovery
+# wait loop's candidate set.
+PRE_KILL_UIDS=$(echo "${PRE_KILL_PODS}" | awk -F'=' '{print $2}')
+
+# Pick the first Ready pod as the kill target (preserves prior behavior for
+# scenario #4). If no Ready pod, fall back to the first pod.
+TARGET_LINE=$(echo "${PRE_KILL_PODS}" | awk -F'=' '$3=="True"{print; exit}')
+if [ -z "${TARGET_LINE}" ]; then
+    TARGET_LINE=$(echo "${PRE_KILL_PODS}" | head -1)
+fi
+POD_NAME="${TARGET_LINE%%=*}"
+_REST="${TARGET_LINE#*=}"
+POD_UID="${_REST%=*}"
+echo "apiserver-failure-killer: pre-kill replicas=${PRE_KILL_REPLICAS} ready=${READY_PODS_AT_KILL}"
+echo "apiserver-failure-killer: target pod ${POD_NAME} uid=${POD_UID}"
+
+# 2. Delete exactly that pod by name (not by label selector — prevents
+#    accidental multi-pod kill on future HA setups).
+T0=$(date +%s)
+echo "apiserver-failure-killer: t0=${T0} deleting pod ${POD_NAME} (hard kill, --grace-period=0 --force)"
+if ! "${KUBECTL}" -n kube-system delete pod "${POD_NAME}" \
+    --grace-period=0 --force >/dev/null 2>&1; then
+    echo "apiserver-failure-killer ERROR: kubectl delete pod ${POD_NAME} failed"
+    write_timing "${T0}" 0 false "${POD_NAME}" "${POD_UID}" "" "kubectl delete failed"
+    exit 1
+fi
+
+# 3. Wait for replacement pod to reach Ready. Per rubber-duck #6:
+#    Ready (not just Running) is what matters — the apiserver may be Running
+#    while still loading certs / unable to serve mesh traffic.
+#
+# Periodic state samples (every 30s) write to a diag log so we can see
+# what kubelet/scheduler/operator were doing during recovery — instead
+# of just "timed out" with no signal.
+DIAG_LOG="${REPORT_DIR}/ApiserverFailureDiag_${CURRENT_CONTEXT}.log"
+: > "${DIAG_LOG}"
+
+dump_state() {
+    local label="$1"
+    {
+        echo "===== ${label} at $(date -u +"%Y-%m-%dT%H:%M:%SZ") (epoch=$(date +%s)) ====="
+        echo "--- pods (k8s-app=clustermesh-apiserver) ---"
+        "${KUBECTL}" -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide 2>&1 || true
+        echo "--- pod UIDs + readiness ---"
+        "${KUBECTL}" -n kube-system get pods -l k8s-app=clustermesh-apiserver \
+            -o 'jsonpath={range .items[*]}{.metadata.name}{" uid="}{.metadata.uid}{" phase="}{.status.phase}{" ready="}{.status.conditions[?(@.type=="Ready")].status}{" reason="}{.status.conditions[?(@.type=="Ready")].reason}{"\n"}{end}' 2>&1 || true
+        # tee'd to BOTH the file AND stdout so the AzDO step log carries the
+        # same diag info as the file. AzDO pipeline artifacts aren't published
+        # for our scenarios — the agent's report dir is torn down with the job
+        # — so without stdout duplication the diag is unreachable.
+    } 2>&1 | tee -a "${DIAG_LOG}"
+}
+
+RECOVERY_DEADLINE=$((T0 + RECOVERY_TIMEOUT_SECONDS))
+NEW_POD_NAME=""
+NEW_POD_UID=""
+NEXT_SAMPLE=$((T0 + 30))
+while [ "$(date +%s)" -lt "${RECOVERY_DEADLINE}" ]; do
+    # Find any clustermesh-apiserver pod whose UID is NEW (not in the pre-kill
+    # UID set) AND whose Ready condition is True.
+    #
+    # BUG-FIX 2026-05-13a: original kubectl jsonpath nested `[?]` filter is
+    # broken — switched to a shell-side filter listing all pods.
+    #
+    # BUG-FIX 2026-05-13b: original filter compared against a SINGLE killed-pod
+    # UID. With HA replicas>1 (scenario #7), the surviving N-1 replicas already
+    # have different UIDs and are Ready, so the filter would match one of them
+    # instantly → false `recovered after 0s`. Rubber-duck critique blocker #2.
+    # Fix: filter against the pre-kill UID set (every pod present at kill time),
+    # so only a genuinely new replacement pod passes.
+    ALL_PODS=$("${KUBECTL}" -n kube-system get pods \
+        -l k8s-app=clustermesh-apiserver \
+        -o 'jsonpath={range .items[*]}{.metadata.name}={.metadata.uid}={.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
+        2>/dev/null | grep -v '^$' | grep '=True$')
+    CANDIDATE=""
+    if [ -n "${ALL_PODS}" ]; then
+        while IFS= read -r _line; do
+            [ -z "${_line}" ] && continue
+            # _line format: name=uid=True
+            _name_uid="${_line%=*}"   # name=uid
+            _uid="${_name_uid#*=}"    # uid
+            _in_set=0
+            for _old_uid in ${PRE_KILL_UIDS}; do
+                if [ "${_uid}" = "${_old_uid}" ]; then
+                    _in_set=1
+                    break
+                fi
+            done
+            if [ "${_in_set}" -eq 0 ]; then
+                CANDIDATE="${_line}"
+                break
+            fi
+        done <<<"${ALL_PODS}"
+    fi
+    if [ -n "${CANDIDATE}" ]; then
+        _name_uid="${CANDIDATE%=*}"
+        NEW_POD_NAME="${_name_uid%%=*}"
+        NEW_POD_UID="${_name_uid#*=}"
+        T1=$(date +%s)
+        break
+    fi
+    _now=$(date +%s)
+    if [ "${_now}" -ge "${NEXT_SAMPLE}" ]; then
+        dump_state "recovery-wait"
+        NEXT_SAMPLE=$((_now + 30))
+    fi
+    sleep 2
+done
+
+if [ -z "${NEW_POD_NAME}" ]; then
+    echo "apiserver-failure-killer ERROR: no new Ready pod within ${RECOVERY_TIMEOUT_SECONDS}s"
+    {
+        echo "===== TIMEOUT DIAG at $(date -u +"%Y-%m-%dT%H:%M:%SZ") ====="
+        echo "--- deployment ---"
+        "${KUBECTL}" -n kube-system get deployment clustermesh-apiserver -o wide 2>&1 || true
+        echo "--- describe ALL clustermesh-apiserver pods ---"
+        for p in $("${KUBECTL}" -n kube-system get pods -l k8s-app=clustermesh-apiserver -o name 2>/dev/null); do
+            echo "--- $p ---"
+            "${KUBECTL}" -n kube-system describe "$p" 2>&1 || true
+        done
+        echo "--- recent kube-system events ---"
+        "${KUBECTL}" -n kube-system get events --sort-by=.lastTimestamp 2>&1 | tail -50 || true
+    } 2>&1 | tee -a "${DIAG_LOG}"
+    echo "apiserver-failure-killer: diag dump written to ${DIAG_LOG}"
+    write_timing "${T0}" 0 false "${POD_NAME}" "${POD_UID}" "" "recovery timeout"
+    # Phase 4b: exit 0 on timeout (NOT 1).
The timing JSON with + # `recovered:false` is the load-bearing signal that the scenario was + # attempted but did not recover within budget — Kusto queries on + # ApiserverFailureRecoveryTiming.recovered will flag this. Exiting 1 + # here would cascade-fail the CL2 step → execute.yml's overall_rc=1 → + # share-infra step exits with SucceededWithIssues at worst, but + # peer-cluster measurements (which DID gather data about the failure + # event) would also be wasted. Soft-fail is correct: rubber-duck + # critique #10 confirmed. + exit 0 +fi + +DUR=$((T1 - T0)) +echo "apiserver-failure-killer: recovered after ${DUR}s; new pod ${NEW_POD_NAME} uid=${NEW_POD_UID}" +write_timing "${T0}" "${T1}" true "${POD_NAME}" "${POD_UID}" "${NEW_POD_UID}" "ok" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure.yaml b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure.yaml new file mode 100644 index 0000000000..f444e6fd4d --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/apiserver-failure.yaml @@ -0,0 +1,231 @@ +name: clustermesh-apiserver-failure + +# Scale scenario #4: ClusterMesh APIServer Failure. +# +# Goal (scale testing.txt line 80-91): validate resilience and recovery +# behavior when ONE clustermesh-apiserver pod dies in a meshed cluster. +# Measure detection time (how fast peers notice), recovery time (how fast +# the pod is replaced + serving), backlog drain time (how fast queues +# clear after recovery). +# +# Single-cluster failure pattern: kill the apiserver pod on a designated +# target cluster (default "clustermesh-1"). Other clusters' CL2 invocations +# run the same script but no-op based on `kubectl config current-context` +# comparison. The target cluster's killer records t0/t1 timestamps in a +# JSON file at the report dir; scale.py collect picks it up and surfaces +# the timing as an `ApiserverFailureRecoveryTiming` row in the JSONL. +# +# Per-cluster Prometheus must be running on every cluster DURING the kill +# for peer-side observations to land. With CL2_MAX_CONCURRENT < mesh_size, +# only some clusters' Prom are active simultaneously; at n=2/n=5 this is +# fine (concurrency=4 default >= cluster count), but at n=20 we may need +# to bump max_concurrent or accept partial peer observation. See plan.md +# Phase 4b notes. +# +# Sequence: +# 1. Annotate workload namespaces (CFP-39876 opt-in). +# 2. Start measurements. +# 3. Deploy PodMonitor + workload (200 pods + global services, same +# pattern as event-throughput). +# 4. Initial WaitForControlledPodsRunning gate. +# 5. Warmup sleep — mesh stabilizes. +# 6. Method:Exec → apiserver-failure-killer.sh. On target cluster: +# verifies pod identity, hard-kills it, waits for new Ready pod, +# writes timing JSON. On non-target clusters: no-op. +# 7. Observation sleep — let detection + recovery happen. +# 8. Settle sleep — backlog drain. +# 9. Gather measurements (mirrors start). +# 10. Teardown. 
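+# The CL2_* knobs below normally arrive from the pipeline matrix; CL2 can
+# also read them from an overrides file (illustrative snippet, assuming the
+# stock clusterloader2 --testoverrides mechanism):
+#   CL2_APISERVER_KILL_TARGET_CONTEXT: "clustermesh-3"
+#   CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: 300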
+ +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "60s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "60s"}} +{{$apiserverKillTargetContext := DefaultParam .CL2_APISERVER_KILL_TARGET_CONTEXT "clustermesh-1"}} +{{$apiserverKillRecoveryTimeoutSeconds := DefaultParam .CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS 240}} +{{$apiserverKillObservationSeconds := DefaultParam .CL2_APISERVER_KILL_OBSERVATION_SECONDS 60}} + +{{$workloadGroup := "clustermesh-apiserver-failure"}} +{{$workloadBasename := "apf"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-apf + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- CFP-39876 opt-in: annotate workload namespaces ----- + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-apf" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking apiserver-failure Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-apf-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial apiserver-failure pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-apf-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before kill + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- KILL APISERVER (target cluster only) ----- + - name: Kill apiserver on target cluster + measurements: + - Identifier: ApiserverFailureKiller + Method: Exec + Params: + streamOutput: true + # Generous timeout: covers warmup-budget-exceeded + 
recovery_timeout + # + slow pod schedule. Worst-case ~3min. + timeout: 5m + command: + - bash + - /root/perf-tests/clusterloader2/config/apiserver-failure-killer.sh + - "{{$apiserverKillTargetContext}}" + - "{{$apiserverKillRecoveryTimeoutSeconds}}" + + # ----- Observation window: peers detect failure, then see recovery ----- + - name: Observe during failure + recovery + measurements: + - Identifier: ObservationSleep + Method: Sleep + Params: + duration: {{$apiserverKillObservationSeconds}}s + + # ----- Settle: backlog drain post-recovery ----- + - name: Settle for backlog drain + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml index 439fdc4e71..bbb6327e92 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml @@ -47,6 +47,25 @@ tuningSets: qps: {{$apiServerCallsPerSecond}} steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics are structurally 0. See plan.md + # note #14 + ACNS team confirmation 2026-05-11. + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-et" + # ----- Start measurements ----- - module: path: /modules/measurements/control-plane.yaml diff --git a/modules/python/clusterloader2/clustermesh-scale/config/ha-config-scaler.sh b/modules/python/clusterloader2/clustermesh-scale/config/ha-config-scaler.sh new file mode 100755 index 0000000000..fc91a6fc05 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/ha-config-scaler.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# Scenario #7 (HA Configuration Validation) — scales the clustermesh-apiserver +# Deployment up/down to compare resource overhead, failover behavior, and event +# duplication between single-replica and multi-replica HA configurations. 
+#
+# Unlike apiserver-failure-killer.sh (which targets a single cluster), this
+# script runs on EVERY cluster's CL2 instance and scales each cluster's own
+# clustermesh-apiserver. Mesh-wide HA is the realistic production config; only
+# scaling one cluster would conflate HA-overhead measurements with a
+# single-cluster outlier.
+#
+# Positional args:
+#   $1 ACTION      scale-up | scale-down
+#   $2 REPLICAS    Target replica count (required for scale-up; ignored for
+#                  scale-down, which always restores to 1).
+#   $3 REPORT_DIR  (optional) Path inside the CL2 container where timing JSON
+#                  is written. Defaults to /root/perf-tests/clusterloader2/results.
+#
+# Output:
+#   On scale-up only, writes $REPORT_DIR/HAConfigScalingTimings_<context>.json
+#   with the scale duration, observed spec/ready replicas, and a
+#   ha_replicas_honored flag (true iff spec==REPLICAS AND ready==REPLICAS at
+#   the end of a 30s post-rollout poll window — catches ENO revert).
+#   scale.py collect emits one HAConfigScalingTiming JSONL row per file.
+#
+# Exit codes:
+#   0 — always (soft-fail). Scale-up failures still emit the timing file with
+#       ha_replicas_honored:false so Kusto queries can flag degraded HA runs.
+
+set -uo pipefail
+
+ACTION="${1:?action required: scale-up|scale-down}"
+REPLICAS="${2:-1}"
+REPORT_DIR="${3:-/root/perf-tests/clusterloader2/results}"
+
+# kubectl resolution: PATH first, then pre-staged binary (same pattern as
+# apiserver-failure-killer.sh and pod-churn-killer.sh).
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    echo "ha-config-scaler: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "ha-config-scaler ERROR: kubectl not in PATH and pre-staged binary missing"
+    exit 0
+fi
+
+CURRENT_CONTEXT=$("${KUBECTL}" config current-context 2>/dev/null || echo "unknown")
+mkdir -p "${REPORT_DIR}"
+TIMING_FILE="${REPORT_DIR}/HAConfigScalingTimings_${CURRENT_CONTEXT}.json"
+
+emit_timing() {
+    # Args: action requested_replicas spec_replicas_after ready_replicas_after honored duration_s note
+    local action="$1" requested="$2" spec_after="$3" ready_after="$4"
+    local honored="$5" dur="$6" note="$7"
+    # One JSON object per cluster; consumed by scale.py collect.
+    cat > "${TIMING_FILE}" <<EOF
+{
+  "action": "${action}",
+  "requested_replicas": ${requested},
+  "spec_replicas_after": ${spec_after},
+  "ready_replicas_after": ${ready_after},
+  "ha_replicas_honored": ${honored},
+  "duration_seconds": ${dur},
+  "note": "${note}"
+}
+EOF
+}
+
+get_spec_ready() {
+    local spec ready
+    spec=$("${KUBECTL}" -n kube-system get deployment clustermesh-apiserver \
+        -o jsonpath='{.spec.replicas}' 2>/dev/null || echo 0)
+    ready=$("${KUBECTL}" -n kube-system get deployment clustermesh-apiserver \
+        -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo 0)
+    echo "${spec:-0} ${ready:-0}"
+}
+
+T0=$(date +%s)
+
+case "${ACTION}" in
+  scale-up)
+    echo "ha-config-scaler: scale-up clustermesh-apiserver to ${REPLICAS} replicas on ${CURRENT_CONTEXT}"
+    if ! "${KUBECTL}" -n kube-system scale deployment clustermesh-apiserver \
+        --replicas="${REPLICAS}" >/dev/null 2>&1; then
+      echo "ha-config-scaler WARN: kubectl scale command failed"
+      emit_timing "scale-up" "${REPLICAS}" 0 0 false 0 "kubectl scale failed"
+      exit 0
+    fi
+
+    # Phase 1: wait for spec.replicas==REPLICAS AND status.readyReplicas==REPLICAS.
+    # 240s budget covers initial image pull + ENI attach on AKS-managed Cilium
+    # (we observed 30-60s pod schedule + 60s pull for single-pod recovery; HA
+    # rollouts are sequential per the RollingUpdate strategy).
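+    # (A plain `kubectl rollout status deployment/clustermesh-apiserver
+    # --timeout=240s` would also block until Ready; explicit spec/ready
+    # polling is used instead so a timeout can still record partial progress,
+    # the observed spec/ready counts, in the timing JSON.)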
+    ROLLOUT_DEADLINE=$((T0 + 240))
+    spec=0
+    ready=0
+    while [ "$(date +%s)" -lt "${ROLLOUT_DEADLINE}" ]; do
+      read -r spec ready <<<"$(get_spec_ready)"
+      if [ "${spec}" -eq "${REPLICAS}" ] && [ "${ready}" -eq "${REPLICAS}" ]; then
+        break
+      fi
+      sleep 2
+    done
+
+    if [ "${spec}" -ne "${REPLICAS}" ] || [ "${ready}" -ne "${REPLICAS}" ]; then
+      T1=$(date +%s)
+      DUR=$((T1 - T0))
+      echo "ha-config-scaler WARN: rollout did not reach ${REPLICAS} replicas after ${DUR}s (spec=${spec} ready=${ready})"
+      emit_timing "scale-up" "${REPLICAS}" "${spec}" "${ready}" false "${DUR}" "rollout timeout"
+      exit 0
+    fi
+
+    # Phase 2: ENO-revert detection. AKS-managed Cilium tags the Deployment
+    # with `app.kubernetes.io/actually-managed-by=Eno`; the ENO operator
+    # reconciles to desired state on its own cadence. If it reverts our
+    # scale within 30s of rollout completion, the rest of the scenario will
+    # run on degraded replicas — useful to record but not useful for HA A/B
+    # comparison.
+    REVERT_DEADLINE=$(($(date +%s) + 30))
+    honored=true
+    final_spec=${spec}
+    final_ready=${ready}
+    while [ "$(date +%s)" -lt "${REVERT_DEADLINE}" ]; do
+      read -r final_spec final_ready <<<"$(get_spec_ready)"
+      if [ "${final_spec}" -ne "${REPLICAS}" ]; then
+        honored=false
+        echo "ha-config-scaler WARN: ENO reverted scale within 30s — spec=${final_spec}"
+        break
+      fi
+      sleep 2
+    done
+
+    T1=$(date +%s)
+    DUR=$((T1 - T0))
+    NOTE="ok"
+    [ "${honored}" = "false" ] && NOTE="eno_reverted"
+    emit_timing "scale-up" "${REPLICAS}" "${final_spec}" "${final_ready}" "${honored}" "${DUR}" "${NOTE}"
+    echo "ha-config-scaler: scale-up complete in ${DUR}s, spec=${final_spec} ready=${final_ready} honored=${honored}"
+    ;;
+
+  scale-down)
+    echo "ha-config-scaler: scale-down clustermesh-apiserver to 1 replica on ${CURRENT_CONTEXT} (cleanup)"
+    # Best-effort. Failure here is non-blocking — the cluster is about to be
+    # destroyed anyway. We do NOT overwrite the scale-up timing JSON.
+    "${KUBECTL}" -n kube-system scale deployment clustermesh-apiserver \
+        --replicas=1 >/dev/null 2>&1 || true
+    read -r spec ready <<<"$(get_spec_ready)"
+    echo "ha-config-scaler: scale-down attempted; current spec=${spec} ready=${ready}"
+    ;;
+
+  *)
+    echo "ha-config-scaler ERROR: unknown action '${ACTION}' (expected scale-up|scale-down)"
+    exit 0
+    ;;
+esac
+
+exit 0
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/ha-config.yaml b/modules/python/clusterloader2/clustermesh-scale/config/ha-config.yaml
new file mode 100644
index 0000000000..c0f812a81b
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/ha-config.yaml
@@ -0,0 +1,264 @@
+name: clustermesh-ha-config
+
+# Scale scenario #7: HA Configuration Validation.
+#
+# Goal (scale testing.txt line 115-126): compare identical workloads with
+# clustermesh-apiserver replicas=1 (baseline = scenario #4) vs replicas=N
+# (HA on). Measure resource overhead, failover time, event duplication.
+#
+# Design: this scenario clones scenario #4 (apiserver-failure) and adds two
+# new pre/post steps:
+#   - BEFORE measurements start: scale clustermesh-apiserver Deployment on
+#     EVERY cluster to CL2_HA_CONFIG_REPLICAS (default 3). Mesh-wide HA is
+#     the realistic production config; scaling only the target cluster would
+#     conflate HA-overhead measurements with single-cluster outliers.
+#   - AFTER gather: scale back to 1 replica (cleanup). Best-effort; the
+#     cluster is destroyed shortly after anyway.
+#
+# Cross-scenario A/B in Kusto: query rows where test_type in
+# ("apiserver-failure","ha-config"), join on cluster + measurement.
+#
+# - apiserver-failure-killer.sh is reused for the kill phase. It correctly
+#   handles HA replicas now (pre-kill UID set capture + Ready filter against
+#   that set — see commit "phase 4b: fix apiserver-failure killer
+#   false-success with HA replicas").
+# - ha-config-scaler.sh handles the scale-up/scale-down + ENO-revert
+#   detection (timing JSON tags ha_replicas_honored true|false).
+#
+# Sequence:
+#   1. Annotate workload namespaces (CFP-39876 opt-in).
+#   2. HA SCALE-UP: every cluster scales clustermesh-apiserver to N replicas.
+#   3. Start measurements.
+#   4. Deploy PodMonitor + workload (200 pods + global services).
+#   5. Initial WaitForControlledPodsRunning gate.
+#   6. Warmup sleep.
+#   7. Method:Exec → apiserver-failure-killer.sh. On target cluster: kills
+#      ONE of N pods; survivors should continue serving (HA invariant).
+#      On non-target clusters: no-op.
+#   8. Observation sleep.
+#   9. Settle sleep.
+#  10. Gather measurements.
+#  11. HA SCALE-DOWN: every cluster scales back to 1 (cleanup).
+#  12. Teardown.

{{$namespaces := DefaultParam .CL2_NAMESPACES 5}}
{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}}
{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}}
{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}}
{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}}
{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "60s"}}
{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "60s"}}
{{$apiserverKillTargetContext := DefaultParam .CL2_APISERVER_KILL_TARGET_CONTEXT "clustermesh-1"}}
{{$apiserverKillRecoveryTimeoutSeconds := DefaultParam .CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS 240}}
{{$apiserverKillObservationSeconds := DefaultParam .CL2_APISERVER_KILL_OBSERVATION_SECONDS 60}}
{{$haConfigReplicas := DefaultParam .CL2_HA_CONFIG_REPLICAS 3}}

{{$workloadGroup := "clustermesh-ha-config"}}
{{$workloadBasename := "ha"}}

namespace:
  number: {{$namespaces}}
  prefix: clustermesh-ha
  deleteStaleNamespaces: true
  deleteAutomanagedNamespaces: true
  enableExistingNamespaces: false
  deleteNamespaceTimeout: 20m

tuningSets:
  - name: Sequence
    parallelismLimitedLoad:
      parallelismLimit: 1
  - name: DeploymentCreateQps
    qpsLoad:
      qps: {{$apiServerCallsPerSecond}}

steps:
  # ----- CFP-39876 opt-in: annotate workload namespaces -----
  - name: Annotate workload namespaces for ACNS cross-cluster sync
    measurements:
      - Identifier: AnnotateNamespacesForGlobalSync
        Method: Exec
        Params:
          streamOutput: true
          timeout: 1m
          command:
            - bash
            - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh
            - "{{$namespaces}}"
            - "clustermesh-ha"

  # ----- HA scale-up (BEFORE start measurements so steady-state HA values
  # are captured). Every cluster scales its own clustermesh-apiserver.
  - name: Scale clustermesh-apiserver to HA replicas
    measurements:
      - Identifier: HAConfigScaler-up
        Method: Exec
        Params:
          streamOutput: true
          # Generous timeout: 240s rollout + 30s revert-check + slack.
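+          # (240s + 30s = 270s ≈ 4.5m; 6m leaves ~90s of slack for kubectl
+          # retries and script startup.)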
+ timeout: 6m + command: + - bash + - /root/perf-tests/clusterloader2/config/ha-config-scaler.sh + - scale-up + - "{{$haConfigReplicas}}" + + # ----- Start measurements (with HA replicas already in place) ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking ha-config Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-ha-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial ha-config pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-ha-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before kill + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- KILL one of N apiserver replicas (target cluster only) ----- + - name: Kill apiserver on target cluster (1 of N replicas) + measurements: + - Identifier: ApiserverFailureKiller + Method: Exec + Params: + streamOutput: true + timeout: 5m + command: + - bash + - /root/perf-tests/clusterloader2/config/apiserver-failure-killer.sh + - "{{$apiserverKillTargetContext}}" + - "{{$apiserverKillRecoveryTimeoutSeconds}}" + + # ----- Observation: HA invariant should keep remote-clusters-connected + # at max (cluster_count-1) throughout; scenario #4 baseline dips during + # the kill window. 
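+  # Illustrative PromQL spot-check against a peer cluster's Prometheus
+  # (assumes a 5-cluster mesh, where the healthy gauge value is 4):
+  #   min_over_time(cilium_clustermesh_remote_clusters[10m]) == 4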
+ - name: Observe during failure + recovery (HA invariant test) + measurements: + - Identifier: ObservationSleep + Method: Sleep + Params: + duration: {{$apiserverKillObservationSeconds}}s + + - name: Settle for backlog drain + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements (HA still active) ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/apiserver-failure.yaml + params: + action: gather + + # ----- HA scale-down (cleanup) ----- + - name: Scale clustermesh-apiserver back to 1 replica + measurements: + - Identifier: HAConfigScaler-down + Method: Exec + Params: + streamOutput: true + timeout: 3m + command: + - bash + - /root/perf-tests/clusterloader2/config/ha-config-scaler.sh + - scale-down + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/isolation-churn.sh b/modules/python/clusterloader2/clustermesh-scale/config/isolation-churn.sh new file mode 100755 index 0000000000..4dbf293386 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/isolation-churn.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Scenario #5 (Multi-Cluster Failure Isolation) — drives heavy pod-churn on +# ONLY the target cluster; peer clusters run a no-op observe path that +# sleeps for the same duration so their CL2 lifecycle (and Prometheus +# scrape window) covers the target's churn period. +# +# Why peer must sleep (not exit immediately): in share-infra mode, each +# scenario runs CL2 in parallel on every cluster. If peer exits the +# Method:Exec at t=0s, peer CL2 advances straight into settle + gather + +# teardown, finishing in ~3min — but target is still churning at t=10min. +# Peer Prometheus is torn down before target's churn finishes. To compare +# "did peers spike while target churned?" the peer Prometheus window must +# overlap target's churn window. Sleeping in this script keeps both +# lifecycles aligned. +# +# Positional args (all forwarded to pod-churn-killer.sh on target): +# $1 TARGET_CONTEXT kubectl context name of the cluster to churn. +# $2 KILL_DURATION_SECONDS Total kill-loop runtime on target (also peer sleep). +# $3 KILL_INTERVAL_SECONDS Seconds between kill rounds on target. +# $4 KILL_BATCH Pods deleted per round on target. +# $5 WORKLOAD_GROUP Label-selector group value for pod selection. +# +# Exit codes: +# 0 — always (target completes normally OR peer no-op observes for the +# configured duration). Soft-fail matches the rest of Phase 4b's +# scenario scripts so a single-cluster issue doesn't abort the run. 
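+#
+# Example invocation (illustrative; the values shown are isolation.yaml's
+# defaults):
+#   isolation-churn.sh clustermesh-1 600 10 5 clustermesh-isolation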
+
+set -uo pipefail
+
+TARGET_CONTEXT="${1:?target context required}"
+KILL_DURATION_SECONDS="${2:-600}"
+KILL_INTERVAL_SECONDS="${3:-10}"
+KILL_BATCH="${4:-5}"
+WORKLOAD_GROUP="${5:-clustermesh-isolation}"
+
+# kubectl resolution: PATH first, then pre-staged binary (same pattern as
+# apiserver-failure-killer.sh and pod-churn-killer.sh).
+if command -v kubectl >/dev/null 2>&1; then
+    KUBECTL=kubectl
+elif [ -x /root/perf-tests/clusterloader2/config/kubectl ]; then
+    KUBECTL=/root/perf-tests/clusterloader2/config/kubectl
+    # Prepend the config dir to PATH so the delegated pod-churn-killer.sh
+    # resolves the same kubectl.
+    export PATH="/root/perf-tests/clusterloader2/config:${PATH}"
+    echo "isolation-churn: using pre-staged kubectl at ${KUBECTL}"
+else
+    echo "isolation-churn ERROR: kubectl not in PATH and pre-staged binary missing"
+    # Environment bug, not a scenario failure — hard-fail (127) like the
+    # sibling killer scripts rather than soft-failing per the contract above.
+    exit 127
+fi
+
+CURRENT_CONTEXT=$("${KUBECTL}" config current-context 2>/dev/null || echo "unknown")
+echo "isolation-churn: current=${CURRENT_CONTEXT} target=${TARGET_CONTEXT}"
+
+if [ "${CURRENT_CONTEXT}" != "${TARGET_CONTEXT}" ]; then
+    echo "isolation-churn: peer cluster — observing for ${KILL_DURATION_SECONDS}s while target churns"
+    sleep "${KILL_DURATION_SECONDS}"
+    echo "isolation-churn: peer observation window complete"
+    exit 0
+fi
+
+echo "isolation-churn: target cluster — delegating to pod-churn-killer.sh"
+exec bash /root/perf-tests/clusterloader2/config/pod-churn-killer.sh \
+    "${KILL_DURATION_SECONDS}" \
+    "${KILL_INTERVAL_SECONDS}" \
+    "${KILL_BATCH}" \
+    "${WORKLOAD_GROUP}"
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/isolation.yaml b/modules/python/clusterloader2/clustermesh-scale/config/isolation.yaml
new file mode 100644
index 0000000000..d7882415f1
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/isolation.yaml
@@ -0,0 +1,232 @@
+name: clustermesh-isolation
+
+# Scale scenario #5: Multi-Cluster Failure Isolation.
+#
+# Goal (scale testing.txt line 92-102): induce heavy churn in ONE cluster,
+# verify peer clusters remain stable (no cascade in CPU/memory/etc).
+#
+# Topology: every cluster runs the same 200-pod workload + global services.
+# The Method:Exec kill phase routes to the target cluster (default
+# clustermesh-1) only — peer clusters' isolation-churn.sh script sleeps
+# for the same kill duration so their Prometheus scrape window aligns
+# with target's churn window. Without that alignment, peer CL2 would
+# tear down Prometheus before target's churn finishes, destroying the
+# isolation signal.
+#
+# Cross-scenario A/B in Kusto: filter `test_type == "isolation"`, derive
+# `role = iff(cluster == "<target-context>", "target", "peer")`, then
+# compare resource measurements across role. Healthy isolation means
+# peers' CPU/memory/etc are at baseline values during the churn window;
+# cascading failure means peers' resources track target's spikes. (An
+# illustrative query sketch follows at the end of this header.)
+#
+# Sequence:
+#   1. Annotate workload namespaces (CFP-39876 opt-in).
+#   2. Start measurements.
+#   3. Deploy 200-pod workload + global services on every cluster.
+#   4. Initial WaitForControlledPodsRunning gate.
+#   5. Warmup sleep.
+#   6. Method:Exec → isolation-churn.sh. On target: runs pod-churn-killer.sh
+#      kill loop (delete KILL_BATCH random workload pods every
+#      KILL_INTERVAL_SECONDS for KILL_DURATION_SECONDS). On peers: sleeps
+#      for KILL_DURATION_SECONDS to keep CL2/Prom lifecycle aligned.
+#   7. Settle sleep — backlog drain on target, observe-window close on peers.
+#   8. Gather measurements (peers should be flat; target should show spike).
+#   9. Teardown.
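+#
+# Illustrative Kusto sketch of that role split (the table and column names
+# here are assumptions, not the real collect schema):
+#   ScaleResults
+#   | where test_type == "isolation"
+#   | extend role = iff(cluster == "clustermesh-1", "target", "peer")
+#   | summarize avg(todouble(value)) by role, metric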
+ +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "60s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "60s"}} + +# Reuse the same target-context knob as scenario #4 (apiserver-failure): +# both scenarios target the same cluster by convention. Override via the +# matrix var if a different target is needed. +{{$apiserverKillTargetContext := DefaultParam .CL2_APISERVER_KILL_TARGET_CONTEXT "clustermesh-1"}} + +# Reuse the pod-churn kill-loop knobs from scenario #2 (pod-churn-combined): +# semantically identical (kill workload pods at controlled rate). Avoids +# adding new matrix vars for the same parameter shape. +{{$killDurationSeconds := DefaultParam .CL2_KILL_DURATION_SECONDS 600}} +{{$killIntervalSeconds := DefaultParam .CL2_KILL_INTERVAL_SECONDS 10}} +{{$killBatch := DefaultParam .CL2_KILL_BATCH 5}} +# Method:Exec timeout — kill duration + 5min headroom (allows peer's sleep +# to complete + final pod-churn-killer cleanup line). +{{$killExecTimeout := DefaultParam .CL2_KILL_EXEC_TIMEOUT "15m"}} + +{{$workloadGroup := "clustermesh-isolation"}} +{{$workloadBasename := "iso"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-iso + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- CFP-39876 opt-in: annotate workload namespaces ----- + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-iso" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking isolation Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-iso-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for 
initial isolation pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-iso-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before isolation churn + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- ISOLATION CHURN (target-only kill loop; peers sleep-observe) ----- + - name: Drive heavy pod-churn on target cluster only + measurements: + - Identifier: IsolationChurnRunner + Method: Exec + Params: + streamOutput: true + timeout: {{$killExecTimeout}} + command: + - bash + - /root/perf-tests/clusterloader2/config/isolation-churn.sh + - "{{$apiserverKillTargetContext}}" + - "{{$killDurationSeconds}}" + - "{{$killIntervalSeconds}}" + - "{{$killBatch}}" + - "{{$workloadGroup}}" + + # ----- Settle: backlog drain on target, observe-window close on peers ----- + - name: Settle after isolation churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements (peer flat-vs-target spike comparison) ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml index 0e0a3e36bd..b192bd3709 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml @@ -22,6 +22,13 @@ name: clustermesh-event-throughput-workload {{$replicasPerDeployment := .replicasPerDeployment}} {{$tuningSet := .tuningSet}} {{$operationTimeout := .operationTimeout}} +# Optional suffix for measurement Identifiers. Scenario #6 (upper-bound) +# calls this module N times per CL2 run (one per saturation rung) with +# phaseSuffix=Rung0/Rung1/.../RungN-1 so the WaitForControlledPodsRunning +# Identifiers don't collide across rungs. Default "" keeps existing +# single-invocation callers (event-throughput.yaml) byte-for-byte +# identical. +{{$phaseSuffix := DefaultParam .phaseSuffix ""}} # delete = bring object count to 0; create/restart keep configured count. {{$replicasInPhase := $deploymentsPerNamespace}} @@ -34,9 +41,9 @@ steps: # Identifier keeps the create/restart/delete invocations from clobbering # each other's metric state across the three module calls in # event-throughput.yaml. 
- - name: Start tracking event-throughput pods to be {{$actionName}}d + - name: Start tracking event-throughput pods to be {{$actionName}}d{{if $phaseSuffix}} ({{$phaseSuffix}}){{end}} measurements: - - Identifier: WaitForControlledPodsRunning-{{$actionName}} + - Identifier: WaitForControlledPodsRunning-{{$actionName}}{{$phaseSuffix}} Method: WaitForControlledPodsRunning Params: action: start @@ -65,9 +72,9 @@ steps: templateFillMap: Group: clustermesh-event-throughput - - name: Wait for event-throughput pods to be {{$actionName}}d + - name: Wait for event-throughput pods to be {{$actionName}}d{{if $phaseSuffix}} ({{$phaseSuffix}}){{end}} measurements: - - Identifier: WaitForControlledPodsRunning-{{$actionName}} + - Identifier: WaitForControlledPodsRunning-{{$actionName}}{{$phaseSuffix}} Method: WaitForControlledPodsRunning Params: action: gather diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/apiserver-failure.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/apiserver-failure.yaml new file mode 100644 index 0000000000..9bc2234291 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/apiserver-failure.yaml @@ -0,0 +1,97 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Scale scenario #4 (ClusterMesh APIServer Failure) — measurements scoped +# to the failure window. Captures peer-cluster behavior (drop in +# remote_clusters gauge, spike in failure-counter rate, kvstore catch-up +# latency) over the run window. The actual t0/t1 timestamps come from +# apiserver-failure-killer.sh's timing JSON file (collected separately). +# +# PromQL note on time-of-event signals: vanilla Prometheus doesn't expose +# "time at which X first happened" cleanly. Detection time and recovery +# time are computed post-hoc in Kusto by joining these gauge series with +# the killer's t0/t1 timestamps. This module captures the windowed +# aggregates that surface "something disruptive happened" — the explicit +# timing comes from the timing JSON row. + +steps: + - name: {{$action}} ApiServer Failure Measurements + measurements: + # ----------------------------------------------------------------- + # Detection signal: how low did the remote_clusters gauge dip during + # the failure window? Healthy = N-1 (every cluster sees its N-1 peers). + # Target's apiserver dies → peer clusters' gauge drops by 1 briefly → + # gauge recovers when apiserver is back + reconnects. + # ----------------------------------------------------------------- + - Identifier: RemoteClustersConnectedMinDuringFailure{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Clusters Connected Min During Failure {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Min + query: min(min_over_time(cilium_clustermesh_remote_clusters[%v:])) + - name: Perc50 + query: quantile(0.50, min_over_time(cilium_clustermesh_remote_clusters[%v:])) + + # ----------------------------------------------------------------- + # Failure-counter rate burst: cilium_clustermesh_remote_cluster_failures + # is a monotonic counter. During the failure window, the rate spikes + # as peers retry connections to the dead apiserver. Max-over-time of + # the 1m-sliding rate is the "peak failure rate" signal. 
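+      # (Worked example: 30 connection failures landing within a single
+      # minute of the window surface as Max ≈ 30/60 = 0.5 failures/s.)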
+ # ----------------------------------------------------------------- + - Identifier: RemoteClusterFailureRateBurst{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Cluster Failure Rate Burst {{$suffix}} + metricVersion: v1 + unit: failures/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) + + # ----------------------------------------------------------------- + # Kvstore sync error burst: spikes when peers can't reach the dead + # apiserver. Catch-up rate post-recovery indicates backlog drain + # behavior. + # ----------------------------------------------------------------- + - Identifier: KvstoreSyncErrorBurst{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Sync Error Burst {{$suffix}} + metricVersion: v1 + unit: errors/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_sync_errors_total[1m])[%v:])) + - name: Sum + query: sum(max_over_time(rate(cilium_kvstoremesh_kvstore_sync_errors_total[1m])[%v:])) + + # ----------------------------------------------------------------- + # Kvstore operation latency p99 during recovery: peers re-sync state + # after apiserver comes back; the histogram's p99 spike size is the + # "catch-up cost" signal. + # ----------------------------------------------------------------- + - Identifier: KvstoreOperationLatencyP99DuringRecovery{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Op Latency P99 During Recovery {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: false + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml index 18d0a2a85c..7f5c9c6cf3 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml @@ -38,6 +38,17 @@ steps: # Mesh failure counter: cumulative remote-cluster connection failures. # Healthy runs should keep this at 0; we track the max increase observed # over the run to surface flapping links during scale-up. + # + # Observed N=20 baseline (run 66826-8f280609): MaxIncrease = 4–6 on + # EVERY cluster — even green runs. Hypothesis is Fleet pushing peer + # config updates mid-run briefly bounces connections. To distinguish + # "5 failures spread across 5 peers" from "5 failures all against ONE + # bad peer", PerPeerMaxIncrease below preserves the target_cluster + # label and reports the max-failure peer per focal cluster. If the two + # numbers match, failures are concentrated on a single peer (real + # peering issue); if PerPeerMaxIncrease ≈ 1 with MaxIncrease ≈ 5, + # failures are uniformly distributed (Fleet churn, not peering bug). + # See todo remote-cluster-failures-investigation. 
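+      # Illustrative ad-hoc PromQL to name the worst peer directly (relies
+      # on the same target_cluster label described above):
+      #   topk(1, max_over_time(cilium_clustermesh_remote_cluster_failures[30m])
+      #         - min_over_time(cilium_clustermesh_remote_cluster_failures[30m]))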
# --------------------------------------------------------------------- - Identifier: ClusterMeshRemoteClusterFailures{{$suffix}} Method: GenericPrometheusQuery @@ -50,6 +61,16 @@ steps: queries: - name: MaxIncrease query: max(max_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) - min(min_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) + # Max failures observed against any single peer cluster. Reported + # per scrape series (preserving target_cluster label inside the + # subquery), then we take the worst peer with quantile(0.99,...). + - name: PerPeerMaxIncrease + query: quantile(0.99, max_over_time(cilium_clustermesh_remote_cluster_failures[%v:]) - min_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) + # Median peer's failure count — if this is also ≈ MaxIncrease, every + # peer is failing roughly equally; if it's near 0, failures are + # heavily concentrated on a few outlier peers. + - name: PerPeerMedianIncrease + query: quantile(0.50, max_over_time(cilium_clustermesh_remote_cluster_failures[%v:]) - min_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) # --------------------------------------------------------------------- # Cross-cluster event throughput — the headline metric for scale scenario @@ -65,67 +86,120 @@ steps: unit: events/s enableViolations: false queries: + # Subquery step explicitly set to 30s (matches Prometheus scrape + # interval) so brief workload-create bursts aren't smoothed away by + # the default 1m subquery step. - name: Perc99 - query: quantile(0.99, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + query: quantile(0.99, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:30s])) - name: Perc90 - query: quantile(0.90, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + query: quantile(0.90, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:30s])) - name: Perc50 - query: quantile(0.50, avg_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + query: quantile(0.50, avg_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:30s])) + # Cumulative event count over the run window. Range vector `[%v]` + # (NOT subquery `[%v:]`) — `increase()` with a subquery uses the + # subquery step to sample the counter, which at default 1m step + # misses brief bursts (events all fall between samples → first and + # last subquery samples both show post-burst peak count → delta=0). + # Range vector reads at Prometheus's actual scrape resolution. + - name: TotalIncrease + query: sum(increase(cilium_kvstoremesh_kvstore_events_queue_seconds_count[%v])) # --------------------------------------------------------------------- - # Per-type event rate breakdown (spec line 131: "Event rate (per - # type)"). The kvstoremesh kvstore-events histogram carries a - # `scope` label tagging which kvstore key family the event touched. - # We split into the three families spec line 5 calls out: endpoints, - # services, identities. 
Cilium 1.18 uses these scope values: - # identities/v1 — security identities - # services/v1 — global Service objects - # ip/v1 — endpoint IP-to-identity mappings (endpoints) - # nodes/v1 — node tunnel / IPAM advertisements - # serviceexports/v1 — MCS-API ServiceExport objects - # lease — leader election - # cilium/.heartbeat — kvstore liveness heartbeat - # cilium/syncedcanaries — initial-sync barrier markers - # --------------------------------------------------------------------- - - Identifier: ClusterMeshKvstoreEventsRateIdentities{{$suffix}} + # Per-type cross-cluster events (spec line 5: "How many cross-cluster + # events (endpoints, services, identities) can be processed per cluster + # and per mesh"). Reports the cumulative count of kvstore events + # observed by THIS cluster's kvstoremesh during the test, broken down + # by scope label. + # + # Ground-truth scope values (verified via runtime probe on AKS-managed + # Cilium): + # ip/v1 — endpoint (pod IP-to-identity) propagation events + # services/v1 — global Service objects (incl. their backends) + # identities/v1 — security identity additions/removals + # nodes/v1 — node tunnel / IPAM advertisements + # serviceexports/v1 — MCS-API ServiceExport (rare in our workload) + # cilium/.hear*, cilium/synce*, cilium/.init*, lease — meta scopes + # (heartbeat / synced canaries / init lock / leader election) + # + # Why instant `sum()` instead of `increase()` or `rate()`: + # `cilium_kvstoremesh_kvstore_events_queue_seconds_count` is a + # counter labelled by scope. In Prometheus convention a labelled + # counter only EXISTS as a series once the labelled event has + # occurred at least once. The per-scope events of interest + # (services/v1, identities/v1, ip/v1, nodes/v1) only fire during + # the workload-create burst at test start. Before the burst: + # no series, no scrapes, no baseline. After the burst: counter + # appears at the post-burst plateau value (e.g. 80) and stays + # flat for the rest of the test. `increase(metric[%v])` over a + # series whose first sample IS the plateau cannot compute a delta + # to a non-existent pre-burst sample, so it returns 0. + # + # We tried two workarounds (commit history) before settling on + # instant `sum()`: + # - Tightening the subquery step from default 1m to 30s: didn't + # help — still no pre-burst sample. + # - Adding a 90s pre-workload settle step (commit 380d34c): didn't + # help — Prometheus had time to discover the PodMonitor target, + # but the per-scope SERIES still didn't exist until the burst. + # + # Since each test run uses freshly-provisioned clusters (counter + # starts at 0), CurrentValue at gather time IS the cumulative count + # of events observed during this test. That directly answers spec + # line 5's "How many events" wording. + # + # The aggregate `ClusterMeshKvstoreEventsRate` query above DOES + # work because the heartbeat scope (`cilium/.hear*`) increments + # every ~5s from cluster bring-up onward — so Prometheus has many + # pre-burst samples for the aggregate vector to compute rate over. + # + # For per-scope rate signal (events/sec), Cilium would need to + # pre-emit zero-valued counters for known scopes at startup, which + # it doesn't do today (would require an upstream PR). 
+ # --------------------------------------------------------------------- + - Identifier: ClusterMeshKvstoreEventsTotalIdentities{{$suffix}} Method: GenericPrometheusQuery Params: action: {{$action}} - metricName: ClusterMesh Kvstore Events Rate Identities {{$suffix}} + metricName: ClusterMesh Kvstore Events Total Identities {{$suffix}} metricVersion: v1 - unit: events/s + unit: events enableViolations: false queries: - - name: Perc99 - query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:])) - - name: Perc50 - query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:])) - - Identifier: ClusterMeshKvstoreEventsRateServices{{$suffix}} + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}) + - Identifier: ClusterMeshKvstoreEventsTotalServices{{$suffix}} Method: GenericPrometheusQuery Params: action: {{$action}} - metricName: ClusterMesh Kvstore Events Rate Services {{$suffix}} + metricName: ClusterMesh Kvstore Events Total Services {{$suffix}} metricVersion: v1 - unit: events/s + unit: events enableViolations: false queries: - - name: Perc99 - query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:])) - - name: Perc50 - query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:])) - - Identifier: ClusterMeshKvstoreEventsRateEndpoints{{$suffix}} + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}) + - Identifier: ClusterMeshKvstoreEventsTotalEndpoints{{$suffix}} Method: GenericPrometheusQuery Params: action: {{$action}} - metricName: ClusterMesh Kvstore Events Rate Endpoints {{$suffix}} + metricName: ClusterMesh Kvstore Events Total Endpoints {{$suffix}} metricVersion: v1 - unit: events/s + unit: events enableViolations: false queries: - - name: Perc99 - query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:])) - - name: Perc50 - query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:])) + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}) + - Identifier: ClusterMeshKvstoreEventsTotalNodes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Events Total Nodes {{$suffix}} + metricVersion: v1 + unit: events + enableViolations: false + queries: + - name: TotalCount + query: sum(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="nodes/v1"}) # --------------------------------------------------------------------- # Cross-cluster propagation latency proxy: p99 of kvstore operation @@ -190,3 +264,61 @@ steps: query: quantile(0.99, max_over_time(cilium_identity[%v:])) - name: Perc50 query: quantile(0.50, avg_over_time(cilium_identity[%v:])) + + # --------------------------------------------------------------------- + # Scenario #7 (HA Configuration) — clustermesh-apiserver pod resource + # overhead. With replicas=1 (baseline scenarios #1-#6) the Total metrics + # equal the single-pod values; with replicas=N (scenario #7 / ha-config) + # they reflect the cumulative cost of N replicas. 
Direct A/B in Kusto: + # compare `test_type in ("apiserver-failure","ha-config")` rows. + # + # Scoped to label `pod=~"clustermesh-apiserver-.*"` which matches every + # pod under the Deployment (ReplicaSet hash + suffix). Source is cAdvisor + # (kubelet metrics), which the CL2 prometheus stack scrapes by default. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshApiserverPodCPU{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh APIServer Pod CPU {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: false + queries: + - name: TotalMax + query: max_over_time(sum(rate(container_cpu_usage_seconds_total{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}[1m]))[%v:]) + - name: TotalAvg + query: avg_over_time(sum(rate(container_cpu_usage_seconds_total{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}[1m]))[%v:]) + - name: PerPodMax + query: max_over_time(max(sum by (pod) (rate(container_cpu_usage_seconds_total{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}[1m])))[%v:]) + + - Identifier: ClusterMeshApiserverPodMemory{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh APIServer Pod Memory {{$suffix}} + metricVersion: v1 + unit: bytes + enableViolations: false + queries: + - name: TotalMax + query: max_over_time(sum(container_memory_working_set_bytes{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"})[%v:]) + - name: TotalAvg + query: avg_over_time(sum(container_memory_working_set_bytes{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"})[%v:]) + - name: PerPodMax + query: max_over_time(max(sum by (pod) (container_memory_working_set_bytes{pod=~"clustermesh-apiserver-.*", container!="", container!="POD"}))[%v:]) + + - Identifier: ClusterMeshApiserverPodRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh APIServer Pod Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Total + query: max_over_time(sum(kube_pod_container_status_restarts_total{pod=~"clustermesh-apiserver-.*"})[%v:]) + - name: PerPodMax + query: max_over_time(max(sum by (pod) (kube_pod_container_status_restarts_total{pod=~"clustermesh-apiserver-.*"}))[%v:]) + diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml index 47504cbf89..d74b9992d6 100644 --- a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml @@ -54,6 +54,51 @@ steps: query: quantile(0.90, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) - name: Perc50 query: quantile(0.50, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + # --------------------------------------------------------------------- + # Per-pod normalized apiserver CPU. The two ApiserverAvg/MaxCPUUsage + # measurements above use the team-wide shared PromQL pattern (copied + # across large_cluster / network-scale / slo / network-load) which + # implicitly aggregates across whatever series match + # `endpoint="apiserver"` — so the resulting "cores" value is actually + # a Prometheus rate aggregate, not literal cores per pod. 
+ # + # This duplicate measurement adds explicit `sum by(pod)` grouping so + # we get a per-pod value (i.e. genuine cores) AND `quantile(0.99)` + # then picks the most-loaded pod. If the underlying scrape doesn't + # carry a `pod` label, sum-by collapses to one series and the + # measurement still yields a usable cross-cluster number. + # + # Kept SEPARATE from the shared-pattern measurements so dashboards + # comparing across scenarios still see the same column names from + # the originals; we just gain an honest per-pod column on top. + # See todo apiserver-cpu-promql-fix. + # --------------------------------------------------------------------- + - Identifier: ApiserverAvgCPUPerPod{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Avg CPU Per Pod {{$suffix}} + metricVersion: v1 + unit: cores + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, sum by(pod) (avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) + - name: Perc50 + query: quantile(0.50, sum by(pod) (avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) + - Identifier: ApiserverMaxCPUPerPod{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max CPU Per Pod {{$suffix}} + metricVersion: v1 + unit: cores + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, sum by(pod) (max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) + - name: Perc50 + query: quantile(0.50, sum by(pod) (max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))) - Identifier: ApiserverAvgMemUsage{{$suffix}} Method: GenericPrometheusQuery Params: diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/node-churn.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/node-churn.yaml new file mode 100644 index 0000000000..369982624c --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/node-churn.yaml @@ -0,0 +1,185 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Scale scenario #3 (Node Churn / IP Churn) — measurements layered on top +# of clustermesh-metrics.yaml + cilium.yaml. These queries surface the +# spec-required signals (scale testing.txt:78-79): +# +# * IP update propagation — kvstore event rates broken out by scope so +# node/IP scope events are visible separately from identity/service +# scope. Under node-churn, node-scope events should burst when nodes +# drain/replace; identity-scope events should stay flat (identity is +# label-keyed, not IP-keyed). +# * Temporary inconsistency windows — node Ready transitions, pod +# eviction rate, remote-cluster endpoint cardinality on peers +# (whether peers observe the target's IP churn fully). +# +# Rubber-duck design review #5 + #6: cilium_identity_count is a weak +# signal under node-churn (identities don't churn when only IPs change). +# Dropped in favor of kvstore-scope rates + remote endpoint cardinality. + +steps: + - name: {{$action}} Node Churn Measurements + measurements: + + # ----------------------------------------------------------------- + # NODE READY TRANSITIONS. changes() over a counter-like series of + # node-condition states counts the number of Ready/NotReady flips + # during the window. Healthy scale-cycle: 2N transitions per cycle + # (N nodes drain + N nodes ready). Replace: ≥ K (drained + new). 
+ # Spec line 79 "Temporary inconsistency windows": this is the + # local-cluster view of how long nodes stayed un-Ready. + # ----------------------------------------------------------------- + - Identifier: NodeReadyTransitions{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Ready Transitions {{$suffix}} + metricVersion: v1 + unit: count + enableViolations: false + queries: + - name: ReadyTransitionsTotal + query: sum(changes(kube_node_status_condition{condition="Ready",status="true"}[%v:])) + - name: NotReadyTransitionsTotal + query: sum(changes(kube_node_status_condition{condition="Ready",status="false"}[%v:])) + + # ----------------------------------------------------------------- + # NODE CARDINALITY OVER TIME — gauge for node-info series counts the + # nodes visible to kube-state-metrics. min/max over the window flag + # the scaling delta (e.g., max=25 vs min=20 → +5 scale-up observed). + # NodeCount must trend back to OriginalCount by gather time (the + # finalizer guarantees it on target; peers see only their own static + # pool unaffected by target's churn). + # ----------------------------------------------------------------- + - Identifier: NodeCardinality{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Node Cardinality {{$suffix}} + metricVersion: v1 + unit: count + enableViolations: false + queries: + - name: Min + query: min_over_time(count(kube_node_info)[%v:]) + - name: Max + query: max_over_time(count(kube_node_info)[%v:]) + - name: Last + query: count(kube_node_info) + + # ----------------------------------------------------------------- + # POD EVICTION / RESCHEDULE RATE. Pods on a drained or deleted node + # get NodeLost (kubelet evicts) or Evicted (kube-controller forcibly + # rescheduled). Rate over the window: target should spike during + # ops; peers stay near 0 (no node churn there). + # ----------------------------------------------------------------- + - Identifier: PodEvictionRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Pod Eviction Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: NodeLostMax + query: max(max_over_time(rate(kube_pod_status_reason{reason="NodeLost"}[1m])[%v:])) + - name: EvictedMax + query: max(max_over_time(rate(kube_pod_status_reason{reason="Evicted"}[1m])[%v:])) + + # ----------------------------------------------------------------- + # KVSTORE EVENT RATES BY SCOPE — the headline propagation signal. + # cilium_kvstoremesh_kvstore_events_queue_seconds_count carries a + # `scope` label (verified runtime-probed in Phase 2: nodes/v1, ip/v1, + # identities/v1, endpoints/v1, services/v1). + # + # Under node-churn the EXPECTED splits are: + # nodes/v1 → burst (each scale/replace op churns N node entries) + # ip/v1 → burst (each new VM gets a new IP entry) + # identities/v1→ near-zero (workload pods keep same labels) + # endpoints/v1 → burst (pods reschedule with new pod IPs) + # services/v1 → near-zero (service definitions stable) + # + # Cross-scenario Kusto query: filter by scope, compare target vs peer + # rate. Peer rates indicate "did target's node churn propagate to + # peers' kvstore" — the spec "IP update propagation" signal. 
+ # ----------------------------------------------------------------- + - Identifier: KvstoreNodeScopeEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Kvstore Node Scope Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="nodes/v1"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="nodes/v1"}[1m])[%v:])) + + - Identifier: KvstoreIpScopeEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Kvstore IP Scope Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m])[%v:])) + + - Identifier: KvstoreEndpointsScopeEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Kvstore Endpoints Scope Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="endpoints/v1"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="endpoints/v1"}[1m])[%v:])) + + # ----------------------------------------------------------------- + # REMOTE-CLUSTER ENDPOINT CARDINALITY. cilium_clustermesh_remote_cluster_* + # tracks per-peer state from THIS cluster's perspective. On peers + # during target's node-churn: + # - remote_cluster_nodes_total → fluctuates (target's node count + # changes) → min/max delta proves propagation reached peer + # - remote_cluster_endpoints_total → fluctuates (pod rescheduling + # during target's node churn) + # + # Spec "IP update propagation" — if the peer-side delta is zero + # while target's local kvstore events show burst, propagation is + # broken or stale. + # ----------------------------------------------------------------- + - Identifier: RemoteClusterNodesCardinality{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Remote Cluster Nodes Cardinality {{$suffix}} + metricVersion: v1 + unit: count + enableViolations: false + queries: + - name: Min + query: min(min_over_time(cilium_clustermesh_remote_cluster_nodes[%v:])) + - name: Max + query: max(max_over_time(cilium_clustermesh_remote_cluster_nodes[%v:])) + - name: Last + query: max(cilium_clustermesh_remote_cluster_nodes) + + # ----------------------------------------------------------------- + # NewNodesAppearedInWindow REMOVED 2026-05-14: build 67114 showed + # CL2's %v substitution produces a duration literal ("2309s") which + # PromQL rejects in scalar `<` comparison. The signal is redundant + # with NodeCardinality (Max - Min) above + the authoritative pre/post + # InternalIP set delta in NodeChurnTimings_*.json. 
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/pod-churn-stress.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/pod-churn-stress.yaml new file mode 100644 index 0000000000..8159fd6681 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/pod-churn-stress.yaml @@ -0,0 +1,122 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Pod-Churn Stress Test (scale scenario #2) — slope-over-time / sustained-rate +# measurements layered on top of clustermesh-metrics.yaml. These queries +# surface the "growth over time" signals that point-in-time percentiles +# can hide: +# +# * Memory drift: positive nonzero value over a 10-minute churn window +# suggests a leak or unbounded queue. Compared head-to-head with a +# no-churn baseline run. +# * Sustained event-queue rate: max-over-time of a 1m-sliding rate. If +# this stays elevated while drift is positive, kvstore is falling +# behind the churn. +# * Remote-cluster failure rate: how fast does this monotonic counter +# accumulate under sustained churn? rate() is the counter-safe +# primitive (deriv() mishandles counter resets per the Prometheus +# docs; the rubber-duck design review caught this). + +steps: + - name: {{$action}} Pod Churn Stress Measurements + measurements: + # ----------------------------------------------------------------- + # Cilium-agent memory drift — leak detection. Two flavors: + # MaxPodDeriv: worst single agent series. Flags an outlier node. + # SumDeriv: total per-cluster memory growth across all agents. + # This is the "per-cluster footprint" number — what + # the scaling-curve dashboard uses. + # deriv() returns bytes/sec; we present as MB/s for readability. + # cilium_process_resident_memory_bytes is a gauge, so deriv() is + # well-defined (handles negative slopes correctly). + # ----------------------------------------------------------------- + - Identifier: CiliumAgentMemoryDrift{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Agent Memory Drift {{$suffix}} + metricVersion: v1 + unit: MB/s + enableViolations: false + queries: + - name: MaxPodDeriv + query: max(deriv(cilium_process_resident_memory_bytes[%v:])) / 1024 / 1024 + - name: SumDeriv + query: sum(deriv(cilium_process_resident_memory_bytes[%v:])) / 1024 / 1024 + - name: Perc50PodDeriv + query: quantile(0.50, deriv(cilium_process_resident_memory_bytes[%v:])) / 1024 / 1024 + + # ----------------------------------------------------------------- + # clustermesh-apiserver memory drift — same idea, different process. + # Uses cAdvisor's container_memory_working_set_bytes (no cilium-side + # gauge for the apiserver pod exists). Filters per the design review: + # namespace=kube-system pins to the AKS-managed Cilium deployment + # (avoid duplicate scrapes from a future + # customer-installed Cilium in another ns). + # container!="" drops cAdvisor's per-pod aggregate row + # (empty container label). + # container!="POD" drops the pause container's own series. 
+ # ----------------------------------------------------------------- + - Identifier: ClustermeshApiserverMemoryDrift{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Apiserver Memory Drift {{$suffix}} + metricVersion: v1 + unit: MB/s + enableViolations: false + queries: + - name: MaxContainerDeriv + query: max(deriv(container_memory_working_set_bytes{namespace="kube-system",pod=~"clustermesh-apiserver-.*",container!="",container!="POD"}[%v:])) / 1024 / 1024 + - name: SumDeriv + query: sum(deriv(container_memory_working_set_bytes{namespace="kube-system",pod=~"clustermesh-apiserver-.*",container!="",container!="POD"}[%v:])) / 1024 / 1024 + + # ----------------------------------------------------------------- + # Sustained kvstore event-queue rate. The headline saturation signal + # for sustained churn — if this stays high across the run while + # MemoryDrift is positive, the system is queueing faster than it's + # draining. + # + # cilium_kvstoremesh_kvstore_events_queue_seconds_count is a counter + # (cumulative count of queued events) — must use rate(), not deriv(). + # max_over_time of a 1m-sliding rate gives "worst sustained burst" — + # spike-tolerant unlike a point sample. + # ----------------------------------------------------------------- + - Identifier: SustainedKvstoreEventRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Sustained Kvstore Event Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + + # ----------------------------------------------------------------- + # Remote-cluster failure rate. cilium_clustermesh_remote_cluster_failures + # is a monotonic counter — accumulated reconnect failures from this + # cluster's perspective. Under sustained churn the spec line 65 + # "missed or delayed updates" signal is whether this rate climbs + # above the baseline of ~4-6/run observed on green N=20 runs (see + # plan.md "Decisions deliberately deferred" item 6). + # + # rate() handles counter resets correctly; deriv() does not. 
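The reset-handling difference is worth making concrete. deriv() fits a least-squares slope rather than summing deltas, but a naive delta sum captures why a counter reset poisons any slope-style estimator while rate()-style reset detection does not. A self-contained bash illustration on synthetic samples (30s apart, counter resets at the fifth sample):

    #!/bin/bash
    SAMPLES=(0 2 4 6 0 2 4)
    PREV=""; RAW_DELTA=0; RESET_AWARE_DELTA=0
    for V in "${SAMPLES[@]}"; do
      if [ -n "$PREV" ]; then
        D=$((V - PREV))
        RAW_DELTA=$((RAW_DELTA + D))   # what a slope-style estimator sees
        # rate()-style handling: a drop means the counter restarted from 0,
        # so the increase across the gap is the new value itself.
        [ "$D" -lt 0 ] && D=$V
        RESET_AWARE_DELTA=$((RESET_AWARE_DELTA + D))
      fi
      PREV=$V
    done
    echo "naive delta: ${RAW_DELTA} (the reset swallowed 6 failures)"
    echo "reset-aware delta: ${RESET_AWARE_DELTA} (matches the 10 actual failures)"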
+ # ----------------------------------------------------------------- + - Identifier: RemoteClusterFailureRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Cluster Failure Rate {{$suffix}} + metricVersion: v1 + unit: failures/s + enableViolations: false + queries: + - name: Max + query: max(max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_clustermesh_remote_cluster_failures[1m])[%v:])) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrole.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrole.yaml new file mode 100644 index 0000000000..df3c40e1a4 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrole.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +rules: + # Minimum verbs needed by the killer script: list to enumerate workload pods + # across namespaces, delete to terminate them, get is required by some + # kubectl operations for richer error reporting. + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "delete"] diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrolebinding.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrolebinding.yaml new file mode 100644 index 0000000000..7f36cc58b7 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-clusterrolebinding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{.RoleName}} +subjects: + - kind: ServiceAccount + name: {{.SAName}} + namespace: {{.SANamespace}} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-job.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-job.yaml new file mode 100644 index 0000000000..4984f6f72d --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-job.yaml @@ -0,0 +1,107 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + # Never restart on failure — if the killer crashes we want loud junit + # failure, not silent retry that backoffs past the measurement window. + # backoffLimit:0 plus restartPolicy:Never together ensure exactly one + # attempt. + backoffLimit: 0 + # Job has its own deadline as a defense-in-depth bound: even if the + # in-script `while` loop never terminates for some reason, the Job + # controller kills the pod at killDuration + 60s buffer. + activeDeadlineSeconds: {{.ActiveDeadlineSeconds}} + template: + metadata: + labels: + group: {{.Group}} + app: {{.Name}} + spec: + serviceAccountName: {{.SAName}} + restartPolicy: Never + # Short grace period: the killer's signal handler exits immediately; + # nothing in the script needs to flush state. + terminationGracePeriodSeconds: 5 + containers: + - name: killer + image: {{.Image}} + # bitnami/kubectl ships kubectl + bash + coreutils (shuf, xargs, + # cut, date) which the kill loop depends on. 
Verified by inspection
+          # of telescope-upstream/modules/kustomize/fio/.../ds.yaml usage.
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              set -o pipefail
+              # Graceful shutdown: SIGTERM from the Job controller (delete or
+              # activeDeadlineSeconds) lands here. We exit 0 so the Job is
+              # marked Succeeded — the rubber-duck critique called out that
+              # an in-flight 143 exit would mark the Job Failed and trigger
+              # junit error.
+              trap 'echo "killer: received SIGTERM, exiting"; exit 0' TERM INT
+
+              KILL_DURATION_SECONDS="${KILL_DURATION_SECONDS:-600}"
+              KILL_INTERVAL_SECONDS="${KILL_INTERVAL_SECONDS:-10}"
+              KILL_BATCH="${KILL_BATCH:-5}"
+              LABEL_SELECTOR="${LABEL_SELECTOR:-group=clustermesh-pod-churn-kill}"
+
+              echo "killer: starting (duration=${KILL_DURATION_SECONDS}s interval=${KILL_INTERVAL_SECONDS}s batch=${KILL_BATCH} selector=${LABEL_SELECTOR})"
+
+              END_EPOCH=$(( $(date +%s) + KILL_DURATION_SECONDS ))
+              ROUND=0
+              KILLED_TOTAL=0
+              while [ "$(date +%s)" -lt "$END_EPOCH" ]; do
+                ROUND=$((ROUND + 1))
+                # List candidate pods cluster-wide matching the label
+                # selector; the jsonpath template emits one
+                # "namespace/name" pair per line.
+                # Random selection: shuf | head -n. head simply returns
+                # fewer than $KILL_BATCH when the candidate pool is small
+                # (mid-cycle, when the ReplicaSet has not yet replaced
+                # previous kills).
+                mapfile -t TARGETS < <(
+                  kubectl get pods -A -l "$LABEL_SELECTOR" \
+                    -o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' \
+                    | shuf | head -n "$KILL_BATCH"
+                )
+
+                if [ "${#TARGETS[@]}" -eq 0 ]; then
+                  echo "killer: round=${ROUND} no candidates matched selector ${LABEL_SELECTOR}"
+                else
+                  for nsname in "${TARGETS[@]}"; do
+                    ns="${nsname%%/*}"
+                    name="${nsname##*/}"
+                    # --grace-period=0 + --force: immediate delete, no graceful
+                    # shutdown wait. Realistic "node failure"-style event for
+                    # the pod-event propagation path.
+ if kubectl delete pod -n "$ns" "$name" \ + --grace-period=0 --force --ignore-not-found \ + > /dev/null 2>&1; then + KILLED_TOTAL=$((KILLED_TOTAL + 1)) + fi + done + echo "killer: round=${ROUND} killed=${#TARGETS[@]} cumulative=${KILLED_TOTAL}" + fi + + sleep "$KILL_INTERVAL_SECONDS" + done + + echo "killer: done duration=${KILL_DURATION_SECONDS}s rounds=${ROUND} cumulative=${KILLED_TOTAL}" + exit 0 + env: + - name: KILL_DURATION_SECONDS + value: "{{.KillDurationSeconds}}" + - name: KILL_INTERVAL_SECONDS + value: "{{.KillIntervalSeconds}}" + - name: KILL_BATCH + value: "{{.KillBatch}}" + - name: LABEL_SELECTOR + value: "{{.WorkloadLabelSelector}}" + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 500m + memory: 256Mi diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-sa.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-sa.yaml new file mode 100644 index 0000000000..d56aed2810 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-killer-sa.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{.Name}} + labels: + group: {{.Group}} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-workload.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-workload.yaml new file mode 100644 index 0000000000..a9229e51f2 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/pod-churn-workload.yaml @@ -0,0 +1,52 @@ +name: clustermesh-pod-churn-workload + +# Workload module shared by both pod-churn scenarios (#2 from scale testing.txt): +# - pod-churn-scale.yaml: deterministic scale-cycle (replicas N → 0 → N → ...). +# - pod-churn-kill.yaml: in-cluster random pod deletion via a killer Job. +# +# Per the rubber-duck critique on the Phase 4a design: we KEEP the Deployment +# and Service object count constant (replicasPerNamespace = deploymentsPerNamespace +# every invocation) and ONLY vary `.spec.replicas` on the underlying Deployment +# via templateFillMap.Replicas. Setting replicasPerNamespace=0 here would DELETE +# the Deployment+Service pair, which churns service-propagation events in +# addition to pod events and changes the scenario semantics. The teardown +# scenario explicitly opts into deletion via actionName=delete (which is what +# CL2's `phases` with replicasPerNamespace=0 in the caller produces). + +{{$actionName := .actionName}} # apply | delete +{{$replicas := DefaultParam .replicas 0}} +{{$namespaces := .namespaces}} +{{$deploymentsPerNamespace := .deploymentsPerNamespace}} +{{$tuningSet := .tuningSet}} +{{$group := DefaultParam .group "clustermesh-pod-churn"}} +{{$basename := DefaultParam .basename "pc"}} + +# delete = drop objects entirely (teardown only). +# apply = keep object count constant, set Deployment .spec.replicas to $replicas. +{{$objectsPerNamespace := $deploymentsPerNamespace}} +{{if eq $actionName "delete"}}{{$objectsPerNamespace = 0}}{{end}} + +steps: + - name: {{$actionName}} pod-churn workload (replicas={{$replicas}}) + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$objectsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: {{$basename}} + objectTemplatePath: /modules/event-throughput-deployment.yaml + templateFillMap: + # Pod count per Deployment is what cycles between $replicasPerDeployment + # and 0 during the scale-cycle scenario. 
The Deployment object itself + # is reapplied (PATCHed) by CL2 every invocation — ReplicaSet generation + # stays stable across replica changes because .spec.template is not + # being modified (no rolling restart). + Replicas: {{$replicas}} + Group: {{$group}} + RestartGeneration: 0 + - basename: {{$basename}} + objectTemplatePath: /modules/event-throughput-service.yaml + templateFillMap: + Group: {{$group}} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churn-combined.yaml b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-combined.yaml new file mode 100644 index 0000000000..e5649e4c73 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-combined.yaml @@ -0,0 +1,221 @@ +name: clustermesh-node-churn-combined + +# Scale scenario #3 (Node Churn / IP Churn) — combined flavor. +# +# Both spec stimuli (scale + replace) driven serially by the SAME +# host-side node-churner.sh invocation (mode=node-churn-combined), +# against the same provisioned clusters. Used for share-infra runs to +# maximize signal per expensive n=20 provision lifecycle. +# +# Sequence on the host (executed by node-churner.sh): +# 1. Wait for ready-sentinels from all clusters. +# 2. Run scale phase ($NODE_CHURN_CYCLES cycles of ±$NODE_CHURN_DELTA). +# 3. Settle $NODE_CHURN_SETTLE_SECONDS. +# 4. Run replace phase (drain + VMSS delete K instances, wait refill). +# 5. EXIT trap restores pool to original_node_count. +# +# CL2-side behavior is identical to node-churn-scale.yaml / +# node-churn-replace.yaml — workload deploy + ready-sentinel + sleep + +# gather — but with a longer sleep window equal to scale + replace +# phase walltimes summed plus settle margin. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} + +# Default 3300s = 55min: 30min scale phase + 25min replace phase + margin. 
+{{$combinedDurationSeconds := DefaultParam .CL2_NODE_CHURN_COMBINED_DURATION_SECONDS 3300}} + +{{$group := "clustermesh-node-churn-combined"}} +{{$basename := "ncc"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ncc + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ncc" + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + - name: Start tracking node-churn-combined Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Wait for initial node-churn-combined pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before node-churn-combined stimulus window + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + - name: Signal CL2 ready to host-side node-churner + measurements: + - Identifier: NodeChurnReadySentinel + Method: Exec + Params: + streamOutput: true + timeout: 30s + command: + - bash + - /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh + - /root/perf-tests/clusterloader2/config/sentinels + + - name: Observe node-churn-combined stimulus window + measurements: + - Identifier: NodeChurnObservationSleep + Method: Sleep + Params: + duration: {{$combinedDurationSeconds}}s + + - name: Wait for post-node-churn-combined pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined-final + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - name: Final wait for pods to converge after node-churn-combined + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-combined-final + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after 
node-churn-combined + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: gather + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churn-replace.yaml b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-replace.yaml new file mode 100644 index 0000000000..58a27c2cd5 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-replace.yaml @@ -0,0 +1,228 @@ +name: clustermesh-node-churn-replace + +# Scale scenario #3 (Node Churn / IP Churn) — node-replacement flavor. +# +# Spec mapping (scale testing.txt:68-79): +# * "Node replacement (new IPs)" / "Force node recreation" → this file. +# * "Node scale-up/scale-down" / "Add/remove nodes continuously" → node-churn-scale.yaml. +# +# Stimulus mechanism: host-side node-churner.sh DRAINS K nodes (via kubectl) +# then DELETES their VMSS instances (via `az vmss delete-instances`). AKS +# nodepool desired-count stays fixed (auto_scaling_enabled=false) so VMSS +# auto-replaces deleted instances with brand-new VMs that get brand-new +# private IPs. Result: K nodes effectively replaced with new identity + +# new IPs, same total count. Pre/post InternalIP snapshots in the timing +# JSON let Kusto verify the IP set actually churned. +# +# Why VMSS delete-instances rather than `az aks nodepool upgrade --node-image-only`: +# rubber-duck design review #2 — the upgrade short-circuits as a no-op +# when the node image is already current, producing zero IP churn signal. +# VMSS instance delete is mechanism-pure: deleted = gone, replacement = +# new VM with new private IP, every time. +# +# CL2-side behavior is symmetric with node-churn-scale: every cluster +# deploys workload, signals ready-sentinel, sleeps for +# CL2_NODE_CHURN_REPLACE_DURATION_SECONDS, gathers. See node-churn-scale.yaml +# for the per-step rationale. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} + +# Default 1500s = 25min covers VMSS delete-and-refill for K=10 instances +# in parallel: each drain ≤ 5min + parallel VMSS provisioning ≤ 15min. 
+{{$replaceDurationSeconds := DefaultParam .CL2_NODE_CHURN_REPLACE_DURATION_SECONDS 1500}} + +{{$group := "clustermesh-node-churn-replace"}} +{{$basename := "ncr"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ncr + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ncr" + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + - name: Start tracking node-churn-replace Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Wait for initial node-churn-replace pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before node-churn-replace stimulus window + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + - name: Signal CL2 ready to host-side node-churner + measurements: + - Identifier: NodeChurnReadySentinel + Method: Exec + Params: + streamOutput: true + timeout: 30s + command: + - bash + - /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh + - /root/perf-tests/clusterloader2/config/sentinels + + - name: Observe node-churn-replace stimulus window + measurements: + - Identifier: NodeChurnObservationSleep + Method: Sleep + Params: + duration: {{$replaceDurationSeconds}}s + + - name: Wait for post-node-churn-replace pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace-final + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - name: Final wait for pods to converge after node-churn-replace + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-replace-final + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after 
node-churn-replace + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: gather + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churn-scale.yaml b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-scale.yaml new file mode 100644 index 0000000000..62ae135801 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churn-scale.yaml @@ -0,0 +1,248 @@ +name: clustermesh-node-churn-scale + +# Scale scenario #3 (Node Churn / IP Churn) — scale-cycle flavor. +# +# Spec mapping (scale testing.txt:68-79): +# * "Node scale-up/scale-down" / "Add/remove nodes continuously" → this file. +# * "Node replacement (new IPs)" / "Force node recreation" → node-churn-replace.yaml. +# +# CRITICAL: the actual node-scaling stimulus is driven OUTSIDE CL2 by +# node-churner.sh (launched from steps/engine/clusterloader2/clustermesh-scale/execute.yml +# as a background subshell on the AzDO agent). Reason: the CL2 docker image +# (ghcr.io/azure/clusterloader2) has no `az` CLI and we don't control its +# build. Every cluster's CL2 just deploys a baseline pod workload, registers +# measurements, writes a ready-sentinel, then SLEEPS for +# CL2_NODE_CHURN_SCALE_DURATION_SECONDS — long enough for the churner to do +# its work + a settle window. After the sleep, gather + teardown. +# +# Per-cluster ready-sentinel: +# The "Signal ready to host churner" step writes +# /root/perf-tests/clusterloader2/config/sentinels/ready- via +# Method:Exec. The host-side node-churner.sh polls this dir for +# $cluster_count sentinels before firing its first nodepool op. Without +# this barrier, the churner could fire before peers' Prometheus is +# scraping — losing the propagation signal (rubber-duck design review #1). + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} + +# Sleep window — must be ≥ host-side churner's expected wall time. +# Default 1800s = 30min covers 3 cycles × 2 ops × ~4min = 24min churner + +# settle margin. Per-tier overrides via matrix var +# node_churn_scale_duration_seconds (auto-exported). 
+{{$scaleDurationSeconds := DefaultParam .CL2_NODE_CHURN_SCALE_DURATION_SECONDS 1800}} + +{{$group := "clustermesh-node-churn-scale"}} +{{$basename := "ncs"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ncs + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # See pod-churn-scale.yaml header for full context. Without this, + # cross-cluster identity/endpoint propagation is structurally 0. + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ncs" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy: pause pods spread across nodes so node churn ----- + # ----- naturally evicts a representative sample. topologySpread comes ----- + # ----- from pod-churn-workload.yaml's default Deployment shape (NOT a ----- + # ----- new module) — rubber-duck #8 noted distribution risk but the ----- + # ----- reused workload template already has it. ----- + - name: Start tracking node-churn-scale Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Wait for initial node-churn-scale pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before node-churn stimulus window + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- Signal ready to host-side node-churner.sh ----- + # bind-mounted config dir = /root/perf-tests/clusterloader2/config in the + # CL2 container == $CL2_CONFIG_DIR on the host. The sentinels/ subdir is + # pre-created by execute.yml; we write one file per cluster named after + # the kubectl context. node-churner.sh polls for $cluster_count files + # before its first nodepool op. 
+ - name: Signal CL2 ready to host-side node-churner + measurements: + - Identifier: NodeChurnReadySentinel + Method: Exec + Params: + streamOutput: true + timeout: 30s + command: + - bash + - /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh + - /root/perf-tests/clusterloader2/config/sentinels + + # ----- Sleep window — host-side node-churner.sh churns nodes on target ----- + # ----- cluster during this period; peers observe via measurements. ----- + - name: Observe node-churn stimulus window + measurements: + - Identifier: NodeChurnObservationSleep + Method: Sleep + Params: + duration: {{$scaleDurationSeconds}}s + + # ----- Final convergence ----- + - name: Wait for post-node-churn pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale-final + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$group}} + operationTimeout: {{$operationTimeout}} + + - name: Final wait for pods to converge after node-churn + measurements: + - Identifier: WaitForControlledPodsRunning-node-churn-scale-final + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after node-churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/node-churn.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh b/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh new file mode 100755 index 0000000000..3c00b0d96a --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh @@ -0,0 +1,1012 @@ +#!/bin/bash +# Scale scenario #3 (Node Churn / IP Churn) — drives node-level perturbation +# on the target cluster while CL2 measures across all clusters. +# +# Why this runs OUTSIDE CL2 (from execute.yml, NOT Method:Exec): +# The CL2 docker image (ghcr.io/azure/clusterloader2) has no `az` CLI and +# we don't control its build. `az` is a Python wheel with hundreds of MB +# of dependencies; pre-staging it the way we pre-stage the single-binary +# `kubectl` isn't feasible. So this script runs on the AzDO agent in a +# background subshell launched from execute.yml, in PARALLEL with the +# CL2 fanout (execute-parallel). CL2 on every cluster deploys baseline +# workload + measurements and sleeps for the scenario's duration window; +# the host-side churner drives the actual node ops; they meet again when +# execute.yml `wait`s for the churner PID after execute-parallel returns. 
+#
+# Spec mapping (scale testing.txt:68-79):
+#   * "Node scale-up/scale-down" + "Add/remove nodes continuously" → SCALE
+#     scenario: cycle target's `default` pool count ±$DELTA for $CYCLES.
+#   * "Node replacement (new IPs)" + "Force node recreation" → REPLACE
+#     scenario: drain K nodes; `az vmss delete-instances` drops VMSS capacity
+#     by K; then explicitly `az aks nodepool scale --node-count $ORIGINAL`
+#     to refill (AKS doesn't auto-refill after delete-instances — build 67133
+#     lesson). VMSS picks the next available instance IDs and provisions
+#     brand-new VMs with brand-new private IPs.
+#   * "Observe: IP update propagation, Temporary inconsistency windows" →
+#     pre/post node InternalIP snapshots, per-op duration, observed node
+#     count post-op. Peer-side propagation is captured by the parallel
+#     CL2 measurements (cilium / clustermesh-metrics / node-churn.yaml).
+#
+# Sentinel-based readiness barrier (rubber-duck design review blocker #1):
+#   Per-cluster CL2 writes $SENTINEL_DIR/ready-<context> as the FIRST
+#   measurement step. The churner waits up to NODE_CHURN_READY_TIMEOUT_SECONDS
+#   for ALL $CLUSTER_COUNT sentinels before the first nodepool op, so peers
+#   are confirmed observing before stimulus begins. If quorum isn't reached,
+#   the churner aborts WITH cleanup (restores the pool to its original count)
+#   and emits scenario_valid=false so Kusto queries can drop the run.
+#
+# Trap-based finalizer (rubber-duck blocker #4):
+#   An EXIT trap unconditionally restores the target pool to the original
+#   node count and waits for Succeeded + Ready, capped at
+#   NODE_CHURN_FINALIZER_TIMEOUT_SECONDS. If the finalizer can't restore, it
+#   emits cleanup_failed=true and execute.yml breaks out of the share-infra
+#   loop (no further scenarios run on a half-scaled cluster).
+#
+# Positional args (passed by execute.yml):
+#   $1  SCENARIO                      node-churn-{scale,replace,combined}
+#   $2  TARGET_CLUSTER_NAME           AKS cluster name (== kubectl context)
+#   $3  TARGET_RESOURCE_GROUP         AKS RG (same RG as `az aks show`)
+#   $4  TARGET_NODEPOOL               workload pool name (always `default`)
+#   $5  REPORT_DIR                    absolute path; timing JSON lands here
+#   $6  SENTINEL_DIR                  absolute path; CL2 writes sentinels here
+#   $7  CLUSTER_COUNT                 expected number of ready sentinels
+#   $8  NODE_CHURN_CYCLES             SCALE: cycles of (up+down)
+#   $9  NODE_CHURN_DELTA              SCALE: ±N per half-cycle
+#   $10 NODE_CHURN_SETTLE_SECONDS     sleep between ops
+#   $11 NODE_REPLACE_BATCH_SIZE       REPLACE: # of VMSS instances to delete
+#   $12 NODE_CHURN_READY_TIMEOUT_SECONDS  ready-sentinel poll timeout
+#   $13 EXPECTED_DURATION_SECONDS     CL2's matching sleep window
+#   $14 TARGET_KUBECONFIG             absolute path to the target's kubeconfig
+#                                     (from $HOME/.kube/mesh-<n>.config; passed
+#                                     explicitly so we don't have to derive the
+#                                     role from target_cluster_name)
+#
+# Exit codes:
+#   0 — always (soft-fail). The timing JSON's scenario_valid / cleanup_failed /
+#       per-op succeeded flags are the load-bearing signals. Exiting non-zero
+#       would cascade-fail the CL2 step → AzDO marks the step failed → collect
+#       still runs (because execute.yml's share-infra loop also soft-fails)
+#       but the AzDO UI gets noisier than the actual data quality.
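+#
+# Example invocation (illustrative values only; argument order matches the
+# positional contract above):
+#   ./node-churner.sh node-churn-scale clustermesh-1 my-rg default \
+#     /data/results /data/cl2-config/sentinels 3 3 5 60 10 300 1500 \
+#     "$HOME/.kube/mesh-1.config"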
+
+set -uo pipefail
+
+SCENARIO="${1:?scenario required: node-churn-scale|node-churn-replace|node-churn-combined}"
+TARGET_CLUSTER_NAME="${2:?target cluster name required}"
+TARGET_RESOURCE_GROUP="${3:?target resource group required}"
+TARGET_NODEPOOL="${4:-default}"
+REPORT_DIR="${5:?report dir required}"
+SENTINEL_DIR="${6:?sentinel dir required}"
+CLUSTER_COUNT="${7:?cluster count required}"
+NODE_CHURN_CYCLES="${8:-3}"
+NODE_CHURN_DELTA="${9:-5}"
+NODE_CHURN_SETTLE_SECONDS="${10:-60}"
+NODE_REPLACE_BATCH_SIZE="${11:-10}"
+NODE_CHURN_READY_TIMEOUT_SECONDS="${12:-300}"
+EXPECTED_DURATION_SECONDS="${13:-1500}"
+TARGET_KUBECONFIG="${14:-}"
+
+# Internal bounds (not exposed via positional args — fine-tuned per scenario
+# class, not per matrix entry).
+NODE_CHURN_OP_TIMEOUT_SECONDS=900        # per `az aks nodepool scale` op
+NODE_CHURN_FINALIZER_TIMEOUT_SECONDS=900 # cleanup pool restore
+NODE_REPLACE_DRAIN_TIMEOUT_SECONDS=300   # per node drain
+NODE_REPLACE_WAIT_TIMEOUT_SECONDS=1500   # for kubelet Ready after refill
+                                         # (build 67133: bumped 1200→1500 —
+                                         # refill provisioning + bootstrap can
+                                         # take 12-15 min on a fresh VM)
+
+mkdir -p "$REPORT_DIR" "$SENTINEL_DIR"
+TIMING_FILE="${REPORT_DIR}/NodeChurnTimings_${TARGET_CLUSTER_NAME}.json"
+
+log() {
+  echo "node-churner: $*"
+}
+
+err() {
+  echo "node-churner ERROR: $*" >&2
+}
+
+# Resolve kubectl — prefer PATH; fall back to the pre-staged binary that
+# execute.yml puts at $CL2_CONFIG_DIR/kubectl for Method:Exec scripts. The
+# host AzDO agent should already have kubectl, but we don't want a brittle
+# dependency on agent image version. SENTINEL_DIR is $CL2_CONFIG_DIR/sentinels
+# by execute.yml's convention, so its parent is $CL2_CONFIG_DIR.
+if command -v kubectl >/dev/null 2>&1; then
+  KUBECTL=kubectl
+elif [ -x "${SENTINEL_DIR%/sentinels*}/kubectl" ]; then
+  KUBECTL="${SENTINEL_DIR%/sentinels*}/kubectl"
+  log "using pre-staged kubectl at ${KUBECTL}"
+else
+  err "kubectl not in PATH and no pre-staged binary found at ${SENTINEL_DIR%/sentinels*}/kubectl"
+  KUBECTL=""
+fi
+
+if ! command -v az >/dev/null 2>&1; then
+  err "az CLI not in PATH on AzDO agent — cannot run node-churn scenario; aborting"
+  # Minimal abort record — a field subset of write_timing_file's output.
+  cat > "$TIMING_FILE" <<EOF
+{"scenario":"${SCENARIO}","target_cluster_name":"${TARGET_CLUSTER_NAME}","scenario_valid":false,"cleanup_failed":false,"abort_reason":"az CLI not in PATH on agent","ops":[]}
+EOF
+  exit 0
+fi
+
+if ! command -v jq >/dev/null 2>&1; then
+  err "jq not in PATH on AzDO agent — required for timing JSON construction; aborting"
+  # We can't use jq for the partial JSON, but an inline heredoc doesn't
+  # depend on jq.
+  cat > "$TIMING_FILE" <<EOF
+{"scenario":"${SCENARIO}","target_cluster_name":"${TARGET_CLUSTER_NAME}","scenario_valid":false,"cleanup_failed":false,"abort_reason":"jq not in PATH on agent","ops":[]}
+EOF
+  exit 0
+fi
+
+DEBUG_LOG="${REPORT_DIR}/NodeChurnDebug_${TARGET_CLUSTER_NAME}.log"
+: > "$DEBUG_LOG"
+
+# State vars referenced by debug_dump — initialized early so any abort
+# path (before main scenario dispatch) can call debug_dump safely under
+# `set -u`. They're re-initialized to their authoritative values later
+# when the scenario actually runs.
+STARTED_EPOCH=$(date +%s) +READY_QUORUM_REACHED=false +SCENARIO_VALID=true +CLEANUP_FAILED=false +TRUNCATED=false +CIRCUIT_BROKEN=false +OPS_JSON='[]' +ORIGINAL_NODE_COUNT=0 +NODE_RESOURCE_GROUP="" +TARGET_VMSS="" + +debug_dump() { + local _label="$1" + { + echo "" + echo "================================================================" + echo "=== ${_label} at $(date -u +"%Y-%m-%dT%H:%M:%SZ") (epoch=$(date +%s))" + echo "================================================================" + echo "-- runtime params --" + echo "scenario=${SCENARIO} target_cluster_name=${TARGET_CLUSTER_NAME} target_rg=${TARGET_RESOURCE_GROUP}" + echo "target_nodepool=${TARGET_NODEPOOL} target_vmss=${TARGET_VMSS:-unset} NRG=${NODE_RESOURCE_GROUP:-unset}" + echo "original_node_count=${ORIGINAL_NODE_COUNT:-unset} cluster_count_quorum=${CLUSTER_COUNT}" + echo "ready_quorum_reached=${READY_QUORUM_REACHED} scenario_valid=${SCENARIO_VALID} circuit_broken=${CIRCUIT_BROKEN} cleanup_failed=${CLEANUP_FAILED} truncated=${TRUNCATED}" + echo "TARGET_KUBECONFIG=${TARGET_KUBECONFIG:-unset} KUBECTL=${KUBECTL:-unset}" + echo "" + echo "-- sentinel dir listing (${SENTINEL_DIR}) --" + ls -la "$SENTINEL_DIR" 2>&1 || echo "(ls failed)" + echo "" + echo "-- az aks nodepool show (target) --" + az aks nodepool show \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --query '{count:count, provisioningState:provisioningState, powerState:powerState, vmSize:vmSize}' \ + -o json 2>&1 || echo "(az aks nodepool show failed)" + echo "" + if [ -n "${TARGET_VMSS:-}" ] && [ -n "${NODE_RESOURCE_GROUP:-}" ]; then + echo "-- az vmss show (target VMSS sku.capacity) --" + az vmss show --resource-group "$NODE_RESOURCE_GROUP" --name "$TARGET_VMSS" \ + --query '{capacity:sku.capacity, provisioningState:provisioningState}' \ + -o json 2>&1 || echo "(az vmss show failed)" + echo "" + echo "-- az vmss list-instances (count + ids) --" + az vmss list-instances --resource-group "$NODE_RESOURCE_GROUP" --name "$TARGET_VMSS" \ + --query 'length([])' -o tsv 2>&1 || echo "(az vmss list-instances failed)" + fi + echo "" + if [ -n "${KUBECTL:-}" ] && [ -n "${TARGET_KUBECONFIG:-}" ] && [ -f "$TARGET_KUBECONFIG" ]; then + echo "-- kubectl get nodes (target cluster) --" + KUBECONFIG="$TARGET_KUBECONFIG" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -l "agentpool=${TARGET_NODEPOOL}" -o wide 2>&1 | head -30 || echo "(kubectl get nodes failed)" + echo "" + echo "-- target node internal IPs --" + KUBECONFIG="$TARGET_KUBECONFIG" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -l "agentpool=${TARGET_NODEPOOL}" \ + -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}' 2>&1 || true + else + echo "-- kubectl skipped (no KUBECTL or kubeconfig) --" + fi + echo "" + echo "-- ops recorded so far --" + echo "$OPS_JSON" | jq -r '.[] | "op#\(.op_index) \(.op_type) succeeded=\(.succeeded) duration=\(.duration_seconds)s observed_nodes=\(.observed_node_count) error=\"\(.error)\""' 2>&1 || echo "$OPS_JSON" + echo "================================================================" + echo "" + } | tee -a "$DEBUG_LOG" +} + +# write_aborted_timing — emit a minimal timing JSON for any early-exit +# code path (az missing, jq missing, can't resolve nodepool / VMSS, etc.) +# so collect.py picks up evidence that the scenario was attempted. 
+write_aborted_timing() {
+  local _msg="$1"
+  local _now
+  _now=$(date +%s)
+  # Minimal abort record — a field subset of write_timing_file's output.
+  cat > "$TIMING_FILE" <<EOF
+{"scenario":"${SCENARIO}","target_cluster_name":"${TARGET_CLUSTER_NAME}","target_nodepool":"${TARGET_NODEPOOL}","scenario_valid":false,"cleanup_failed":false,"abort_reason":"${_msg}","started_epoch":${STARTED_EPOCH},"ended_epoch":${_now},"ops":[]}
+EOF
+}
+
+# Resolve the pool's current node count up front — it's the restore target
+# the finalizer scales back to, so every later op depends on it.
+ORIGINAL_NODE_COUNT=$(az aks nodepool show \
+  --cluster-name "$TARGET_CLUSTER_NAME" \
+  --resource-group "$TARGET_RESOURCE_GROUP" \
+  --name "$TARGET_NODEPOOL" \
+  --query count -o tsv 2>/dev/null || echo "")
+if [ -z "$ORIGINAL_NODE_COUNT" ] || ! [[ "$ORIGINAL_NODE_COUNT" =~ ^[0-9]+$ ]]; then
+  err "could not resolve original node count for ${TARGET_CLUSTER_NAME}/${TARGET_NODEPOOL}; aborting"
+  write_aborted_timing "could not resolve original node count for ${TARGET_CLUSTER_NAME}/${TARGET_NODEPOOL}"
+  exit 0
+fi
+log "original node count = ${ORIGINAL_NODE_COUNT}"
+
+# AKS puts the VMSS in the node resource group ("MC_<rg>_<cluster>_<region>").
+NODE_RESOURCE_GROUP=$(az aks show \
+  --resource-group "$TARGET_RESOURCE_GROUP" \
+  --name "$TARGET_CLUSTER_NAME" \
+  --query nodeResourceGroup -o tsv 2>/dev/null || echo "")
+if [ -z "$NODE_RESOURCE_GROUP" ]; then
+  err "could not resolve nodeResourceGroup for ${TARGET_CLUSTER_NAME}; aborting"
+  write_aborted_timing "could not resolve nodeResourceGroup for ${TARGET_CLUSTER_NAME}"
+  exit 0
+fi
+
+# Discover the VMSS backing this nodepool. AKS tags the VMSS with
+# aks-managed-poolName=<pool>. Exactly one match expected.
+TARGET_VMSS=$(az vmss list \
+  --resource-group "$NODE_RESOURCE_GROUP" \
+  --query "[?tags.\"aks-managed-poolName\"=='${TARGET_NODEPOOL}'].name | [0]" \
+  -o tsv 2>/dev/null || echo "")
+if [ -z "$TARGET_VMSS" ]; then
+  err "could not resolve VMSS for pool ${TARGET_NODEPOOL} in ${NODE_RESOURCE_GROUP}; aborting"
+  write_aborted_timing "could not resolve VMSS for pool ${TARGET_NODEPOOL} in ${NODE_RESOURCE_GROUP}"
+  exit 0
+fi
+log "target VMSS=${TARGET_VMSS} in NRG=${NODE_RESOURCE_GROUP}"
+
+# -----------------------------------------------------------------------------
+# Timing-JSON accumulator. We keep state in shell vars + an ops jq array, and
+# rewrite the timing file at every milestone so a crashed/SIGKILL'd run still
+# leaves a partial-state file behind.
+#
+# Note: STARTED_EPOCH / *_FAILED / *_VALID / OPS_JSON are already initialized
+# above (right after DEBUG_LOG) so debug_dump is callable from any early-exit
+# path. Don't re-initialize here.
+# ----------------------------------------------------------------------------- + +write_timing_file() { + local _ended _dur + _ended=$(date +%s) + _dur=$(( _ended - STARTED_EPOCH )) + jq -n \ + --arg scenario "$SCENARIO" \ + --arg target_context "$TARGET_CLUSTER_NAME" \ + --arg target_cluster_name "$TARGET_CLUSTER_NAME" \ + --arg target_resource_group "$TARGET_RESOURCE_GROUP" \ + --arg target_nodepool "$TARGET_NODEPOOL" \ + --arg target_node_resource_group "$NODE_RESOURCE_GROUP" \ + --arg target_vmss "$TARGET_VMSS" \ + --argjson original_node_count "$ORIGINAL_NODE_COUNT" \ + --argjson ready_quorum_reached "$READY_QUORUM_REACHED" \ + --argjson scenario_valid "$SCENARIO_VALID" \ + --argjson cleanup_failed "$CLEANUP_FAILED" \ + --argjson truncated "$TRUNCATED" \ + --argjson started_epoch "$STARTED_EPOCH" \ + --argjson ended_epoch "$_ended" \ + --argjson duration_seconds "$_dur" \ + --argjson ops "$OPS_JSON" \ + '{scenario:$scenario, target_context:$target_context, + target_cluster_name:$target_cluster_name, + target_resource_group:$target_resource_group, + target_nodepool:$target_nodepool, + target_node_resource_group:$target_node_resource_group, + target_vmss:$target_vmss, + original_node_count:$original_node_count, + ready_quorum_reached:$ready_quorum_reached, + scenario_valid:$scenario_valid, + cleanup_failed:$cleanup_failed, + truncated:$truncated, + started_epoch:$started_epoch, + ended_epoch:$ended_epoch, + duration_seconds:$duration_seconds, + ops:$ops}' > "${TIMING_FILE}.tmp" && mv "${TIMING_FILE}.tmp" "$TIMING_FILE" +} + +# Append one op record to OPS_JSON. Args: +# $1 op_index, $2 op_type, $3 start_epoch, $4 end_epoch, +# $5 succeeded (true|false), $6 observed_node_count, +# $7 pre_state_json — JSON object {"ips":[...], "names":[...]} ('{}' = empty) +# $8 post_state_json — JSON object {"ips":[...], "names":[...]} ('{}' = empty) +# $9 error_message (empty string OK) +# +# Build 67155 lesson: pre_ip_set/post_ip_set alone is a FLAWED replacement +# signal because Azure VNet allocator immediately reuses freed private IPs +# (we deleted vmss-instance 19 at 10.1.0.19; the replacement got 10.1.0.19 +# again). Authoritative signal is NODE NAME delta (VMSS instance IDs are +# monotonic — vmss00000j → vmss00000k — not reused). jq below computes +# BOTH new_ip_count and new_node_count; downstream queries should prefer +# new_node_count for "did replacement actually happen". +record_op() { + local _idx="$1" _type="$2" _t0="$3" _t1="$4" _ok="$5" _ncount="$6" + local _pre="$7" _post="$8" _err="${9:-}" + local _dur=$(( _t1 - _t0 )) + OPS_JSON=$(jq -c \ + --argjson idx "$_idx" \ + --arg type "$_type" \ + --argjson t0 "$_t0" \ + --argjson t1 "$_t1" \ + --argjson dur "$_dur" \ + --argjson ok "$_ok" \ + --argjson ncount "$_ncount" \ + --argjson pre "$_pre" \ + --argjson post "$_post" \ + --arg err "$_err" \ + '. + [{ + op_index:$idx, op_type:$type, start_epoch:$t0, end_epoch:$t1, + duration_seconds:$dur, succeeded:$ok, observed_node_count:$ncount, + pre_ip_set: ($pre.ips // []), + post_ip_set: ($post.ips // []), + pre_node_names: ($pre.names // []), + post_node_names: ($post.names // []), + new_ip_count: ([($post.ips // [])[] | select(. as $p | (($pre.ips // []) | index($p)) | not)] | length), + new_node_count: ([($post.names // [])[] | select(. as $p | (($pre.names // []) | index($p)) | not)] | length), + error:$err + }]' \ + <<< "$OPS_JSON") + write_timing_file +} + +# Wait for VMSS provisioningState=Succeeded with timeout. Returns 0 on success, +# 1 on timeout. Polls every 10s. 
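+# (Despite the name, it polls the nodepool's provisioningState via
+# `az aks nodepool show`, i.e. the AKS-side view of the underlying VMSS op.)
+# Callers use it both as a pre-op gate and as circuit-breaker input; the
+# pattern repeated below is roughly:
+#   if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then
+#     CIRCUIT_BROKEN=true; SCENARIO_VALID=false   # skip remaining ops
+#   fi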
+wait_vmss_succeeded() { + local _timeout="${1:-$NODE_CHURN_OP_TIMEOUT_SECONDS}" + local _deadline=$(( $(date +%s) + _timeout )) + while [ "$(date +%s)" -lt "$_deadline" ]; do + local _state + _state=$(az aks nodepool show \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --query provisioningState -o tsv 2>/dev/null || echo "Unknown") + if [ "$_state" = "Succeeded" ]; then + return 0 + fi + sleep 10 + done + return 1 +} + +# Resolve target kubeconfig — TARGET_KUBECONFIG (positional arg 14) is +# the authoritative path passed by execute.yml from clusters_with_kubeconfig. +# Fallbacks (legacy / robustness) below. +resolve_target_kubeconfig() { + local _kc="$TARGET_KUBECONFIG" + if [ -n "$_kc" ] && [ -f "$_kc" ]; then + echo "$_kc"; return + fi + _kc="$HOME/.kube/mesh-${TARGET_CLUSTER_NAME#clustermesh-}.config" + if [ -f "$_kc" ]; then + echo "$_kc"; return + fi + _kc="$HOME/.kube/config" + if [ -f "$_kc" ]; then + echo "$_kc"; return + fi + echo "" +} + +# Run `kubectl get nodes -o json` against the target cluster, capturing +# BOTH stdout and stderr. Logs stderr to DEBUG_LOG so we can postmortem +# failure modes (auth errors, network, label-selector drift) — build +# 67126 lost this visibility because the old kubectl invocations had +# `2>/dev/null`. +# +# Returns 0 on success and prints the JSON to stdout; returns 1 on +# kubectl failure and prints nothing. +target_kubectl_get_nodes_json() { + local _kc _out _rc + _kc=$(resolve_target_kubeconfig) + if [ -z "$_kc" ] || [ -z "$KUBECTL" ]; then + { + echo "===== kubectl get nodes: NO kubeconfig/kubectl ($(date -u +%FT%TZ)) =====" + echo "TARGET_KUBECONFIG=${TARGET_KUBECONFIG:-unset}" + echo "resolved=${_kc:-empty} KUBECTL=${KUBECTL:-empty}" + } >> "$DEBUG_LOG" + return 1 + fi + _out=$(KUBECONFIG="$_kc" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -o json 2>>"$DEBUG_LOG") + _rc=$? + if [ "$_rc" -ne 0 ] || [ -z "$_out" ]; then + { + echo "===== kubectl get nodes FAILED rc=${_rc} at $(date -u +%FT%TZ) =====" + echo "kubeconfig=${_kc} context=${TARGET_CLUSTER_NAME}" + echo "(stderr appended above by 2>>)" + } >> "$DEBUG_LOG" + return 1 + fi + echo "$_out" + return 0 +} + +# Filter nodes by TARGET_VMSS providerID — robust against AKS agentpool +# label key drift (newer AKS clusters prefer kubernetes.azure.com/agentpool +# over the legacy `agentpool` key). VMSS name is unique within the cluster +# and exact-match; also implicitly excludes prompool VMSS. +# +# Emits "node_name vmss_instance_id" lines on stdout, one per matched node. +target_nodes_in_target_vmss() { + local _json + _json=$(target_kubectl_get_nodes_json) || return 1 + echo "$_json" | jq -r --arg vmss "$TARGET_VMSS" ' + .items[] + | select(.spec.providerID + | contains("/virtualMachineScaleSets/" + $vmss + "/virtualMachines/")) + | "\(.metadata.name) " + (.spec.providerID | split("/virtualMachines/")[1]) + ' 2>>"$DEBUG_LOG" +} + +# Observe current node count on target cluster from K8s side. Returns "" on +# kubectl failure — caller treats as "unknown observed count". +observe_node_count() { + local _lines + _lines=$(target_nodes_in_target_vmss) || { echo ""; return; } + echo "$_lines" | grep -c . | tr -d ' ' +} + +# Snapshot current Internal IPs AND node names for nodes in TARGET_VMSS. +# Returns a JSON object {"ips":[...], "names":[...]} on stdout. +# +# Build 67155 lesson: capture BOTH ips and names. 
IPs alone are unreliable +# as a replacement signal because Azure VNet allocator immediately reuses +# freed IPs. VMSS instance IDs (embedded in node names) are monotonic → +# names are the authoritative replacement signal. +# +# On kubectl failure, returns '{"ips":[],"names":[]}' (jq logic later +# handles empty arrays correctly: new_*_count == count of "post" entries). +snapshot_node_state() { + local _json + _json=$(target_kubectl_get_nodes_json) || { echo '{"ips":[],"names":[]}'; return; } + echo "$_json" | jq -c --arg vmss "$TARGET_VMSS" ' + [ .items[] + | select(.spec.providerID + | contains("/virtualMachineScaleSets/" + $vmss + "/virtualMachines/")) + ] as $matched + | { + ips: [$matched[] | .status.addresses[] | select(.type=="InternalIP") | .address], + names: [$matched[] | .metadata.name] + }' 2>>"$DEBUG_LOG" || echo '{"ips":[],"names":[]}' +} + +# Legacy compatibility shim — some call sites only need the IP set. +# New code should prefer snapshot_node_state. +snapshot_node_ips() { + snapshot_node_state | jq -c '.ips' 2>>"$DEBUG_LOG" || echo "[]" +} + +# ----------------------------------------------------------------------------- +# Finalizer — runs on EVERY exit path (trap). Idempotent. +# ----------------------------------------------------------------------------- +finalizer() { + local _exit_rc=$? + log "finalizer: starting (exit_rc=${_exit_rc}); restoring pool to original_node_count=${ORIGINAL_NODE_COUNT}" + local _current + _current=$(az aks nodepool show \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --query count -o tsv 2>/dev/null || echo "$ORIGINAL_NODE_COUNT") + if [ "$_current" = "$ORIGINAL_NODE_COUNT" ]; then + log "finalizer: pool already at original_node_count; checking provisioningState" + if wait_vmss_succeeded "$NODE_CHURN_FINALIZER_TIMEOUT_SECONDS"; then + log "finalizer: pool already restored and Succeeded" + write_timing_file + return 0 + fi + log "finalizer: pool count matches but provisioningState != Succeeded; will explicitly scale to nudge reconcile" + fi + # Build 67170 lesson: prior scale ops may have failed mid-scenario while + # AKS was still Updating. Wait for Succeeded before issuing the explicit + # scale-back-to-original — otherwise this scale fails with the SAME + # OperationNotAllowed error and cleanup_failed=true cascades incorrectly. + if ! wait_vmss_succeeded "$NODE_CHURN_FINALIZER_TIMEOUT_SECONDS"; then + err "finalizer: provisioningState never reached Succeeded within ${NODE_CHURN_FINALIZER_TIMEOUT_SECONDS}s; cannot proceed with restore" + CLEANUP_FAILED=true + debug_dump "FINALIZER cleanup_failed (waited for Succeeded; never got there)" + write_timing_file + return 1 + fi + # Stderr captured to debug log (build 67170 lesson: the prior >/dev/null + # 2>&1 swallowed the real error message; we ended up guessing). + if ! 
az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$ORIGINAL_NODE_COUNT" \ + --no-wait --only-show-errors 2>/tmp/node-churner-finalizer.err; then + local _finalizer_err + _finalizer_err=$(tr '\n' ' ' < /tmp/node-churner-finalizer.err | head -c 500) + err "finalizer: az aks nodepool scale to ${ORIGINAL_NODE_COUNT} failed: ${_finalizer_err}" + echo "===== finalizer az error =====" >> "$DEBUG_LOG" + cat /tmp/node-churner-finalizer.err >> "$DEBUG_LOG" + echo "===== end finalizer az error =====" >> "$DEBUG_LOG" + CLEANUP_FAILED=true + debug_dump "FINALIZER cleanup_failed (az aks nodepool scale to original failed)" + write_timing_file + return 1 + fi + if ! wait_vmss_succeeded "$NODE_CHURN_FINALIZER_TIMEOUT_SECONDS"; then + err "finalizer: pool did NOT reach Succeeded within ${NODE_CHURN_FINALIZER_TIMEOUT_SECONDS}s" + CLEANUP_FAILED=true + debug_dump "FINALIZER cleanup_failed (provisioningState != Succeeded)" + write_timing_file + return 1 + fi + log "finalizer: pool restored to ${ORIGINAL_NODE_COUNT}, Succeeded" + write_timing_file + return 0 +} +trap finalizer EXIT + +# Initial state — write the file so even an early abort leaves a row. +write_timing_file + +# ----------------------------------------------------------------------------- +# Ready-sentinel barrier +# ----------------------------------------------------------------------------- +log "ready-barrier: waiting for ${CLUSTER_COUNT} CL2 sentinel(s) in ${SENTINEL_DIR}" +BARRIER_DEADLINE=$(( $(date +%s) + NODE_CHURN_READY_TIMEOUT_SECONDS )) +while [ "$(date +%s)" -lt "$BARRIER_DEADLINE" ]; do + _count=$(find "$SENTINEL_DIR" -maxdepth 1 -name 'ready-*' -type f 2>/dev/null | wc -l | tr -d ' ') + if [ "$_count" -ge "$CLUSTER_COUNT" ]; then + log "ready-barrier: quorum reached (${_count}/${CLUSTER_COUNT})" + READY_QUORUM_REACHED=true + write_timing_file + break + fi + sleep 5 +done +if [ "$READY_QUORUM_REACHED" != true ]; then + err "ready-barrier: quorum NOT reached after ${NODE_CHURN_READY_TIMEOUT_SECONDS}s (saw ${_count:-0}/${CLUSTER_COUNT}); aborting scenario" + SCENARIO_VALID=false + debug_dump "READY-BARRIER ABORT (saw ${_count:-0}/${CLUSTER_COUNT})" + write_timing_file + exit 0 +fi + +# ----------------------------------------------------------------------------- +# Scenario dispatch +# ----------------------------------------------------------------------------- +OP_INDEX=0 +WALL_DEADLINE=$(( STARTED_EPOCH + EXPECTED_DURATION_SECONDS )) + +run_scale_phase() { + log "scale phase: ${NODE_CHURN_CYCLES} cycles × (up by ${NODE_CHURN_DELTA}, down by ${NODE_CHURN_DELTA})" + local _cur="$ORIGINAL_NODE_COUNT" + for _c in $(seq 1 "$NODE_CHURN_CYCLES"); do + # Circuit breaker — stop if a previous op tripped it. + if [ "$CIRCUIT_BROKEN" = true ]; then + log "scale phase: circuit broken; skipping remaining cycles" + break + fi + # ---- scale UP ---- + local _target=$(( _cur + NODE_CHURN_DELTA )) + OP_INDEX=$(( OP_INDEX + 1 )) + log "cycle ${_c}/${NODE_CHURN_CYCLES} op#${OP_INDEX} scale_up: ${_cur} → ${_target}" + # Build 67170 lesson: `az aks nodepool scale` returns sync to the CLI + # but the underlying managed-cluster RP operation continues async. + # Issuing the next nodepool scale while provisioningState=Updating + # triggers OperationNotAllowed. Always wait for Succeeded first. + if ! 
wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "scale phase: provisioningState != Succeeded before scale_up op#${OP_INDEX}; aborting cycle" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before scale_up op#${OP_INDEX}" + break + fi + local _pre_state + _pre_state=$(snapshot_node_state) + local _t0=$(date +%s) + local _err="" + local _ok=true + if ! az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$_target" \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + # OperationNotAllowed / throttling — structural error, trip circuit breaker. + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "scale phase: structural Azure RP error on scale_up; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on scale_up op#${OP_INDEX} (Azure RP structural error)" + fi + fi + local _t1=$(date +%s) + local _ncount + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + local _post_state + _post_state=$(snapshot_node_state) + record_op "$OP_INDEX" "scale_up" "$_t0" "$_t1" "$_ok" "$_ncount" "$_pre_state" "$_post_state" "$_err" + [ "$_ok" = true ] && _cur="$_target" + sleep "$NODE_CHURN_SETTLE_SECONDS" + + if [ "$CIRCUIT_BROKEN" = true ]; then + break + fi + # ---- scale DOWN ---- + _target=$(( _cur - NODE_CHURN_DELTA )) + if [ "$_target" -lt 1 ]; then _target=1; fi + OP_INDEX=$(( OP_INDEX + 1 )) + log "cycle ${_c}/${NODE_CHURN_CYCLES} op#${OP_INDEX} scale_down: ${_cur} → ${_target}" + if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "scale phase: provisioningState != Succeeded before scale_down op#${OP_INDEX}; aborting cycle" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before scale_down op#${OP_INDEX}" + break + fi + _pre_state=$(snapshot_node_state) + _t0=$(date +%s) + _err="" + _ok=true + if ! az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$_target" \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "scale phase: structural Azure RP error on scale_down; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on scale_down op#${OP_INDEX} (Azure RP structural error)" + fi + fi + _t1=$(date +%s) + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + _post_state=$(snapshot_node_state) + record_op "$OP_INDEX" "scale_down" "$_t0" "$_t1" "$_ok" "$_ncount" "$_pre_state" "$_post_state" "$_err" + [ "$_ok" = true ] && _cur="$_target" + sleep "$NODE_CHURN_SETTLE_SECONDS" + done + log "scale phase: complete (ended at cycle current_count=${_cur})" +} + +run_replace_phase() { + log "replace phase: drain + delete ${NODE_REPLACE_BATCH_SIZE} VMSS instance(s); AKS auto-refills" + if [ -z "$KUBECTL" ]; then + err "replace phase: kubectl unavailable; skipping (cannot drain)" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (KUBECTL unset)" + return + fi + + # ---- 1. 
Pre-snapshot state (IPs + node names) + pick K nodes ---- + # Both ips AND names are recorded so post-run analysis can use whichever + # signal is appropriate. Build 67155 showed IPs are unreliable (Azure + # reuses freed private IPs); node names (VMSS instance suffix) are the + # authoritative replacement marker. + local _pre_state + _pre_state=$(snapshot_node_state) + local _kubeconfig + _kubeconfig=$(resolve_target_kubeconfig) + if [ -z "$_kubeconfig" ]; then + err "replace phase: could not resolve a usable kubeconfig path; aborting" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (no usable kubeconfig)" + return + fi + + # Pick K target VMSS instance ids via the VMSS-providerID filter + # (label-key independent, build 67126 lesson). + local _node_iid_lines + _node_iid_lines=$(target_nodes_in_target_vmss) + if [ -z "$_node_iid_lines" ]; then + err "replace phase: 0 nodes match VMSS=${TARGET_VMSS}; aborting" + # Dump raw kubectl output so postmortem can see WHY (label drift, + # providerID format change, auth blip). + { + echo "===== REPLACE-PHASE no-nodes diagnostic =====" + echo "expected VMSS=${TARGET_VMSS}" + echo "kubeconfig=${_kubeconfig}" + echo "-- kubectl get nodes -o wide (raw, no label filter) --" + KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -o wide 2>&1 | head -50 || true + echo "-- kubectl get nodes -o jsonpath providerID dump --" + KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + get nodes -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.spec.providerID}{"\n"}{end}' 2>&1 \ + | head -50 || true + } >> "$DEBUG_LOG" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (0 nodes match VMSS=${TARGET_VMSS})" + return + fi + + # Shuffle and take first K. + local _selected + if command -v shuf >/dev/null 2>&1; then + _selected=$(echo "$_node_iid_lines" | shuf | head -n "$NODE_REPLACE_BATCH_SIZE") + else + _selected=$(echo "$_node_iid_lines" \ + | awk 'BEGIN{srand()} {print rand()" "$0}' \ + | sort -k1,1n | head -n "$NODE_REPLACE_BATCH_SIZE" | cut -d" " -f2-) + fi + local _selected_count + _selected_count=$(echo "$_selected" | wc -l | tr -d ' ') + log "replace phase: selected ${_selected_count} nodes for replacement" + echo "$_selected" | awk '{print " - "$1" (vmss-instance "$2")"}' + + # ---- 2. Drain selected nodes (one Op record per drain) ---- + local _instance_ids_csv="" + while IFS= read -r _line; do + [ -z "$_line" ] && continue + local _node_name="${_line%% *}" + local _instance_id="${_line##* }" + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_drain: ${_node_name} (vmss-instance ${_instance_id})" + local _t0=$(date +%s) + local _err="" + local _ok=true + # Cordon first (idempotent + cheap), then drain. timeout caps per-node + # so a stuck PDB doesn't block the whole batch. + KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + cordon "$_node_name" >/dev/null 2>&1 || true + if ! KUBECONFIG="$_kubeconfig" "$KUBECTL" --context "$TARGET_CLUSTER_NAME" \ + drain "$_node_name" --ignore-daemonsets --delete-emptydir-data --force \ + --grace-period=30 \ + --timeout="${NODE_REPLACE_DRAIN_TIMEOUT_SECONDS}s" 2>/tmp/node-churner-drain.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-drain.err | head -c 500) + _ok=false + # Drain failure isn't fatal — AKS will still drain the node when we + # delete the VMSS instance underneath. Record and continue. 
+ log "replace phase: drain ${_node_name} returned non-zero; continuing (VMSS delete will force)" + fi + local _t1=$(date +%s) + record_op "$OP_INDEX" "replace_drain" "$_t0" "$_t1" "$_ok" 0 '{}' '{}' "$_err" + if [ -n "$_instance_ids_csv" ]; then + _instance_ids_csv="${_instance_ids_csv} ${_instance_id}" + else + _instance_ids_csv="${_instance_id}" + fi + done <<< "$_selected" + + if [ "$CIRCUIT_BROKEN" = true ]; then + log "replace phase: circuit broken before VMSS delete" + return + fi + if [ -z "$_instance_ids_csv" ]; then + err "replace phase: no instance IDs collected; aborting" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "REPLACE-PHASE aborted (no instance ids after drain loop)" + return + fi + + # ---- 3. Delete selected VMSS instances in a single batched call ---- + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_delete: deleting VMSS instances [${_instance_ids_csv}]" + # Wait for AKS to settle before issuing the next RP op (build 67170 race fix). + if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "replace phase: provisioningState != Succeeded before replace_delete; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before replace_delete op#${OP_INDEX}" + return + fi + local _t0=$(date +%s) + local _err="" + local _ok=true + # shellcheck disable=SC2086 # word splitting intentional for instance ids + if ! az vmss delete-instances \ + --resource-group "$NODE_RESOURCE_GROUP" \ + --name "$TARGET_VMSS" \ + --instance-ids ${_instance_ids_csv} \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "replace phase: structural Azure RP error on vmss delete-instances; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on replace_delete op#${OP_INDEX} (Azure RP structural error)" + fi + fi + local _t1=$(date +%s) + local _ncount + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + record_op "$OP_INDEX" "replace_delete" "$_t0" "$_t1" "$_ok" "$_ncount" '{}' '{}' "$_err" + + if [ "$CIRCUIT_BROKEN" = true ]; then return; fi + + # ---- 4. Explicit refill via AKS nodepool scale ---- + # Build 67133 lesson: `az vmss delete-instances` drops VMSS capacity by K, + # and AKS observes the drop (nodepool count goes from N to N-K) but does + # NOT auto-refill back to N. The finalizer's `az aks nodepool scale + # --node-count $ORIGINAL` succeeded → so the explicit re-scale IS the + # correct primitive. Run it here as a dedicated op so the timing JSON + # records the refill latency separately from the kubelet-Ready wait. + # + # AKS-side refill picks up the next available VMSS instance ID and + # provisions a brand-new VM with a brand-new InternalIP — exactly the + # IP-churn signal the spec asks for. + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_refill: az aks nodepool scale → ${ORIGINAL_NODE_COUNT} (re-add ${NODE_REPLACE_BATCH_SIZE} replacement(s))" + if ! wait_vmss_succeeded "$NODE_CHURN_OP_TIMEOUT_SECONDS"; then + err "replace phase: provisioningState != Succeeded before replace_refill; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "PRE-OP wait_vmss_succeeded timeout before replace_refill op#${OP_INDEX}" + return + fi + _t0=$(date +%s) + _err="" + _ok=true + if ! 
az aks nodepool scale \ + --cluster-name "$TARGET_CLUSTER_NAME" \ + --resource-group "$TARGET_RESOURCE_GROUP" \ + --name "$TARGET_NODEPOOL" \ + --node-count "$ORIGINAL_NODE_COUNT" \ + --only-show-errors 2>/tmp/node-churner-az.err; then + _err=$(tr '\n' ' ' < /tmp/node-churner-az.err | head -c 500) + _ok=false + if echo "$_err" | grep -qiE 'OperationNotAllowed|TooManyRequests|429|conflict'; then + err "replace phase: structural Azure RP error on replace_refill; tripping circuit breaker" + CIRCUIT_BROKEN=true + SCENARIO_VALID=false + debug_dump "CIRCUIT-BROKEN on replace_refill op#${OP_INDEX} (Azure RP structural error)" + fi + fi + _t1=$(date +%s) + _ncount=$(observe_node_count) + [ -z "$_ncount" ] && _ncount=0 + record_op "$OP_INDEX" "replace_refill" "$_t0" "$_t1" "$_ok" "$_ncount" '{}' '{}' "$_err" + + if [ "$CIRCUIT_BROKEN" = true ]; then return; fi + + # ---- 5. Wait for K8s Ready node count to return to ORIGINAL ---- + # AKS nodepool scale returns when Azure provisioning is complete, but + # kubelet on the new VM still needs to register + reach Ready. Poll + # kubectl until Ready count == ORIGINAL (not just VMSS provisioningState). + OP_INDEX=$(( OP_INDEX + 1 )) + log "op#${OP_INDEX} replace_wait: waiting for ${ORIGINAL_NODE_COUNT} Ready nodes in pool" + _t0=$(date +%s) + _err="" + _ok=false + local _wait_deadline=$(( _t0 + NODE_REPLACE_WAIT_TIMEOUT_SECONDS )) + local _ready_count=0 + while [ "$(date +%s)" -lt "$_wait_deadline" ]; do + # Count Ready nodes whose providerID is in our target VMSS (label- + # selector-agnostic; build 67126 regression fix). + local _ready_json + _ready_json=$(target_kubectl_get_nodes_json 2>/dev/null) + if [ -n "$_ready_json" ]; then + _ready_count=$(echo "$_ready_json" | jq -r --arg vmss "$TARGET_VMSS" ' + [ .items[] + | select(.spec.providerID | contains("/virtualMachineScaleSets/" + $vmss + "/virtualMachines/")) + | .status.conditions[] + | select(.type=="Ready" and .status=="True") ] | length' 2>/dev/null || echo 0) + else + _ready_count=0 + fi + if [ "$_ready_count" -ge "$ORIGINAL_NODE_COUNT" ]; then + _ok=true + break + fi + sleep 10 + done + _t1=$(date +%s) + local _post_state + _post_state=$(snapshot_node_state) + if [ "$_ok" != true ]; then + _err="replace_wait: timeout after ${NODE_REPLACE_WAIT_TIMEOUT_SECONDS}s; ready=${_ready_count}/${ORIGINAL_NODE_COUNT}" + err "$_err" + SCENARIO_VALID=false + debug_dump "REPLACE_WAIT timeout (ready=${_ready_count}/${ORIGINAL_NODE_COUNT})" + fi + record_op "$OP_INDEX" "replace_wait" "$_t0" "$_t1" "$_ok" "$_ready_count" "$_pre_state" "$_post_state" "$_err" + # Pull new_node_count from the just-recorded op for the summary log line. 
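+  # (record_op just appended that op to OPS_JSON; jq '.[-1]' reads it back.)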
+ local _new_node_count _new_ip_count + _new_node_count=$(echo "$OPS_JSON" | jq -r '.[-1].new_node_count') + _new_ip_count=$(echo "$OPS_JSON" | jq -r '.[-1].new_ip_count') + log "replace phase: complete (new_node_count=${_new_node_count} [authoritative], new_ip_count=${_new_ip_count} [informational; Azure may reuse freed IPs])" +} + +case "$SCENARIO" in + node-churn-scale) + run_scale_phase + ;; + node-churn-replace) + run_replace_phase + ;; + node-churn-combined) + run_scale_phase + if [ "$CIRCUIT_BROKEN" != true ]; then + log "transitioning from scale phase to replace phase" + sleep "$NODE_CHURN_SETTLE_SECONDS" + run_replace_phase + else + log "scale phase circuit-broken; skipping replace phase" + fi + ;; + *) + err "unknown scenario '${SCENARIO}'; expected node-churn-{scale,replace,combined}" + SCENARIO_VALID=false + ;; +esac + +# Truncation check: did we run past CL2's sleep window? +if [ "$(date +%s)" -gt "$WALL_DEADLINE" ]; then + log "WARN: churner ran past CL2 sleep window (${EXPECTED_DURATION_SECONDS}s); peer measurements may be truncated" + TRUNCATED=true +fi + +write_timing_file +log "scenario complete; finalizer will run via EXIT trap" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-combined.yaml b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-combined.yaml new file mode 100644 index 0000000000..7b4a1f8ea1 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-combined.yaml @@ -0,0 +1,330 @@ +name: clustermesh-pod-churn-combined + +# Combined Phase 4a config — single CL2 invocation runs scale-cycle then +# kill against the SAME workload deployment. Goal: extract maximum signal +# per (expensive) n20 provision/destroy lifecycle by exercising both +# stressor flavors of Scenario #2 back-to-back. +# +# Sequence: +# 1. Start measurements (control-plane, cilium, clustermesh-{metrics, +# throughput}, etcd-metrics, pod-churn-stress). +# 2. Deploy PodMonitor. +# 3. Create workload at full replicas + WaitForControlledPodsRunning gate. +# 4. PHASE A — Scale-cycle stress (deterministic): +# $churnCycles iterations of (scale-down 0 → sleep down → scale-up N +# → sleep up). No per-cycle wait; let it churn freely. +# 5. Intermediate WaitForControlledPodsRunning gather + brief settle. +# 6. PHASE B — Kill stress (stochastic): Method: Exec runs +# pod-churn-killer.sh inside the CL2 docker container, deleting +# $killBatch random workload pods every $killIntervalSeconds for +# $killDurationSeconds. ReplicaSet re-creates them, driving the +# failure-driven event path. If kubectl is unavailable in the CL2 +# image (Method: Exec dependency), this measurement returns 127 and +# CL2 marks it failed but the surrounding settle/gather/teardown +# steps still run, preserving Phase A scale-cycle data. +# 7. Final WaitForControlledPodsRunning gather + settle. +# 8. Gather measurements (all modules above). +# 9. Teardown (workload + PodMonitor). +# +# Knob values come from the same CL2_* overrides scale.py writes for the +# split scale/kill scenarios, so the existing matrix-var plumbing in +# steps/engine/clusterloader2/clustermesh-scale/execute.yml works without +# modification. 
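+#
+# Illustrative override shape (hypothetical values; the keys are the
+# DefaultParam lookups declared below):
+#   CL2_CHURN_CYCLES: 5
+#   CL2_CHURN_UP_DURATION: "60s"
+#   CL2_KILL_DURATION_SECONDS: 600
+#   CL2_KILL_INTERVAL_SECONDS: 10
+#   CL2_KILL_BATCH: 5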
+ +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} +{{$churnCycles := DefaultParam .CL2_CHURN_CYCLES 5}} +{{$churnUpDuration := DefaultParam .CL2_CHURN_UP_DURATION "60s"}} +{{$churnDownDuration := DefaultParam .CL2_CHURN_DOWN_DURATION "60s"}} +{{$killDurationSeconds := DefaultParam .CL2_KILL_DURATION_SECONDS 600}} +{{$killIntervalSeconds := DefaultParam .CL2_KILL_INTERVAL_SECONDS 10}} +{{$killBatch := DefaultParam .CL2_KILL_BATCH 5}} +# Method: Exec timeout — must exceed kill duration with margin so the +# loop's deadline check fires before this hard cap. Set to 1.5x kill +# duration as defense-in-depth. +{{$killExecTimeout := DefaultParam .CL2_KILL_EXEC_TIMEOUT "15m"}} + +{{$workloadGroup := "clustermesh-pod-churn-combined"}} +{{$workloadBasename := "pcc"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-pcc + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics (e.g. cilium_clustermesh_global_services) + # are structurally 0 regardless of pod churn. See plan.md note #14 + ACNS + # team confirmation 2026-05-11 (David Vadas / Isaiah Raya). Runs FIRST so + # the annotation is in place before any CiliumIdentity / Endpoint forms. 
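+  #
+  # Per namespace, the script's operation is roughly equivalent to:
+  #   kubectl annotate namespace clustermesh-pcc-1 \
+  #     clustermesh.cilium.io/global=true --overwrite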
+ - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-pcc" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking pod-churn-combined Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-combined-initial + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial pod-churn-combined pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-combined-initial + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before phase A + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- PHASE A: scale-cycle stress ----- + - name: Start tracking pod-churn scale-cycle phase + measurements: + - Identifier: WaitForControlledPodsRunning-phase-a + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + {{range $i := Loop $churnCycles}} + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Phase A cycle {{$i}} — down hold + measurements: + - Identifier: PhaseADownSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnDownDuration}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Phase A cycle {{$i}} — up hold + measurements: + - Identifier: PhaseAUpSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnUpDuration}} + {{end}} + + - name: Wait for post-scale-cycle pods to be Running + measurements: + - Identifier: 
WaitForControlledPodsRunning-phase-a + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Brief settle between Phase A and Phase B + measurements: + - Identifier: InterPhaseSleep + Method: Sleep + Params: + duration: 30s + + # ----- PHASE B: kill stress via Method: Exec ----- + # Method: Exec runs the killer script inside the CL2 docker container. + # The container has /root/.kube/config (the per-cluster kubeconfig) + # mounted by run_cl2_command. The script uses kubectl from $PATH in + # the CL2 image; if missing it exits 127, this measurement is marked + # failed, but subsequent steps (settle, gather, teardown) still run. + - name: Phase B pod-churn kill loop + measurements: + - Identifier: PodChurnKillLoop + Method: Exec + Params: + streamOutput: true + timeout: {{$killExecTimeout}} + command: + - bash + - /root/perf-tests/clusterloader2/config/pod-churn-killer.sh + - "{{$killDurationSeconds}}" + - "{{$killIntervalSeconds}}" + - "{{$killBatch}}" + - "{{$workloadGroup}}" + + # ----- Final convergence ----- + - name: Start tracking post-kill convergence + measurements: + - Identifier: WaitForControlledPodsRunning-post-combined + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - name: Wait for post-kill pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-post-combined + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after combined churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-kill.yaml b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-kill.yaml new file mode 100644 index 0000000000..7055652793 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-kill.yaml @@ -0,0 +1,308 @@ +name: clustermesh-pod-churn-kill + +# Scale scenario #2 (Pod Churn Stress Test) — random pod kill variant. +# +# Spec (scale testing.txt line 64): "Kill pods at random intervals." +# +# This complements pod-churn-scale.yaml: instead of cycling Deployment .spec.replicas +# (deterministic, controller-driven churn), we deploy an in-cluster killer Job +# that picks $killBatch random pods every $killInterval and force-deletes them. 
+# The ReplicaSet immediately re-creates them, exercising the failure-driven +# event path. Both halves of scenario #2 produce overlapping but +# distinguishable mesh signals: scale-cycle is steady-state, predictable; +# kill is bursty, ReplicaSet-driven. +# +# Killer Job runs for ${killDuration}s then exits 0 cleanly. The Job's +# activeDeadlineSeconds is set to killDuration + 60s buffer as a defense-in-depth +# bound. WaitForFinishedJobs gathers the completion signal — no explicit +# delete-and-wait dance. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} +{{$killDuration := DefaultParam .CL2_KILL_DURATION "10m"}} +{{$killIntervalSeconds := DefaultParam .CL2_KILL_INTERVAL_SECONDS 10}} +{{$killBatch := DefaultParam .CL2_KILL_BATCH 5}} +{{$killDurationSeconds := DefaultParam .CL2_KILL_DURATION_SECONDS 600}} +{{$jobDeadlineSeconds := DefaultParam .CL2_KILL_JOB_DEADLINE_SECONDS 660}} +# Hard-coded — repeated below for the killer's --label-selector and the +# workload's group label. Keep these in sync. +{{$workloadGroup := "clustermesh-pod-churn-kill"}} +{{$killerGroup := "clustermesh-pod-churn-killer"}} +{{$workloadBasename := "pck"}} +# bitnami/kubectl image already trusted in this repo (modules/kustomize/fio/.../ds.yaml). +# Ships bash + shuf + xargs + cut + kubectl which the killer script depends on. +{{$killerImage := DefaultParam .CL2_KILLER_IMAGE "telescope.azurecr.io/bitnami/kubectl:v1.33.2"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-pck + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics are structurally 0. See plan.md + # note #14 + ACNS team confirmation 2026-05-11. 
+ - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-pck" + + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload deploy + initial settle ----- + - name: Start tracking pod-churn-kill Deployments + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-kill + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Wait for initial pod-churn-kill pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-kill + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Warmup before kill + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- Killer deploy ----- + # Distinct basenames per kind so the binding's RoleName/SAName references + # are unambiguous and don't depend on CL2's cross-kind name-collision + # behavior. All four objects share namespace `default` (universal), + # replicasPerNamespace: 1. + - name: Register WaitForFinishedJobs for killer + measurements: + - Identifier: WaitForFinishedJobs-killer + Method: WaitForFinishedJobs + Params: + action: start + labelSelector: group={{$killerGroup}} + # Killer's activeDeadlineSeconds bounds the Job's lifetime; + # this WaitForFinishedJobs timeout has to exceed that with margin + # so the gather doesn't time out while the killer is still inside + # its grace period. 
+ timeout: {{$operationTimeout}} + + - name: Deploy pod-churn killer + phases: + - namespaceList: ["default"] + replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: pck-sa + objectTemplatePath: /modules/pod-churn-killer-sa.yaml + templateFillMap: + Group: {{$killerGroup}} + - basename: pck-cr + objectTemplatePath: /modules/pod-churn-killer-clusterrole.yaml + templateFillMap: + Group: {{$killerGroup}} + - basename: pck-crb + objectTemplatePath: /modules/pod-churn-killer-clusterrolebinding.yaml + templateFillMap: + Group: {{$killerGroup}} + RoleName: pck-cr-1 + SAName: pck-sa-1 + SANamespace: default + - basename: pck-job + objectTemplatePath: /modules/pod-churn-killer-job.yaml + templateFillMap: + Group: {{$killerGroup}} + SAName: pck-sa-1 + Image: {{$killerImage}} + ActiveDeadlineSeconds: {{$jobDeadlineSeconds}} + KillDurationSeconds: {{$killDurationSeconds}} + KillIntervalSeconds: {{$killIntervalSeconds}} + KillBatch: {{$killBatch}} + WorkloadLabelSelector: group={{$workloadGroup}} + + # ----- Wait for the killer to finish its own time-bounded run ----- + # WaitForFinishedJobs blocks until the killer pod's status is Succeeded + # (clean exit 0 on deadline) or Failed (image pull error / RBAC denial / + # script crash). Either way, control returns here and we proceed to + # final reconciliation. We don't explicitly delete the Job — the + # Sleep + WaitForFinishedJobs is the gate. + - name: Wait for killer Job to complete + measurements: + - Identifier: WaitForFinishedJobs-killer + Method: WaitForFinishedJobs + Params: + action: gather + + # ----- Re-register a fresh watcher for the post-kill convergence so the + # final gather only reflects pod reconciliation after the killer stopped. ----- + - name: Start tracking post-kill convergence + measurements: + - Identifier: WaitForControlledPodsRunning-post-kill + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = {{$workloadGroup}} + operationTimeout: {{$operationTimeout}} + + - name: Wait for post-kill pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-post-kill + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Settle after kill + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$workloadGroup}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown: workload + killer (SA/CR/CRB/Job objects). 
----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$workloadGroup}} + basename: {{$workloadBasename}} + + - name: Tear down killer resources + phases: + - namespaceList: ["default"] + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: pck-sa + objectTemplatePath: /modules/pod-churn-killer-sa.yaml + - basename: pck-cr + objectTemplatePath: /modules/pod-churn-killer-clusterrole.yaml + - basename: pck-crb + objectTemplatePath: /modules/pod-churn-killer-clusterrolebinding.yaml + - basename: pck-job + objectTemplatePath: /modules/pod-churn-killer-job.yaml + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-killer.sh b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-killer.sh new file mode 100755 index 0000000000..2268f8e126 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-killer.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# Pod-churn killer loop — runs from inside the CL2 docker container +# (invoked via Method: Exec from pod-churn-combined.yaml). +# +# Why this lives here instead of as an in-cluster Job: the in-cluster Job +# approach requires pulling a kubectl image (e.g. bitnami/kubectl) onto +# every AKS cluster, which needs AcrPull or a public-registry-friendly +# CSSC-compliant image — neither is currently configured in the +# clustermesh-scale tfvars. The CL2 container already has the kubeconfig +# mounted at /root/.kube/config and (per Telescope's +# job_controller/config/ray/config.yaml precedent) supports `Method: Exec` +# with `bash`. We run kubectl from here against the same kubeconfig CL2 +# uses — no extra image pull, no extra RBAC. Plan 4a runs this against +# one cluster per per-cluster CL2 instance (execute-parallel handles +# fan-out). +# +# Positional args (passed via Method: Exec command list): +# $1 KILL_DURATION_SECONDS Total runtime in seconds. +# $2 KILL_INTERVAL_SECONDS Seconds between successive kill rounds. +# $3 KILL_BATCH Pods deleted per round. +# $4 WORKLOAD_GROUP Label-selector group value. +# +# Exits 0 on successful completion of the time-bounded loop. Exits 127 +# if kubectl is unavailable in this CL2 image (Method: Exec marks the +# measurement failed; the surrounding combined.yaml still completes the +# settle + gather steps so scale-phase data is preserved). + +set -u +set -o pipefail + +KILL_DURATION_SECONDS="${1:-600}" +KILL_INTERVAL_SECONDS="${2:-10}" +KILL_BATCH="${3:-5}" +WORKLOAD_GROUP="${4:-clustermesh-pod-churn}" +LABEL_SELECTOR="group=${WORKLOAD_GROUP}" + +if ! command -v kubectl >/dev/null 2>&1; then + # Fallback: the pipeline's execute.yml pre-stages kubectl into the + # cl2_config_dir (which is bind-mounted at /root/perf-tests/clusterloader2/config + # by run_cl2_command). If neither PATH kubectl nor the pre-staged binary + # is available, fail with a clear diagnostic. 
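+    # (Resolution order sketch, assuming the bind-mount layout described
+    # in the header: a PATH hit wins outright; otherwise the pre-staged
+    # binary's directory is prepended to PATH so every later bare
+    # `kubectl` call in this script resolves to it; otherwise exit 127.)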
+ PREBAKED_KUBECTL=/root/perf-tests/clusterloader2/config/kubectl + if [ -x "${PREBAKED_KUBECTL}" ]; then + KUBECTL_BIN_DIR="$(dirname "${PREBAKED_KUBECTL}")" + export PATH="${KUBECTL_BIN_DIR}:${PATH}" + echo "killer: using pre-staged kubectl at ${PREBAKED_KUBECTL}" + else + echo "killer ERROR: kubectl not in PATH inside CL2 container; "\ + "pre-staged binary at ${PREBAKED_KUBECTL} is also missing — "\ + "verify execute.yml pre-stage step ran successfully" + echo "killer ERROR: PATH=$PATH" + exit 127 + fi +fi + +KUBECTL_CLIENT_INFO="$(kubectl version --client=true --output=yaml 2>&1 | head -3 || true)" +echo "killer: kubectl client info:" +echo "${KUBECTL_CLIENT_INFO}" +echo "killer: starting (duration=${KILL_DURATION_SECONDS}s interval=${KILL_INTERVAL_SECONDS}s batch=${KILL_BATCH} selector=${LABEL_SELECTOR})" + +# shuf is GNU coreutils; not guaranteed in every image base. Fall back to +# awk-with-srand when missing — awk is part of POSIX and always available. +HAS_SHUF=0 +if command -v shuf >/dev/null 2>&1; then + HAS_SHUF=1 +fi + +random_pick() { + # Reads "ns/name" lines on stdin, prints up to $1 random lines. + local n="$1" + if [ "${HAS_SHUF}" -eq 1 ]; then + shuf | head -n "$n" + else + awk -v n="$n" 'BEGIN{srand()} {print rand()" "$0}' | sort -k1,1n | head -n "$n" | cut -d" " -f2- + fi +} + +END_EPOCH=$(( $(date +%s) + KILL_DURATION_SECONDS )) +ROUND=0 +KILLED_TOTAL=0 + +while [ "$(date +%s)" -lt "${END_EPOCH}" ]; do + ROUND=$((ROUND + 1)) + + CANDIDATES="$(kubectl get pods -A -l "${LABEL_SELECTOR}" \ + -o 'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' 2>/dev/null || true)" + + if [ -z "${CANDIDATES}" ]; then + echo "killer: round=${ROUND} no candidates matched selector ${LABEL_SELECTOR}" + else + TARGETS="$(printf '%s\n' "${CANDIDATES}" | random_pick "${KILL_BATCH}")" + ROUND_KILLED=0 + while IFS= read -r nsname; do + [ -z "${nsname}" ] && continue + ns="${nsname%%/*}" + name="${nsname##*/}" + # --grace-period=0 + --force: immediate evict, no graceful shutdown + # wait. Simulates a "node failure"-style event for the pod-event + # propagation path. --ignore-not-found tolerates the inherent race + # where ReplicaSet has not yet replaced previous round's kills. + if kubectl delete pod -n "${ns}" "${name}" \ + --grace-period=0 --force --ignore-not-found \ + > /dev/null 2>&1; then + ROUND_KILLED=$((ROUND_KILLED + 1)) + fi + done <<< "${TARGETS}" + KILLED_TOTAL=$((KILLED_TOTAL + ROUND_KILLED)) + echo "killer: round=${ROUND} killed=${ROUND_KILLED} cumulative=${KILLED_TOTAL}" + fi + + # Don't sleep past the deadline. + NOW="$(date +%s)" + REMAINING=$(( END_EPOCH - NOW )) + if [ "${REMAINING}" -le 0 ]; then + break + fi + SLEEP="${KILL_INTERVAL_SECONDS}" + if [ "${REMAINING}" -lt "${SLEEP}" ]; then + SLEEP="${REMAINING}" + fi + sleep "${SLEEP}" +done + +echo "killer: done duration=${KILL_DURATION_SECONDS}s rounds=${ROUND} cumulative=${KILLED_TOTAL}" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-scale.yaml b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-scale.yaml new file mode 100644 index 0000000000..de791616b8 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/pod-churn-scale.yaml @@ -0,0 +1,284 @@ +name: clustermesh-pod-churn-scale + +# Scale scenario #2 (Pod Churn Stress Test) — deterministic scale-cycle variant. +# +# Spec (scale testing.txt line 55-67): "Validate stability under high pod churn. +# Repeatedly scale deployments up/down. 
Track propagation latency, missed or +# delayed updates, CPU/memory growth over time." +# +# This scenario cycles each Deployment's .spec.replicas between $replicasPerDeployment +# and 0 for $churnCycles iterations, holding each end-state for $churnUpDuration / +# $churnDownDuration respectively. The cycle drives a steady-state stream of pod +# create/delete events without churning Deployment or Service objects (those stay +# present across all cycles), isolating the pod-event signal. +# +# Sequence: +# 1. Start measurements (control-plane, cilium, clustermesh-metrics, +# clustermesh-throughput, etcd-metrics, pod-churn-stress). +# 2. Deploy PodMonitor (clustermesh.yaml). +# 3. Initial workload apply at full replicas + WaitForControlledPodsRunning gate +# (proves the workload settled before churn begins). +# 4. Churn loop ($churnCycles iterations): +# a. Scale-down to replicas=0 (no wait — let it churn freely). +# b. Sleep $churnDownDuration. +# c. Scale-up to replicas=$replicasPerDeployment. +# d. Sleep $churnUpDuration. +# 5. Final scale-up (idempotent — guarantees known terminal state) + final +# WaitForControlledPodsRunning.gather for convergence. +# 6. Settle sleep ($holdDuration) — lets kvstore queues drain and slope queries +# observe the post-churn settle. +# 7. Gather measurements (mirror start order). +# 8. Teardown (delete workload + PodMonitor). + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} +{{$churnCycles := DefaultParam .CL2_CHURN_CYCLES 5}} +{{$churnUpDuration := DefaultParam .CL2_CHURN_UP_DURATION "60s"}} +{{$churnDownDuration := DefaultParam .CL2_CHURN_DOWN_DURATION "60s"}} + +{{$group := "clustermesh-pod-churn-scale"}} +{{$basename := "pcs"}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-pcs + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # AKS-managed Cilium ships with clustermesh-default-global-namespace=false, + # so workload namespaces need clustermesh.cilium.io/global=true to sync + # their CiliumIdentity/Endpoint/Services across the mesh. Without this, + # cross-cluster propagation metrics are structurally 0. See plan.md + # note #14 + ACNS team confirmation 2026-05-11. 
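+  # Per-namespace effect of the step below (sketch — prefix and annotation
+  # key as used in this file; --overwrite keeps re-runs idempotent):
+  #   kubectl annotate namespace clustermesh-pcs-1 \
+  #     clustermesh.cilium.io/global=true --overwrite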
+  - name: Annotate workload namespaces for ACNS cross-cluster sync
+    measurements:
+      - Identifier: AnnotateNamespacesForGlobalSync
+        Method: Exec
+        Params:
+          streamOutput: true
+          timeout: 1m
+          command:
+            - bash
+            - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh
+            - "{{$namespaces}}"
+            - "clustermesh-pcs"
+
+  # ----- Start measurements -----
+  - module:
+      path: /modules/measurements/control-plane.yaml
+      params:
+        action: start
+        group: {{$group}}
+
+  - module:
+      path: /modules/measurements/cilium.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/clustermesh-metrics.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/clustermesh-throughput.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/etcd-metrics.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/measurements/pod-churn-stress.yaml
+      params:
+        action: start
+
+  - module:
+      path: /modules/clustermesh.yaml
+      params:
+        actionName: create
+        tuningSet: DeploymentCreateQps
+
+  # ----- Initial workload create + settle -----
+  # Two watchers, each registered exactly once: this one gates the initial
+  # create (gathered as soon as the full-replica apply settles), and a
+  # second, registered just before the churn loop, is gathered once after
+  # the loop ends. No per-cycle WaitForControlledPodsRunning — per-cycle
+  # waits would block each cycle until pods settled, defeating the
+  # "rapid churn" intent of scenario #2.
+  - name: Start tracking pod-churn-scale Deployments
+    measurements:
+      - Identifier: WaitForControlledPodsRunning-pod-churn-scale
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: start
+          apiVersion: apps/v1
+          kind: Deployment
+          checkIfPodsAreUpdated: true
+          labelSelector: group = {{$group}}
+          operationTimeout: {{$operationTimeout}}
+
+  - module:
+      path: /modules/pod-churn-workload.yaml
+      params:
+        actionName: apply
+        replicas: {{$replicasPerDeployment}}
+        namespaces: {{$namespaces}}
+        deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+        tuningSet: DeploymentCreateQps
+        group: {{$group}}
+        basename: {{$basename}}
+
+  - name: Wait for initial pod-churn pods to be Running
+    measurements:
+      - Identifier: WaitForControlledPodsRunning-pod-churn-scale
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: gather
+
+  # ----- Warmup before churn -----
+  - name: Warmup before churn
+    measurements:
+      - Identifier: WarmupSleep
+        Method: Sleep
+        Params:
+          duration: {{$warmupDuration}}
+
+  # ----- Re-register a fresh watcher for the churn window so the final gather
+  #       only reflects the churn loop's outcome, not the initial create. -----
+  - name: Start tracking pod-churn loop
+    measurements:
+      - Identifier: WaitForControlledPodsRunning-pod-churn-loop
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: start
+          apiVersion: apps/v1
+          kind: Deployment
+          checkIfPodsAreUpdated: true
+          labelSelector: group = {{$group}}
+          operationTimeout: {{$operationTimeout}}
+
+  # ----- Churn loop -----
+  # CL2's `Loop $N` template func yields 0..N-1; we emit $churnCycles pairs of
+  # scale-down → sleep → scale-up → sleep. No per-cycle WaitForControlledPodsRunning:
+  # we WANT the system in flux during this window so the measurements observe
+  # sustained churn rather than per-cycle settle-and-spike.
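+  # Worked window, assuming this file's defaults (churnCycles=5,
+  # churnUpDuration=churnDownDuration=60s): the loop below emits 5
+  # down/up pairs ≈ 5 × (60s + 60s) = 600s of sustained flux, plus the
+  # apply time of each scale call at DeploymentCreateQps.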
+ {{range $i := Loop $churnCycles}} + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Pod-churn cycle {{$i}} — down hold + measurements: + - Identifier: ChurnCycleDownSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnDownDuration}} + + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: apply + replicas: {{$replicasPerDeployment}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - name: Pod-churn cycle {{$i}} — up hold + measurements: + - Identifier: ChurnCycleUpSleep-{{$i}} + Method: Sleep + Params: + duration: {{$churnUpDuration}} + {{end}} + + # ----- Final convergence: end the churn window at a known terminal state. ----- + - name: Wait for post-churn pods to be Running + measurements: + - Identifier: WaitForControlledPodsRunning-pod-churn-loop + Method: WaitForControlledPodsRunning + Params: + action: gather + + # ----- Settle: let kvstore queues drain post-churn ----- + - name: Settle after churn + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: {{$group}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: gather + + # ----- Teardown: drop Deployments + Services. ----- + - module: + path: /modules/pod-churn-workload.yaml + params: + actionName: delete + replicas: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: DeploymentCreateQps + group: {{$group}} + basename: {{$basename}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/upper-bound.yaml b/modules/python/clusterloader2/clustermesh-scale/config/upper-bound.yaml new file mode 100644 index 0000000000..3d7fa9e4d5 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/upper-bound.yaml @@ -0,0 +1,329 @@ +name: clustermesh-upper-bound + +# Scale scenario #6: Upper Bound / Saturation Testing. +# +# Goal (scale testing.txt line 103-114): Find system limits safely. +# - Increasing clusters → covered by the matrix (n2/n5/n10/n20 +# entries each run this same CL2 config). +# - Increasing events per → covered IN-RUN by ramping through N +# cluster "rungs" of progressively heavier load. +# - Record failure modes, → scale.py collect's saturation classifier +# not just thresholds tags each rung with the dominant signal +# ({clean, latency_spike, queue_unbounded, +# cpu_exhaust, mesh_failure_burst, +# etcd_tail}). See _emit_saturation_profile_rows. 
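+#   (Scale of the output, a sketch assuming the nN matrix entries denote
+#   cluster counts: with the default 5-rung ramp, an n5 entry yields
+#   5 clusters × 5 rungs = 25 per-rung verdict rows per run.)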
+#
+# Per-rung structure (single CL2 invocation per cluster runs the full
+# ramp; bounded sweep, not adaptive stress-to-fail — see the rubber-duck
+# review notes in plan.md's Scenario #6 section):
+#
+# For rung r in 0..N-1:
+#   1. Start measurements with suffix=Rung<r> (per-rung time window via
+#      CL2's %v placeholder; suffix namespaces the emitted JSONs so the
+#      Python collector can read them per-rung).
+#   2. Restart-burst the workload at TuningSet qps = qps_list[r], doing
+#      restarts_list[r] consecutive restart cycles. Each restart bumps a
+#      Deployment pod-template annotation, which triggers a rolling
+#      recreate of every replica → forces a flurry of endpoint/identity
+#      events through clustermesh-apiserver.
+#   3. Sleep rung_duration so the measurement window covers the burst
+#      AND the steady-state right after. CL2's gather queries (action:
+#      gather) substitute %v with the wall time since the matching
+#      action: start — so a longer rung_duration captures more of the
+#      post-burst tail.
+#   4. Gather measurements with suffix=Rung<r>.
+#   5. Sleep settle_duration before the next rung. The settle window is
+#      sized so kvstore queues from rung r drain before rung r+1 starts.
+#
+# After all rungs, delete the workload + PodMonitor.
+#
+# IMPORTANT design notes (don't change without re-reading rubber-duck
+# critique notes in plan.md):
+# - Single CL2 invocation per cluster, NOT N separate invocations. Keeps
+#   one Prometheus time-axis consistent across rungs; cross-rung
+#   comparison is cleaner; avoids 5× the workload-create-teardown cost.
+# - QPS alone doesn't drive kvstore events 1:1 — each rung also bumps
+#   `restartsPerRung` so cumulative events scale with rung index even
+#   when QPS saturates CL2's Deployment-apply rate. Both dials are
+#   driven by the matrix vars.
+# - The classifier verdict is computed at collect time from the per-rung
+#   measurement JSONs, NOT inside CL2. Raw signal values + thresholds +
+#   classifier_version are emitted alongside verdicts so dashboards can
+#   recompute verdicts post-hoc if thresholds need calibration.
+# - NOT share-infra-eligible in v1 — a tripped rung can leave queue/memory
+#   residue that would contaminate following scenarios. Standalone matrix
+#   entries only until baseline data justifies share-infra positioning.
+# - CL2's template engine has its OWN func map (see kubernetes/perf-tests
+#   clusterloader2/pkg/config/template_functions.go); sprig is NOT
+#   available. Use StringSplit, Loop, AddInt, MultiplyInt, SubtractInt,
+#   index, len. atoi is implicit — arithmetic funcs accept string args
+#   and parse via toFloat64.
+
+{{$namespaces := DefaultParam .CL2_NAMESPACES 5}}
+{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}}
+{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}}
+{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}}
+
+# Saturation knobs. SaturationQpsList is a comma-separated list of QPS
+# values, one per rung. SaturationRestartsList is the per-rung restart
+# count (length must match SaturationQpsList) — driven separately so
+# dashboards can distinguish "QPS axis" from "workload-amplitude axis".
+# Each rung lasts SaturationRungDurationSeconds + SaturationSettleSeconds.
+#
+# Defaults match scale.py's defaults so a forgotten matrix var falls
+# through to a 5-rung sweep at 100/500/1500/4000/10000 QPS with
+# 2/4/8/15/25 restarts per rung (5 rungs × (240s hold + 90s settle)
+# ≈ 28 min CL2 wall time per cluster).
Bumped 2026-05-15 after build +# 67224 showed all signals at 1-15% of thresholds at the prior 4-rung +# 20/40/80/160 sweep — actual saturation knee lies higher. +{{$saturationQpsListStr := DefaultParam .CL2_SATURATION_QPS_LIST "100,500,1500,4000,10000"}} +{{$saturationRestartsListStr := DefaultParam .CL2_SATURATION_RESTARTS_LIST "2,4,8,15,25"}} +{{$saturationRungDurationSeconds := DefaultParam .CL2_SATURATION_RUNG_DURATION_SECONDS 240}} +{{$saturationSettleSeconds := DefaultParam .CL2_SATURATION_SETTLE_SECONDS 90}} + +# Parse comma-separated strings into Go []string slices. StringSplit is +# CL2's built-in. The arithmetic funcs (AddInt, MultiplyInt, etc.) accept +# string args and parse them via toFloat64, so we can pass slice elements +# directly without an atoi step. +{{$qpsList := StringSplit $saturationQpsListStr}} +{{$restartsList := StringSplit $saturationRestartsListStr}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-ub + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + # Initial workload-create QPS is fixed at the first rung's QPS — every + # cluster brings the baseline workload up at the gentle rung-0 rate so + # the create-flurry doesn't itself trip saturation before the ramp + # starts. Saturation rungs use their own per-rung TuningSets defined + # below. + - name: WorkloadCreateQps + qpsLoad: + qps: {{index $qpsList 0}} + # One TuningSet per rung. CL2 template ranges over $qpsList and emits + # Rung0Qps, Rung1Qps, ... TuningSets that the workload module references + # by name via the matching $tuningSet param below. + {{range $i, $qps := $qpsList}} + - name: Rung{{$i}}Qps + qpsLoad: + qps: {{$qps}} + {{end}} + +steps: + # ----- ACNS namespace opt-in (CFP-39876) ----- + # Identical to event-throughput.yaml — required for cross-cluster sync + # to fire at all. See plan.md note #14. + - name: Annotate workload namespaces for ACNS cross-cluster sync + measurements: + - Identifier: AnnotateNamespacesForGlobalSync + Method: Exec + Params: + streamOutput: true + timeout: 1m + command: + - bash + - /root/perf-tests/clusterloader2/config/annotate-namespaces.sh + - "{{$namespaces}}" + - "clustermesh-ub" + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: WorkloadCreateQps + + # ----- Baseline workload create ----- + # Done OUTSIDE the rung loop so the create cost (which depends on + # cluster cold-start, image pulls, scheduling) isn't conflated with + # rung-0's restart-burst signal. After create, every rung exercises + # the same population of Deployments via restart bursts. + - module: + path: /modules/event-throughput-workload.yaml + params: + actionName: create + generation: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: WorkloadCreateQps + operationTimeout: {{$operationTimeout}} + phaseSuffix: Create + + # 30s pre-rung settle: lets the create-flurry's residual kvstore traffic + # drain before rung 0 starts measuring. Without this, rung 0's baseline + # carries spillover from the create burst and looks artificially loaded. 
+ - name: Pre-rung settle (drain create-flurry) + measurements: + - Identifier: PreRungSettle + Method: Sleep + Params: + duration: 30s + + # ----- Saturation rung loop ----- + # Each rung: start measurements with Rung suffix → restart-burst the + # workload restartsList[i] times at qpsList[i] QPS → sleep rung duration + # so the gather window captures both burst and tail → gather measurements + # → settle before next rung. + # + # Restart generations are offset per rung by 1000*(rung+1) so the + # pod-template annotation values are strictly monotonic across rungs + # (avoids a rollout being skipped because the same generation was used + # in a prior rung). + {{range $i, $qps := $qpsList}} + + # ===== Rung {{$i}} (qps={{$qps}}, restarts={{index $restartsList $i}}) ===== + - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: clustermesh-upper-bound-rung{{$i}} + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + suffix: Rung{{$i}} + + - module: + path: /modules/measurements/pod-churn-stress.yaml + params: + action: start + suffix: Rung{{$i}} + + # Rung {{$i}} workload: restart-burst the population N times. Each + # restart bumps the pod-template annotation to a unique generation so + # the rolling-recreate fires. Generation = 1000*(rung+1) + r so cross- + # rung values never collide. + {{range $r := Loop (index $restartsList $i)}} + - module: + path: /modules/event-throughput-workload.yaml + params: + actionName: restart + generation: {{AddInt (MultiplyInt 1000 (AddInt $i 1)) $r}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: Rung{{$i}}Qps + operationTimeout: {{$operationTimeout}} + phaseSuffix: Rung{{$i}}Restart{{$r}} + {{end}} + + # Rung-{{$i}} hold: keep the measurement window open after the burst so + # the gather queries capture peak + tail. CL2's %v in queries resolves + # to the wall time since the matching `start`, so this Sleep determines + # the measurement window width for rung {{$i}}. + - name: Rung {{$i}} hold (qps={{$qps}}, restarts={{index $restartsList $i}}) + measurements: + - Identifier: SaturationRung{{$i}}Hold + Method: Sleep + Params: + duration: {{$saturationRungDurationSeconds}}s + + # Gather rung-{{$i}} measurements. The suffix=Rung{{$i}} param threads + # through every GenericPrometheusQuery's Identifier and metricName so + # the emitted JSONs are uniquely named per rung. scale.py collect reads + # them back by matching the Rung suffix. 
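+  # (Example emitted filename for this rung — metric and test name from
+  # this config, timestamp illustrative:
+  #   "GenericPrometheusQuery ClusterMesh Kvstore Sync Queue Size Rung{{$i}}_clustermesh-upper-bound_<timestamp>.json")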
+  - module:
+      path: /modules/measurements/control-plane.yaml
+      params:
+        action: gather
+        group: clustermesh-upper-bound-rung{{$i}}
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/cilium.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/clustermesh-metrics.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/clustermesh-throughput.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/etcd-metrics.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  - module:
+      path: /modules/measurements/pod-churn-stress.yaml
+      params:
+        action: gather
+        suffix: Rung{{$i}}
+
+  # Inter-rung settle: drain kvstore queues from rung {{$i}} before the
+  # next rung starts. Without this, the next rung's baseline carries
+  # rung-{{$i}}'s spillover. The default 90s is enough at low rungs; at
+  # the highest rungs the spillover may exceed the settle window and the
+  # next rung's verdict will be biased "worse" — that's fine, it captures
+  # cumulative system stress correctly.
+  - name: Rung {{$i}} settle
+    measurements:
+      - Identifier: SaturationRung{{$i}}Settle
+        Method: Sleep
+        Params:
+          duration: {{$saturationSettleSeconds}}s
+
+  {{end}}
+  # ----- end of rung loop -----
+
+  # ----- Workload + PodMonitor teardown -----
+  # Use a generation strictly greater than any rung's max generation
+  # (1000 * (max_rung+1) + max_restart_in_that_rung) so the delete-time
+  # pod-template doesn't accidentally match a prior rung's template
+  # and skip the rolling cleanup. With the defaults (5 rungs, 25 restarts
+  # in the last rung) the max rung generation = 1000*5 + 24 = 5024; we
+  # use 999999 which is well above any plausible matrix-configured value.
+  - module:
+      path: /modules/event-throughput-workload.yaml
+      params:
+        actionName: delete
+        generation: 999999
+        namespaces: {{$namespaces}}
+        deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+        replicasPerDeployment: {{$replicasPerDeployment}}
+        tuningSet: WorkloadCreateQps
+        operationTimeout: {{$operationTimeout}}
+        phaseSuffix: Delete
+
+  - module:
+      path: /modules/clustermesh.yaml
+      params:
+        actionName: delete
+        tuningSet: WorkloadCreateQps
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/write-ready-sentinel.sh b/modules/python/clusterloader2/clustermesh-scale/config/write-ready-sentinel.sh
new file mode 100755
index 0000000000..a020aad9d6
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/write-ready-sentinel.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# CL2 ready-sentinel writer for Scenario #3 (Node Churn / IP Churn).
+#
+# Why a separate script and not inline `bash -c` in the CL2 yaml:
+# The first iteration used `command: [bash, -c, |]` in the CL2
+# Method:Exec block, with `CTX=$(kubectl config current-context)`. Build
+# 67114 showed `kubectl config current-context` returning EMPTY in the CL2
+# docker image's environment (verified by `Exec command output: wrote
+# sentinel ready-` — context suffix was empty). Both clusters then wrote
+# the SAME path (sentinels/ready-) and one overwrote the other → barrier
+# saw 1/2 sentinels → quorum never reached → scenario aborted.
+#
+# This script is mounted into the CL2 container at
+# /root/perf-tests/clusterloader2/config/write-ready-sentinel.sh by virtue
+# of being a sibling of pod-churn-killer.sh / annotate-namespaces.sh /
+# apiserver-failure-killer.sh (the CL2_CONFIG_DIR bind-mount). Same
+# pattern, proven across scenarios #2/#4/#5/#7.
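+# (Resulting sentinel name, as a sketch: a cluster whose kubeconfig
+# context resolves to "clustermesh-1" writes <SENTINEL_DIR>/ready-clustermesh-1,
+# and the barrier counts distinct ready-* files toward quorum.)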
+# +# Context-name resolution (multi-fallback for robustness): +# 1. Parse `current-context:` from /root/.kube/config directly (the +# file is bind-mounted by run_cl2_command from the host's per-cluster +# kubeconfig). YAML-safe grep + awk; no kubectl dependency. +# 2. `kubectl config current-context` via PATH kubectl. +# 3. Pre-staged kubectl at /root/perf-tests/clusterloader2/config/kubectl. +# 4. Hash of the kubeconfig server URL — guaranteed unique across +# clusters in this mesh (different AKS APIServer URLs). +# 5. Hostname of the pod (CL2 pods get pod-name-suffixed). Last resort. +# +# All diagnostic output goes to STDERR so CL2 streamOutput captures it for +# postmortem. STDOUT only emits the final sentinel path. +# +# Positional args: +# $1 SENTINEL_DIR (required) absolute path; sentinel file lands here + +set -uo pipefail + +SENTINEL_DIR="${1:?sentinel dir required}" +mkdir -p "$SENTINEL_DIR" + +KUBECONFIG_PATH="${KUBECONFIG:-/root/.kube/config}" +PRE_STAGED_KUBECTL="/root/perf-tests/clusterloader2/config/kubectl" + +dbg() { + # Diagnostic logging to stderr — captured by CL2 streamOutput. + echo "write-ready-sentinel: $*" >&2 +} + +CTX="" +RESOLVED_BY="" + +# Method 1: parse kubeconfig directly. +if [ -f "$KUBECONFIG_PATH" ]; then + CTX=$(grep -E '^current-context:' "$KUBECONFIG_PATH" 2>/dev/null \ + | head -1 | awk '{print $2}' | tr -d '"' | tr -d "'" || echo "") + if [ -n "$CTX" ]; then + RESOLVED_BY="kubeconfig-parse" + fi +fi + +# Method 2: PATH kubectl. +if [ -z "$CTX" ] && command -v kubectl >/dev/null 2>&1; then + CTX=$(kubectl config current-context 2>/dev/null || echo "") + if [ -n "$CTX" ]; then + RESOLVED_BY="kubectl-PATH" + fi +fi + +# Method 3: pre-staged kubectl. +if [ -z "$CTX" ] && [ -x "$PRE_STAGED_KUBECTL" ]; then + CTX=$("$PRE_STAGED_KUBECTL" config current-context 2>/dev/null || echo "") + if [ -n "$CTX" ]; then + RESOLVED_BY="kubectl-prestaged" + fi +fi + +# Method 4: hash of server URL (deterministic per cluster; collision-safe +# across the mesh because every AKS has a unique FQDN). +if [ -z "$CTX" ] && [ -f "$KUBECONFIG_PATH" ]; then + _server=$(grep -E '^\s*server:' "$KUBECONFIG_PATH" 2>/dev/null | head -1 \ + | awk '{print $2}' || echo "") + if [ -n "$_server" ]; then + if command -v sha256sum >/dev/null 2>&1; then + _hash=$(echo -n "$_server" | sha256sum | cut -c1-8) + elif command -v md5sum >/dev/null 2>&1; then + _hash=$(echo -n "$_server" | md5sum | cut -c1-8) + else + _hash=$(echo -n "$_server" | od -A n -t x1 | tr -d ' \n' | cut -c1-8) + fi + CTX="srv-${_hash}" + RESOLVED_BY="server-hash" + fi +fi + +# Method 5: pod hostname (CL2 runs each cluster's CL2 in a separate +# docker container with a unique hostname). +if [ -z "$CTX" ]; then + CTX="$(hostname 2>/dev/null || echo "unknown-$$")" + RESOLVED_BY="hostname" +fi + +# DIAGNOSTIC DUMP — always print state so postmortem on quorum failure +# can identify why context was hard to resolve. 
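+# (Key line to grep for in postmortems, e.g.:
+#   "write-ready-sentinel: resolved context = 'clustermesh-1' via kubeconfig-parse")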
+dbg "===== CL2 ready-sentinel diagnostic =====" +dbg "resolved context = '${CTX}' via ${RESOLVED_BY}" +dbg "KUBECONFIG=${KUBECONFIG_PATH} exists=$( [ -f "$KUBECONFIG_PATH" ] && echo yes || echo no )" +if [ -f "$KUBECONFIG_PATH" ]; then + dbg "kubeconfig current-context line: $(grep -E '^current-context:' "$KUBECONFIG_PATH" | head -1 || echo '(none)')" + dbg "kubeconfig server line: $(grep -E '^\s*server:' "$KUBECONFIG_PATH" | head -1 || echo '(none)')" +fi +dbg "PATH=${PATH:-}" +dbg "PATH kubectl: $(command -v kubectl || echo '(none)')" +dbg "pre-staged kubectl exists+exec: $( [ -x "$PRE_STAGED_KUBECTL" ] && echo yes || echo no )" +dbg "hostname: $(hostname 2>/dev/null || echo '(none)')" +dbg "sentinel dir: ${SENTINEL_DIR}" +dbg "================================================" + +# Guard: empty context after every fallback would still cause a path +# collision. Emit a unique fallback name using $$ (PID, unique-per-process). +if [ -z "$CTX" ]; then + CTX="unresolved-$$" + dbg "ERROR: every fallback returned empty; using ${CTX}" +fi + +SENTINEL_FILE="${SENTINEL_DIR}/ready-${CTX}" +touch "$SENTINEL_FILE" +dbg "wrote sentinel ${SENTINEL_FILE}" +echo "$SENTINEL_FILE" +exit 0 diff --git a/modules/python/clusterloader2/clustermesh-scale/scale.py b/modules/python/clusterloader2/clustermesh-scale/scale.py index 35047f122a..56c623083d 100644 --- a/modules/python/clusterloader2/clustermesh-scale/scale.py +++ b/modules/python/clusterloader2/clustermesh-scale/scale.py @@ -1,11 +1,17 @@ """ ClusterMesh scale-test harness. -Single-cluster invocation. The Telescope pipeline fans out by calling this -script once per fleet member (driven by `az fleet clustermeshprofile list-members` -in steps/topology/clustermesh-scale/execute-clusterloader2.yml). Each invocation -emits one JSONL with a `cluster` attribution column so concatenated results from -N clusters are queryable per-cluster downstream. +Per-cluster execute (`scale.py execute`) is single-cluster: it spawns one +ClusterLoader2 docker container against one kubeconfig. The Telescope pipeline +fans out across N clusters; each per-cluster invocation emits one JSONL with a +`cluster` attribution column so concatenated results from N clusters are +queryable per-cluster downstream. + +Multi-cluster fan-out (`scale.py execute-parallel`, Phase 3) bounds parallel +CL2 invocations across the mesh — see `execute_parallel` below for the worker +model. Each parallel worker shells out to `run-cl2-on-cluster.sh` so the +existing per-iteration bash semantics (CL2 run + junit gate + log capture + +failure diag) are preserved exactly per cluster. Phase 1 is intentionally trivial: deploy a small fixed number of pods, no churn, no fortio, no network policies. The goal of Phase 1 is to prove the multi-cluster @@ -15,19 +21,91 @@ parameters to configure/collect. """ import argparse +import concurrent.futures import json import os +import signal +import subprocess +import sys +import tempfile +import threading from datetime import datetime, timezone from clusterloader2.utils import parse_xml_to_json, run_cl2_command, process_cl2_reports +# Phase 4b — Scenario #6 (Upper Bound / Saturation) classifier constants. +# Versioned so downstream Kusto dashboards can compare verdicts across +# tuning iterations. Raw signal values + thresholds are emitted alongside +# the verdict so dashboards can recompute verdicts post-hoc without re- +# running the test if thresholds need calibration. 
+#
+# Thresholds rationale (v1 — first-smoke calibration; revisit after first
+# n=2 green):
+#   latency_p99_ms          — 500ms p99 of cilium_kvstoremesh_kvstore_
+#                             operations_duration. Healthy AKS-managed
+#                             Cilium runs show p99 < 100ms; 5× that is
+#                             the saturation knee.
+#   queue_size_perc99       — 1000 in cilium_kvstoremesh_kvstore_sync_
+#                             queue_size. Steady-state on green pod-churn
+#                             runs is single digits; 3 orders of magnitude
+#                             above noise floor is unambiguously bad.
+#   apiserver_max_cpu_cores — 1.5 cores per clustermesh-apiserver pod
+#                             (ClusterMeshApiserverPodCPU PerPodMax).
+#                             AKS-managed Cilium typically requests
+#                             0.5-1.0 vCPU; saturated >2× allocation = at
+#                             risk of throttling.
+#   mesh_failure_rate_max   — 0.5 reconnect-failures/s. Plan.md deferred
+#                             decision #6 documents the green-run
+#                             baseline of 4-6 reconnects per 36 min run
+#                             ≈ 0.003/s (uniformly distributed across
+#                             peers, benign Fleet churn). 0.5/s = ~150×
+#                             that baseline → real failure burst.
+#   etcd_commit_p99_ms      — 200ms p99 of etcd_debugging_disk_backend_
+#                             commit_write_duration. Etcd's design target
+#                             is single-digit ms; 200ms = backed-up disk
+#                             subsystem.
+SATURATION_CLASSIFIER_VERSION = "saturation-v1"
+SATURATION_THRESHOLDS = {
+    "latency_p99_ms": 500.0,
+    "queue_size_perc99": 1000.0,
+    "apiserver_max_cpu_cores": 1.5,
+    "mesh_failure_rate_max": 0.5,
+    "etcd_commit_p99_ms": 200.0,
+}
+
+
 def configure_clusterloader2(
     namespaces,
     deployments_per_namespace,
     replicas_per_deployment,
     operation_timeout,
     override_file,
+    churn_cycles=5,
+    churn_up_duration="60s",
+    churn_down_duration="60s",
+    kill_duration="10m",
+    kill_interval_seconds=10,
+    kill_batch=5,
+    kill_duration_seconds=600,
+    kill_job_deadline_seconds=660,
+    apiserver_kill_target_context="clustermesh-1",
+    apiserver_kill_recovery_timeout_seconds=240,
+    apiserver_kill_observation_seconds=60,
+    ha_config_replicas=3,
+    node_churn_target_context="clustermesh-1",
+    node_churn_cycles=3,
+    node_churn_delta=5,
+    node_churn_settle_seconds=60,
+    node_churn_scale_duration_seconds=1800,
+    node_churn_replace_duration_seconds=1500,
+    node_churn_combined_duration_seconds=3300,
+    node_replace_batch_size=10,
+    node_churn_ready_timeout_seconds=300,
+    saturation_qps_list="100,500,1500,4000,10000",
+    saturation_restarts_list="2,4,8,15,25",
+    saturation_rung_duration_seconds=240,
+    saturation_settle_seconds=90,
 ):
     with open(override_file, "w", encoding="utf-8") as f:
         # Prometheus stack — keep the Cilium-scrape flags ON so the
         # IS honored as an overrides key and must be >= the request to satisfy
         # k8s admission.
         f.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n")
-        f.write("CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi\n")
+        # Prometheus memory limit. Bumped 2Gi→4Gi 2026-05-15 after build
+        # 67224 showed prometheus-k8s-0 in CrashLoopBackOff on saturation
+        # runs. Then bumped 4Gi→12Gi 2026-05-15 after build 67279
+        # showed Prom STILL OOM'ing at Rung 2 even with 4Gi when the
+        # restart-burst workload pushed too many series/samples.
+        # D8ds_v4 prompool has 32GB RAM so 12Gi is safe with headroom.
+        # CL2_PROMETHEUS_MEMORY_LIMIT is honored as a CL2 overrides key
+        # (unlike the *_FACTOR knobs which are silently broken — see
+        # plan.md "What we built" item 16).
+        f.write("CL2_PROMETHEUS_MEMORY_LIMIT: 12Gi\n")
+        # Pin Prometheus to the dedicated `prompool` node (label
+        # prometheus=true is set in azure-2.tfvars extra_node_pool).
Without # this, prometheus-k8s lands on the default workload pool and @@ -62,6 +149,65 @@ def configure_clusterloader2( f.write(f"CL2_REPLICAS_PER_DEPLOYMENT: {replicas_per_deployment}\n") f.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n") + # Phase 4a — Scenario #2 (Pod Churn Stress) knobs. + # Written unconditionally with defaults so an event-throughput run + # (which doesn't reference these CL2_* params in its template) + # silently ignores them. CL2 does not fail on unknown overrides + # keys, so the cost is a few lines of YAML noise per non-churn run. + # The alternative — splitting configure into per-scenario + # subcommands — would proliferate harness surface area; see + # plan.md Phase 4a notes. + f.write(f"CL2_CHURN_CYCLES: {churn_cycles}\n") + f.write(f"CL2_CHURN_UP_DURATION: {churn_up_duration}\n") + f.write(f"CL2_CHURN_DOWN_DURATION: {churn_down_duration}\n") + f.write(f"CL2_KILL_DURATION: {kill_duration}\n") + f.write(f"CL2_KILL_INTERVAL_SECONDS: {kill_interval_seconds}\n") + f.write(f"CL2_KILL_BATCH: {kill_batch}\n") + f.write(f"CL2_KILL_DURATION_SECONDS: {kill_duration_seconds}\n") + f.write(f"CL2_KILL_JOB_DEADLINE_SECONDS: {kill_job_deadline_seconds}\n") + + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + # Same unconditional-write pattern as the pod-churn knobs above: + # CL2 templates that don't reference these silently ignore. Allows + # share-infra runs where multiple scenarios share one overrides.yaml. + f.write(f"CL2_APISERVER_KILL_TARGET_CONTEXT: {apiserver_kill_target_context}\n") + f.write(f"CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: {apiserver_kill_recovery_timeout_seconds}\n") + f.write(f"CL2_APISERVER_KILL_OBSERVATION_SECONDS: {apiserver_kill_observation_seconds}\n") + + # Phase 4b — Scenario #7 (HA Configuration Validation) knob. + # Single replicas-count override consumed by ha-config.yaml. Other + # scenarios' CL2 configs don't reference it; ignored silently. + f.write(f"CL2_HA_CONFIG_REPLICAS: {ha_config_replicas}\n") + + # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # node-churn-{scale,replace,combined}.yaml each consume a subset. + # node-churner.sh (driven from execute.yml, NOT Method:Exec — CL2 + # image has no az CLI) reads the same matrix vars directly; these + # overrides drive the CL2-side sleep/sentinel window that aligns + # with the churner's wall-clock run. + f.write(f"CL2_NODE_CHURN_TARGET_CONTEXT: {node_churn_target_context}\n") + f.write(f"CL2_NODE_CHURN_CYCLES: {node_churn_cycles}\n") + f.write(f"CL2_NODE_CHURN_DELTA: {node_churn_delta}\n") + f.write(f"CL2_NODE_CHURN_SETTLE_SECONDS: {node_churn_settle_seconds}\n") + f.write(f"CL2_NODE_CHURN_SCALE_DURATION_SECONDS: {node_churn_scale_duration_seconds}\n") + f.write(f"CL2_NODE_CHURN_REPLACE_DURATION_SECONDS: {node_churn_replace_duration_seconds}\n") + f.write(f"CL2_NODE_CHURN_COMBINED_DURATION_SECONDS: {node_churn_combined_duration_seconds}\n") + f.write(f"CL2_NODE_REPLACE_BATCH_SIZE: {node_replace_batch_size}\n") + f.write(f"CL2_NODE_CHURN_READY_TIMEOUT_SECONDS: {node_churn_ready_timeout_seconds}\n") + + # Phase 4b — Scenario #6 (Upper Bound / Saturation) knobs. + # upper-bound.yaml CL2 config consumes these to drive the per-rung + # QPS ramp + restart amplitude. Written unconditionally with the + # same defaulted-pattern as scenario #2-#5 knobs: non-saturation + # CL2 configs simply ignore them (CL2 doesn't fail on unknown + # overrides keys). 
The qps and restarts lists are written as + # comma-separated strings; upper-bound.yaml uses CL2's + # StringSplit template func to parse. + f.write(f"CL2_SATURATION_QPS_LIST: \"{saturation_qps_list}\"\n") + f.write(f"CL2_SATURATION_RESTARTS_LIST: \"{saturation_restarts_list}\"\n") + f.write(f"CL2_SATURATION_RUNG_DURATION_SECONDS: {saturation_rung_duration_seconds}\n") + f.write(f"CL2_SATURATION_SETTLE_SECONDS: {saturation_settle_seconds}\n") + with open(override_file, "r", encoding="utf-8") as f: print(f"Content of file {override_file}:\n{f.read()}") @@ -73,6 +219,7 @@ def execute_clusterloader2( cl2_config_file, kubeconfig, provider, + tear_down_prometheus=False, ): run_cl2_command( kubeconfig, @@ -83,7 +230,13 @@ def execute_clusterloader2( cl2_config_file=cl2_config_file, overrides=True, enable_prometheus=True, - tear_down_prometheus=False, + # Default False preserves the diagnostic-on-failure capability — when + # CL2 fails, run-cl2-on-cluster.sh's FAILURE DIAG block can dump + # prometheus-operator + prometheus-k8s pod logs. Set True in + # share-infra mode (multi-scenario per lifecycle) so each scenario's + # CL2 invocation gets a clean Prometheus deploy and the previous + # scenario's PodMonitor/scrape config doesn't bleed in. + tear_down_prometheus=tear_down_prometheus, scrape_kubelets=True, scrape_ksm=True, scrape_metrics_server=True, @@ -97,6 +250,228 @@ def execute_clusterloader2( ) +# Module-level lock + Popen tracking for execute_parallel. Lock keeps log lines +# atomic across worker threads; the Popen list lets a SIGINT/SIGTERM handler +# terminate live children on cancel (AzDO step cancel, Ctrl-C in dev). +_PARALLEL_STDOUT_LOCK = threading.Lock() +_PARALLEL_LIVE_POPENS = [] +_PARALLEL_LIVE_POPENS_LOCK = threading.Lock() + + +def _emit_prefixed_line(role, line): + # AzDO recognizes ##vso[...] service messages only when they appear at + # column 0 — prefixing them would drop the structured annotation. Emit + # those unprefixed; everything else gets the [role] tag for readability + # under interleaved output. + if line.startswith("##"): + out = line + else: + out = f"[{role}] {line}" + with _PARALLEL_STDOUT_LOCK: + sys.stdout.write(out) + sys.stdout.flush() + + +def _run_one_cluster(role, worker_script, worker_args, env=None): + """Spawn the per-cluster worker script and stream its merged stdout/stderr. + + Returns (role, exit_code). Exit code is the worker script's exit (which + is the authoritative pass/fail per cluster — the script does its own + junit gate + log capture + failure diag). + """ + cmd = ["bash", worker_script, role, *worker_args] + # bufsize=1 + text=True gives us line-buffered text reads so the prefix + # writer sees one CL2 log line at a time. PYTHONUNBUFFERED ensures the + # nested python3 scale.py execute child also flushes per-line. + child_env = os.environ.copy() + if env: + child_env.update(env) + child_env.setdefault("PYTHONUNBUFFERED", "1") + # Not using `with subprocess.Popen(...)` because the Popen handle is + # registered in _PARALLEL_LIVE_POPENS for the SIGINT/SIGTERM handler; + # `with` would close stdout at function exit and cancel signal-based + # termination semantics. The try/finally below handles cleanup. 
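+    # Resulting command shape (illustrative role/paths; args mirror the
+    # worker_args assembled in execute_parallel below):
+    #   bash run-cl2-on-cluster.sh clustermesh-1 <kubeconfig> <report_dir> \
+    #     <cl2_image> <cl2_config_dir> <cl2_config_file> <provider> ...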
+ proc = subprocess.Popen( # pylint: disable=consider-using-with + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=1, + text=True, + env=child_env, + ) + with _PARALLEL_LIVE_POPENS_LOCK: + _PARALLEL_LIVE_POPENS.append(proc) + try: + assert proc.stdout is not None + for line in proc.stdout: + _emit_prefixed_line(role, line) + proc.wait() + finally: + with _PARALLEL_LIVE_POPENS_LOCK: + try: + _PARALLEL_LIVE_POPENS.remove(proc) + except ValueError: + pass + return role, proc.returncode + + +def _install_parallel_signal_handlers(): + """Terminate live worker subprocesses on SIGINT/SIGTERM. + + AzDO step cancel sends SIGTERM. ThreadPoolExecutor will not reap child + processes spawned by its workers, and each worker bash script in turn + spawns `python3 scale.py execute` which spawns a docker container — so + abrupt parent death without explicit teardown can leave orphan docker + containers running. We best-effort terminate the bash workers; the docker + container behind them will exit when its parent python child exits. + """ + def _terminate_all(signum, _frame): + with _PARALLEL_STDOUT_LOCK: + sys.stdout.write( + f"[execute-parallel] received signal {signum}, " + "terminating live workers\n" + ) + sys.stdout.flush() + with _PARALLEL_LIVE_POPENS_LOCK: + for proc in list(_PARALLEL_LIVE_POPENS): + try: + proc.terminate() + except Exception: # pylint: disable=broad-except + pass + # Re-raise default behavior for the original signal so the parent + # exits with the conventional code (128+signum). This also unblocks + # any executor.shutdown(wait=True) waiters. + signal.signal(signum, signal.SIG_DFL) + os.kill(os.getpid(), signum) + + signal.signal(signal.SIGINT, _terminate_all) + signal.signal(signal.SIGTERM, _terminate_all) + + +def execute_parallel( + clusters_file, + max_concurrent, + worker_script, + cl2_image, + cl2_config_dir, + cl2_config_file, + cl2_report_dir_base, + provider, + python_script_file, + python_workdir, + tear_down_prometheus=False, +): + """Fan out CL2 across N clusters with bounded concurrency. + + Each cluster's CL2 + log capture + failure diag runs in its own bash + worker process (run-cl2-on-cluster.sh). At most `max_concurrent` run + in parallel. Per-cluster log capture happens IMMEDIATELY when that + cluster's CL2 finishes — before peer clusters complete — so kubectl + --tail windows and `kubectl get events` recency don't age out. + + The worker script's exit code is the authoritative per-cluster + pass/fail (it does its own junit gate). This function aggregates: + returns 0 iff every worker exited 0; otherwise 1. Matches the + sequential `if failures > 0; exit 1` semantics that execute.yml had + before parallelization, so the AzDO step's pass/fail signal is + unchanged from the user's perspective. + + `clusters_file` schema: a JSON array of objects with at least `role` + and `kubeconfig` fields. Extra fields (e.g. `name`, `rg`) are ignored + so the same JSON file produced by execute.yml's discovery step (which + also feeds collect.yml) can be reused without a separate write. + + Known concurrency risk: `run_cl2_command` mounts `~/.azure` rw into + every CL2 docker container (utils.py:69-70). At max_concurrent > 1 + those containers concurrently read/write the MSAL token cache. If + this causes auth flakes on real 5/10/20-cluster runs, isolate per + worker (TODO Phase 3 follow-up). 
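+    Example clusters_file payload (shape only; kubeconfig paths are
+    illustrative, extra fields such as `name`/`rg` are tolerated):
+        [{"role": "clustermesh-1", "kubeconfig": "/path/one"},
+         {"role": "clustermesh-2", "kubeconfig": "/path/two"}]
+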
+ """ + with open(clusters_file, "r", encoding="utf-8") as f: + clusters = json.load(f) + if not isinstance(clusters, list) or not clusters: + raise ValueError( + f"clusters file {clusters_file} must be a non-empty JSON array" + ) + + # Validate up front so we fail fast before spawning anything. + for idx, c in enumerate(clusters): + if "role" not in c or "kubeconfig" not in c: + raise ValueError( + f"clusters[{idx}] missing 'role' or 'kubeconfig': {c}" + ) + + if max_concurrent < 1: + raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}") + + _install_parallel_signal_handlers() + + print( + f"[execute-parallel] dispatching {len(clusters)} cluster(s) " + f"with max_concurrent={max_concurrent}", + flush=True, + ) + + results = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=max_concurrent + ) as executor: + futures = {} + for c in clusters: + role = c["role"] + kubeconfig = c["kubeconfig"] + report_dir = os.path.join(cl2_report_dir_base, role) + worker_args = [ + kubeconfig, + report_dir, + cl2_image, + cl2_config_dir, + cl2_config_file, + provider, + python_script_file, + python_workdir, + # Last positional: 1 = tear down Prometheus at end of CL2 (used + # by share-infra mode so the next scenario's CL2 deploys a + # fresh Prom); 0 = preserve Prom for failure-diagnostic dump. + "1" if tear_down_prometheus else "0", + ] + fut = executor.submit( + _run_one_cluster, role, worker_script, worker_args + ) + futures[fut] = role + + for fut in concurrent.futures.as_completed(futures): + role = futures[fut] + try: + _, exit_code = fut.result() + except Exception as e: # pylint: disable=broad-except + # Worker raised before producing an exit code (e.g. could not + # spawn bash). Treat as a failure for that cluster — surface + # the error and continue collecting peers. + print( + f"[execute-parallel] {role}: worker raised: {e}", + flush=True, + ) + results.append((role, 1)) + else: + results.append((role, exit_code)) + + failed = [r for r, code in results if code != 0] + succeeded = [r for r, code in results if code == 0] + print( + f"[execute-parallel] summary: {len(succeeded)} succeeded, " + f"{len(failed)} failed (max_concurrent={max_concurrent})", + flush=True, + ) + if failed: + print( + f"[execute-parallel] failed clusters: {', '.join(sorted(failed))}", + flush=True, + ) + return 1 + return 0 + + def collect_clusterloader2( cl2_report_dir, cloud_info, @@ -112,6 +487,14 @@ def collect_clusterloader2( deployments_per_namespace, replicas_per_deployment, trigger_reason="", + churn_cycles=0, + churn_up_duration="", + churn_down_duration="", + kill_duration_seconds=0, + kill_interval_seconds=0, + kill_batch=0, + saturation_qps_list="", + saturation_restarts_list="", ): details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2) json_data = json.loads(details) @@ -145,6 +528,17 @@ def collect_clusterloader2( "deployments_per_namespace": deployments_per_namespace, "replicas_per_deployment": replicas_per_deployment, "pods_per_cluster": namespaces * deployments_per_namespace * replicas_per_deployment, + # Phase 4a — pod-churn knobs. Defaults are 0/"" for non-churn + # test_types so existing Kusto queries that don't reference + # these fields stay valid. For pod-churn runs these record the + # exact stressor parameters so historical comparisons survive + # default changes. 
+ "churn_cycles": churn_cycles, + "churn_up_duration": churn_up_duration, + "churn_down_duration": churn_down_duration, + "kill_duration_seconds": kill_duration_seconds, + "kill_interval_seconds": kill_interval_seconds, + "kill_batch": kill_batch, "details": ( testsuites[0]["testcases"][0].get("failure", None) if testsuites[0].get("testcases") @@ -163,13 +557,720 @@ def collect_clusterloader2( "namespaces": namespaces, "deployments_per_namespace": deployments_per_namespace, "replicas_per_deployment": replicas_per_deployment, + "churn_cycles": churn_cycles, + "kill_duration_seconds": kill_duration_seconds, + "kill_interval_seconds": kill_interval_seconds, + "kill_batch": kill_batch, } - content = process_cl2_reports(cl2_report_dir, template) + # Shared process_cl2_reports() does an unconditional open() on every + # entry of cl2_report_dir, which raises IsADirectoryError on any subdir. + # Today the only subdir is logs/ (created by run-cl2-on-cluster.sh for + # pod-log capture), but we stash ANY subdir so future additions (new + # diag dumps, CL2 version bump emitting per-phase subdirs, etc.) don't + # silently regress. Subdirs are relocated OUTSIDE cl2_report_dir for + # the duration of the parse and restored in a finally block — they + # must end up back inside cl2_report_dir so the pipeline-level + # artifact publish picks them up alongside junit.xml. + stash_root = None + stashed_entries = [] + for entry in os.listdir(cl2_report_dir): + if os.path.isdir(os.path.join(cl2_report_dir, entry)): + if stash_root is None: + stash_root = tempfile.mkdtemp(prefix="cl2-report-stash-") + os.rename( + os.path.join(cl2_report_dir, entry), + os.path.join(stash_root, entry), + ) + stashed_entries.append(entry) + try: + content = process_cl2_reports(cl2_report_dir, template) + finally: + if stash_root: + for entry in stashed_entries: + src = os.path.join(stash_root, entry) + if os.path.isdir(src): + os.rename(src, os.path.join(cl2_report_dir, entry)) + if not os.listdir(stash_root): + os.rmdir(stash_root) os.makedirs(os.path.dirname(result_file), exist_ok=True) with open(result_file, "w", encoding="utf-8") as f: f.write(content) + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) timing pickup. + # apiserver-failure-killer.sh writes ApiserverFailureTimings_.json + # at the target cluster's report dir with t0/t1/duration. Non-target + # clusters skip writing the file. process_cl2_reports() doesn't recognize + # this file pattern, so we emit the row explicitly here. One row per + # timing file (always exactly one — only the target cluster writes one). + _emit_apiserver_failure_timing_rows(cl2_report_dir, template, result_file) + + # Phase 4b — Scenario #7 (HA Configuration Validation) scaling pickup. + # ha-config-scaler.sh writes HAConfigScalingTimings_.json on + # EVERY cluster (not just the kill target) — HA scaling is mesh-wide. + # One row per cluster. + _emit_ha_config_scaling_rows(cl2_report_dir, template, result_file) + + # Phase 4b — Scenario #3 (Node Churn / IP Churn) timing pickup. + # node-churner.sh writes NodeChurnTimings_.json into the + # TARGET cluster's per-cluster report dir (the churner runs from + # execute.yml on the AzDO agent, not inside CL2 — see plan.md scenario #3 + # design). One row per recorded op (scale_up / scale_down / replace_drain / + # replace_delete / replace_wait). Non-target clusters skip writing the + # file → no rows emitted for them. 
+    _emit_node_churn_timing_rows(cl2_report_dir, template, result_file)
+
+    # Phase 4b — Scenario #6 (Upper Bound / Saturation) classifier rows.
+    # Reads per-rung GenericPrometheusQuery output JSONs (one per measurement
+    # × rung; CL2 emits them with the rung's suffix in the Identifier and
+    # filename), applies the saturation classifier to each rung, and emits
+    # one SaturationRung row per rung + one SaturationSummary row per
+    # cluster. No-op when saturation_qps_list is empty (i.e. not an
+    # upper-bound test_type) so non-saturation scenarios pay zero overhead.
+    _emit_saturation_profile_rows(
+        cl2_report_dir, template, result_file,
+        saturation_qps_list, saturation_restarts_list,
+    )
+
+
+def _emit_saturation_profile_rows(
+    cl2_report_dir, template, result_file,
+    saturation_qps_list, saturation_restarts_list,
+):
+    """Append SaturationRung + SaturationSummary JSONL rows.
+
+    Reads per-rung GenericPrometheusQuery output JSONs (CL2-emitted, format
+    {"version": "v1", "dataItems": [{"labels": {"Metric": <name>},
+    "data": {"value": <number>}}, ...]}) and applies the classifier.
+
+    Args:
+        cl2_report_dir: per-cluster report directory.
+        template: row template (cluster/mesh_size/etc. already filled in).
+        result_file: per-cluster JSONL output path (appended).
+        saturation_qps_list: comma-separated QPS values, one per rung.
+                             Empty string → not an upper-bound run → no-op.
+        saturation_restarts_list: comma-separated restart counts, one per
+                                  rung. Length must match qps_list; if not,
+                                  missing entries default to 1.
+
+    Emitted rows (one per rung + one per cluster summary):
+        SaturationRung: {
+            "rung_index": int,
+            "configured_qps": int,
+            "configured_restarts": int,
+            "classifier_version": str,
+            "thresholds": {<signal>: float},
+            "verdict": str,  # clean | latency_spike | queue_unbounded |
+                             # cpu_exhaust | mesh_failure_burst | etcd_tail
+            "dominant_signal_ratio": float,
+            "rung_completed": bool,
+            "measurement_missing": [str],
+            "signals": {<signal>: float|None},
+            "all_verdicts": {<criterion>: float},  # ratio observed/threshold
+        }
+        SaturationSummary: {
+            "rungs_configured": int,
+            "rungs_completed": int,
+            "max_clean_qps": int|None,  # highest QPS in contiguous clean prefix
+            "first_failure_rung_index": int|None,
+            "first_failure_qps": int|None,
+            "first_failure_mode": str|None,
+            "second_failure_mode": str|None,
+            "classifier_version": str,
+        }
+    """
+    if not saturation_qps_list:
+        return  # Not an upper-bound run; no-op.
+    try:
+        qps_list = [int(x) for x in saturation_qps_list.split(",") if x.strip()]
+    except ValueError as e:
+        print(
+            f"[collect] WARN: malformed saturation_qps_list "
+            f"{saturation_qps_list!r}: {e}; skipping saturation classifier",
+            file=sys.stderr,
+        )
+        return
+    if not qps_list:
+        return
+    try:
+        restarts_list = [
+            int(x) for x in (saturation_restarts_list or "").split(",")
+            if x.strip()
+        ]
+    except ValueError:
+        restarts_list = []
+    # Pad/truncate restarts_list to match qps_list length. Missing entries
+    # default to 1 (the smallest meaningful restart count). Excess entries
+    # are ignored.
+    while len(restarts_list) < len(qps_list):
+        restarts_list.append(1)
+    restarts_list = restarts_list[: len(qps_list)]
+
+    if not os.path.isdir(cl2_report_dir):
+        print(
+            f"[collect] WARN: saturation classifier: report dir "
+            f"{cl2_report_dir} does not exist",
+            file=sys.stderr,
+        )
+        return
+    all_files = os.listdir(cl2_report_dir)
+
+    # Proactive debug: dump the full list of rung-suffixed measurement files
+    # so postmortem doesn't depend on the AzDO step's stdout being preserved.
+ # User direction 2026-05-14: assume failure, keep debug logs baked in + # until n=2 + n=20 are green; strip after. + # + # Match BOTH filename conventions: + # prod: "GenericPrometheusQuery <metricName> Rung<i>_<testName>_<timestamp>.json" + # (space between method and metricName; verified build 67211) + # compact: "GenericPrometheusQuery_<metricName>Rung<i>_<testName>_<timestamp>.json" + # (no spaces; legacy mock convention) + # Pre-fix (build 67221) the diagnostic counted only compact-form files, + # so we'd see "0 found" even when files DID land via prod-form (the + # _find_file lookup correctly accepts both, but the diagnostic was + # misleading). Fix: count any GenericPrometheusQuery*.json with Rung + # in the name. + rung_files_seen = sorted([ + f for f in all_files + if f.startswith("GenericPrometheusQuery") + and "Rung" in f + and f.endswith(".json") + ]) + print( + f"[collect] saturation: classifier starting for " + f"qps_list={qps_list} restarts_list={restarts_list}", + file=sys.stderr, + ) + print( + f"[collect] saturation: cl2_report_dir={cl2_report_dir} " + f"total_files_in_dir={len(all_files)} " + f"rung_files_matching_pattern={len(rung_files_seen)}", + file=sys.stderr, + ) + # Print ALL files (not just rung ones) so if the prefix matcher has any + # encoding/whitespace surprise, the raw listing reveals it. + for fname in all_files[:30]: + print(f"[collect] saturation: listdir: {fname!r}", file=sys.stderr) + if len(all_files) > 30: + print( + f"[collect] saturation: ... and {len(all_files) - 30} more", + file=sys.stderr, + ) + + def _read_metric(filepath, metric_label): + """Return the numeric `value` for a given Metric label, or None. + + Supports BOTH known CL2 dataItem shapes: + + (A) CL2 GenericPrometheusQuery — one dataItem with all query + results as named keys in `data` (verified against build 67224): + {"dataItems": [{"data": {"Max": 0, "Perc99": 0.5}, "unit": "#"}]} + The metric_label is the query name from the YAML + (Max / Perc50 / Perc99 / etc.) and is looked up directly as a + dict key inside item.data. + + (B) Legacy / PodStartupLatency-style — one dataItem per metric, + with labels.Metric naming the metric and data.value holding + the number: + {"dataItems": [ + {"labels": {"Metric": "Perc99"}, "data": {"value": 0.5}} + ]} + + Returns the first match across all dataItems. None if the label + isn't present in any item or the file can't be parsed. + """ + try: + with open(filepath, "r", encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {filepath}: {e}", + file=sys.stderr, + ) + return None + for item in data.get("dataItems", []) or []: + item_data = item.get("data") or {} + # Format A: query name (e.g. "Perc99") is a direct key in + # item.data. The value is the scalar number (not a {"value": N} + # wrapper). Skip dict-valued entries so we don't accidentally + # match a legacy nested structure. + if metric_label in item_data and not isinstance( + item_data[metric_label], (dict, list) + ): + val = item_data[metric_label] + if val is None or val == "": + return None + try: + return float(val) + except (TypeError, ValueError): + return None + # Format B: labels.Metric carries the query name, data.value + # carries the scalar number. Backward-compatible with existing + # mock fixtures (PodStartupLatency mock_data).
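To make the two accepted shapes concrete, a synthetic sketch (payload values invented; the toy lookup mirrors the documented rules, it is not the real _read_metric):

```python
# Format A: one dataItem, query names as direct keys in `data`.
format_a = {"version": "v1",
            "dataItems": [{"data": {"Max": 3.0, "Perc99": 0.5}, "unit": "#"}]}
# Format B: one dataItem per metric, labels.Metric + data.value.
format_b = {"version": "v1",
            "dataItems": [{"labels": {"Metric": "Perc99"},
                           "data": {"value": 0.5}}]}

def toy_lookup(payload, metric_label):
    for item in payload.get("dataItems", []):
        data = item.get("data") or {}
        # Format A: direct key, scalar value only.
        if metric_label in data and not isinstance(data[metric_label], (dict, list)):
            return float(data[metric_label])
        # Format B: labels.Metric names the metric, data.value holds it.
        if (item.get("labels") or {}).get("Metric") == metric_label:
            return float(data["value"])
    return None

assert toy_lookup(format_a, "Perc99") == toy_lookup(format_b, "Perc99") == 0.5
```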
+ labels = item.get("labels") or {} + if labels.get("Metric") == metric_label: + val = item_data.get("value") + if val is None or val == "": + return None + try: + return float(val) + except (TypeError, ValueError): + return None + return None + + def _find_file(rung_suffix, metric_name_prefix): + """Locate the CL2-emitted JSON for a given metricName prefix and + rung suffix. CL2's actual file pattern (verified against build 67211) + is: + GenericPrometheusQuery <metricName> <suffix>_<testName>_<timestamp>.json + + e.g. for metricName "ClusterMesh Kvstore Sync Queue Size {{$suffix}}" + with suffix=Rung0: + GenericPrometheusQuery ClusterMesh Kvstore Sync Queue Size Rung0_clustermesh-upper-bound_2026-05-15T02:20:27Z.json + + We match on the production format primarily, with a fallback to the + compact-no-space underscore format + GenericPrometheusQuery_<metricName><suffix>_<testName>_<timestamp>.json + for backward compat with mock fixtures + any other CL2 versions + that strip spaces. + """ + # Production format (build 67211 confirmed): space-separated, suffix + # immediately follows metric name with a space (because the YAML + # template `metricName: <metric name> {{$suffix}}` keeps the space). + prod_target = f"GenericPrometheusQuery {metric_name_prefix} {rung_suffix}_" + # Mock/compact fallback: drop spaces, no leading space after method. + compact_metric = metric_name_prefix.replace(" ", "") + compact_target = f"GenericPrometheusQuery_{compact_metric}{rung_suffix}_" + matches = [ + f for f in all_files + if (f.startswith(prod_target) or f.startswith(compact_target)) + and f.endswith(".json") + ] + if matches: + return os.path.join(cl2_report_dir, matches[0]) + return None + + # Signal name → (metricName-from-YAML, metric-label, transform). + # The metricName is the YAML's `metricName:` field text (space-separated), + # which is what CL2 embeds in the emitted filename. Build 67211 verified + # the production filename pattern. + # + # Transform converts the measurement's native unit into the classifier's + # threshold unit (seconds → milliseconds where applicable). + signal_map = { + "latency_p99_ms": ( + "ClusterMesh Kvstore Operation Duration", "Perc99", + lambda v: v * 1000.0, + ), + "queue_size_perc99": ( + "ClusterMesh Kvstore Sync Queue Size", "Perc99", + lambda v: v, + ), + "queue_size_max": ( + "ClusterMesh Kvstore Sync Queue Size", "Max", + lambda v: v, + ), + "apiserver_max_cpu_cores": ( + "ClusterMesh APIServer Pod CPU", "PerPodMax", + lambda v: v, + ), + "mesh_failure_rate_max": ( + "ClusterMesh Remote Cluster Failure Rate", "Max", + lambda v: v, + ), + "etcd_commit_p99_ms": ( + "ClusterMesh Etcd Backend Write Duration", "Perc99", + lambda v: v * 1000.0, + ), + "observed_event_rate_p99": ( + "ClusterMesh Kvstore Events Rate", "Perc99", + lambda v: v, + ), + } + # Criterion → signal-name driving the verdict. Each criterion's ratio + # is observed/threshold; ≥1.0 = tripped. Dominant criterion = the + # tripped one with the highest ratio.
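A worked example of the ratio rule just described (threshold and observed numbers invented, not the real SATURATION_THRESHOLDS):

```python
thresholds = {"latency_p99_ms": 500.0, "queue_size_perc99": 1000.0}  # invented
observed = {"latency_p99_ms": 650.0, "queue_size_perc99": 4000.0}    # invented

ratios = {sig: observed[sig] / thresholds[sig] for sig in thresholds}
# {"latency_p99_ms": 1.3, "queue_size_perc99": 4.0}: both tripped (>= 1.0)
tripped = {sig: r for sig, r in ratios.items() if r >= 1.0}
dominant = max(tripped, key=tripped.get) if tripped else None
assert dominant == "queue_size_perc99"  # highest observed/threshold ratio wins
```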
+ criteria = { + "latency_spike": "latency_p99_ms", + "queue_unbounded": "queue_size_perc99", + "cpu_exhaust": "apiserver_max_cpu_cores", + "mesh_failure_burst": "mesh_failure_rate_max", + "etcd_tail": "etcd_commit_p99_ms", + } + + rungs_completed = 0 + first_failure_index = None + first_failure_qps = None + first_failure_mode = None + second_failure_mode = None + max_clean_qps = None + clean_streak_broken = False + + with open(result_file, "a", encoding="utf-8") as out: + for rung_idx, qps in enumerate(qps_list): + suffix = f"Rung{rung_idx}" + restarts = restarts_list[rung_idx] + + signals = {} + measurement_missing = [] + for sig_name, (ident, metric_label, transform) in signal_map.items(): + fpath = _find_file(suffix, ident) + if fpath is None: + signals[sig_name] = None + measurement_missing.append(sig_name) + continue + raw = _read_metric(fpath, metric_label) + if raw is None: + signals[sig_name] = None + measurement_missing.append(sig_name) + else: + signals[sig_name] = transform(raw) + + # Rung "completed" iff at least one signal landed AND the + # latency signal landed (proxy for "the rung executed and CL2 + # gathered measurements for it"). Tuned conservatively so a + # half-collected rung is flagged for re-investigation rather + # than silently summarized. + rung_completed = ( + signals.get("latency_p99_ms") is not None + and len(measurement_missing) < len(signal_map) + ) + if rung_completed: + rungs_completed += 1 + + # Compute per-criterion ratios. None signals = criterion + # skipped (cannot contribute to verdict). + all_verdicts = {} + for criterion, sig_name in criteria.items(): + v = signals.get(sig_name) + if v is None: + continue + threshold = SATURATION_THRESHOLDS[ + sig_name if sig_name in SATURATION_THRESHOLDS + else "latency_p99_ms" # never hits — defensive + ] + if threshold <= 0: + continue + all_verdicts[criterion] = v / threshold + + tripped = {c: r for c, r in all_verdicts.items() if r >= 1.0} + if tripped: + verdict = max(tripped, key=tripped.get) + dominant_ratio = tripped[verdict] + elif (not rung_completed and rungs_completed > 0): + # Phase 4b — Scenario #6 monitoring_oom verdict (added + # 2026-05-15 after build 67279 showed Prometheus crashed + # mid-run at Rung 2-3, losing all measurements for those + # rungs). When an earlier rung completed but the current + # rung's measurements all came back empty, the most likely + # explanation is that the monitoring stack (Prometheus + # pod) ran out of memory / went CrashLoopBackOff under + # the elevated workload pressure of the higher rung. + # That IS a saturation finding per spec line 113 + # ("Resource exhaustion occurs") — record it as a real + # verdict instead of silently leaving the rung as + # verdict=clean rung_completed=False which underclaims + # the failure. + # + # Synthetic dominant_signal_ratio=999.0 so dashboards + # ordering verdicts by severity rank this above other + # tripped criteria. The actual signal that drove the + # OOM (CPU, memory, query queue, cardinality explosion) + # is NOT distinguishable from blob output alone — needs + # Prom pod logs to triage. + verdict = "monitoring_oom" + dominant_ratio = 999.0 + else: + verdict = "clean" + dominant_ratio = max(all_verdicts.values()) if all_verdicts else 0.0 + + # Track per-cluster summary fields. max_clean_qps is the + # highest qps in a CONTIGUOUS clean+completed prefix — once + # a non-clean rung lands we stop extending it (a brief + # later-rung "false clean" shouldn't disqualify the genuine + # earlier failure). 
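And the contiguous-clean-prefix rule in one synthetic pass (the verdict sequence is invented, and every rung is assumed completed for brevity):

```python
# (qps, verdict) per rung: clean, clean, failure, then a later "false clean".
sweep = [(100, "clean"), (500, "clean"), (1500, "queue_unbounded"), (4000, "clean")]

max_clean_qps, streak_broken = None, False
for qps, verdict in sweep:
    if verdict == "clean" and not streak_broken:
        max_clean_qps = qps
    else:
        streak_broken = True
assert max_clean_qps == 500  # the rung-3 "clean" does not extend the prefix
```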
+ if verdict == "clean" and rung_completed and not clean_streak_broken: + if max_clean_qps is None or qps > max_clean_qps: + max_clean_qps = qps + else: + clean_streak_broken = True + if verdict != "clean": + if first_failure_index is None: + first_failure_index = rung_idx + first_failure_qps = qps + first_failure_mode = verdict + elif (second_failure_mode is None + and verdict != first_failure_mode): + second_failure_mode = verdict + + rung_row = json.loads(json.dumps(template)) + rung_row["measurement"] = "SaturationRung" + rung_row["group"] = "upper-bound" + rung_row["result"] = { + "data": { + "rung_index": rung_idx, + "configured_qps": qps, + "configured_restarts": restarts, + "classifier_version": SATURATION_CLASSIFIER_VERSION, + "thresholds": SATURATION_THRESHOLDS, + "verdict": verdict, + "dominant_signal_ratio": dominant_ratio, + "rung_completed": rung_completed, + "measurement_missing": measurement_missing, + "signals": signals, + "all_verdicts": all_verdicts, + }, + "unit": "verdict", + } + out.write(json.dumps(rung_row) + "\n") + + # Per-rung stderr summary: greppable line for AzDO postmortem + # ("collect saturation rung=2 verdict=queue_unbounded ratio=5.0"). + # Counts signals found out of expected so partial rungs surface. + print( + f"[collect] saturation: rung={rung_idx} qps={qps} " + f"restarts={restarts} verdict={verdict} " + f"dominant_ratio={dominant_ratio:.3f} " + f"completed={rung_completed} " + f"signals_found={len(signal_map) - len(measurement_missing)}/{len(signal_map)} " + f"missing={measurement_missing}", + file=sys.stderr, + ) + + summary_row = json.loads(json.dumps(template)) + summary_row["measurement"] = "SaturationSummary" + summary_row["group"] = "upper-bound" + summary_row["result"] = { + "data": { + "rungs_configured": len(qps_list), + "rungs_completed": rungs_completed, + "max_clean_qps": max_clean_qps, + "first_failure_rung_index": first_failure_index, + "first_failure_qps": first_failure_qps, + "first_failure_mode": first_failure_mode, + "second_failure_mode": second_failure_mode, + "configured_qps_list": qps_list, + "configured_restarts_list": restarts_list, + "classifier_version": SATURATION_CLASSIFIER_VERSION, + "thresholds": SATURATION_THRESHOLDS, + }, + "unit": "verdict", + } + out.write(json.dumps(summary_row) + "\n") + + # Stderr summary for AzDO postmortem; greppable headline line. + print( + f"[collect] saturation: SUMMARY rungs_completed={rungs_completed}/{len(qps_list)} " + f"max_clean_qps={max_clean_qps} " + f"first_failure_qps={first_failure_qps} " + f"first_failure_mode={first_failure_mode} " + f"second_failure_mode={second_failure_mode} " + f"classifier_version={SATURATION_CLASSIFIER_VERSION}", + file=sys.stderr, + ) + + +def _emit_node_churn_timing_rows(cl2_report_dir, template, result_file): + """Append one JSONL row per recorded op in NodeChurnTimings_*.json. 
+ + File shape (from node-churner.sh): + { + "target_context": str, + "target_cluster_name": str, + "target_resource_group": str, + "target_nodepool": str, + "scenario": "node-churn-scale" | "node-churn-replace" | "node-churn-combined", + "original_node_count": int, + "ready_quorum_reached": bool, + "cleanup_failed": bool, + "scenario_valid": bool, // false if a circuit-breaker fired + "truncated": bool, // true if churner ran past CL2 sleep + "started_epoch": int, + "ended_epoch": int, + "duration_seconds": int, + "ops": [ + { + "op_index": int, + "op_type": "scale_up"|"scale_down"|"replace_drain"|"replace_delete"|"replace_refill"|"replace_wait", + "start_epoch": int, + "end_epoch": int, + "duration_seconds": int, + "succeeded": bool, + "observed_node_count": int, + "pre_ip_set": [str], // only populated on replace_wait + "post_ip_set": [str], + "pre_node_names": [str], // only populated on replace_wait + "post_node_names": [str], + "new_ip_count": int, // INFORMATIONAL — Azure VNet allocator + // reuses freed IPs immediately so this + // may be 0 even after successful replacement + "new_node_count": int, // AUTHORITATIVE replacement signal — + // VMSS instance IDs are monotonic so node + // names always differ after replacement + "error": str // empty on success + }, ... + ] + } + + Each op becomes one row in the JSONL with + measurement="NodeChurnOpTiming", group=<scenario>, and result.data = the + per-op JSON, PLUS scenario-level fields copied onto result.data for + cross-row context (scenario_valid, cleanup_failed, truncated, etc.). + A scenario-level summary row with measurement="NodeChurnSummary" is also + emitted so Kusto queries can detect cleanup_failed / scenario_valid=false + runs without joining op rows. One summary row per timing file. + """ + timing_files = [ + f for f in os.listdir(cl2_report_dir) + if f.startswith("NodeChurnTimings_") and f.endswith(".json") + ] + if not timing_files: + return + scenario_level_keys = ( + "scenario", "target_context", "target_cluster_name", + "target_resource_group", "target_nodepool", + "original_node_count", "ready_quorum_reached", "cleanup_failed", + "scenario_valid", "truncated", "started_epoch", "ended_epoch", + "duration_seconds", + ) + with open(result_file, "a", encoding="utf-8") as out: + for tf in timing_files: + tf_path = os.path.join(cl2_report_dir, tf) + try: + with open(tf_path, "r", encoding="utf-8") as tfh: + timing_data = json.load(tfh) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {tf_path}: {e}", + file=sys.stderr, + ) + continue + scenario_context = { + k: timing_data.get(k) for k in scenario_level_keys + } + # One summary row per file — always emitted, even if ops list is + # empty (e.g., quorum never reached → churner aborted before any op). + summary_row = json.loads(json.dumps(template)) + summary_row["measurement"] = "NodeChurnSummary" + summary_row["group"] = timing_data.get("scenario", "node-churn") + summary_row["result"] = { + "data": { + **scenario_context, + "op_count": len(timing_data.get("ops") or []), + }, + "unit": "seconds", + } + out.write(json.dumps(summary_row) + "\n") + # One row per op, with scenario_context merged onto result.data so + # a single Kusto filter (e.g., scenario_valid=true) gates op-level + # analysis without needing a join.
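A sketch of the no-join consumption pattern this enables (the JSONL path is hypothetical; measurement and field names as documented above):

```python
import json

with open("results/mesh-1.jsonl", encoding="utf-8") as f:  # hypothetical path
    rows = [json.loads(line) for line in f if line.strip()]

# One flat filter gates op-level analysis: no join back to summary rows.
valid_ops = [r for r in rows
             if r.get("measurement") == "NodeChurnOpTiming"
             and r["result"]["data"].get("scenario_valid")]
replace_waits = [op["result"]["data"]["duration_seconds"]
                 for op in valid_ops
                 if op["result"]["data"]["op_type"] == "replace_wait"]
```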
+ for op in timing_data.get("ops") or []: + op_row = json.loads(json.dumps(template)) + op_row["measurement"] = "NodeChurnOpTiming" + op_row["group"] = timing_data.get("scenario", "node-churn") + op_row["result"] = { + "data": {**scenario_context, **op}, + "unit": "seconds", + } + out.write(json.dumps(op_row) + "\n") + + +def _emit_apiserver_failure_timing_rows(cl2_report_dir, template, result_file): + """Append one JSONL row per ApiserverFailureTimings_*.json found. + + The timing file shape (from apiserver-failure-killer.sh): + { + "target_context": str, + "t0_kill_epoch": int, + "t1_recovered_epoch": int, + "recovery_duration_seconds": int, + "recovered": bool, + "killed_pod_name": str, + "killed_pod_uid": str, + "replacement_pod_uid": str, + "note": str + } + + Each timing file becomes one row in the JSONL with + measurement="ApiserverFailureRecoveryTiming", group="apiserver-failure", + and result.data = the timing JSON. Downstream Kusto queries can filter + on this measurement name to get per-run recovery timings keyed by + test_type=apiserver-failure + cluster. + """ + timing_files = [ + f for f in os.listdir(cl2_report_dir) + if f.startswith("ApiserverFailureTimings_") and f.endswith(".json") + ] + if not timing_files: + return + with open(result_file, "a", encoding="utf-8") as out: + for tf in timing_files: + tf_path = os.path.join(cl2_report_dir, tf) + try: + with open(tf_path, "r", encoding="utf-8") as tfh: + timing_data = json.load(tfh) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {tf_path}: {e}", + file=sys.stderr, + ) + continue + # Deep-copy template so we don't mutate the shared dict for any + # downstream caller. + row = json.loads(json.dumps(template)) + row["measurement"] = "ApiserverFailureRecoveryTiming" + row["group"] = "apiserver-failure" + row["result"] = {"data": timing_data, "unit": "seconds"} + out.write(json.dumps(row) + "\n") + + +def _emit_ha_config_scaling_rows(cl2_report_dir, template, result_file): + """Append one JSONL row per HAConfigScalingTimings_*.json found. + + The scaling file shape (from ha-config-scaler.sh): + { + "context": str, + "action": "scale-up" | "scale-down", + "requested_replicas": int, + "spec_replicas_after": int, + "ready_replicas_after": int, + "ha_replicas_honored": bool, + "scale_duration_seconds": int, + "note": str + } + + Each file becomes one row in the JSONL with + measurement="HAConfigScalingTiming", group="ha-config", and + result.data = the scaling JSON. Only scale-up emits a file; scale-down + is best-effort cleanup that does NOT overwrite the scale-up file. + Downstream Kusto queries can filter on measurement="HAConfigScalingTiming" + and ha_replicas_honored=true to scope HA A/B comparisons to runs where + the scale actually stuck (ENO operator did not revert). 
+ """ + timing_files = [ + f for f in os.listdir(cl2_report_dir) + if f.startswith("HAConfigScalingTimings_") and f.endswith(".json") + ] + if not timing_files: + return + with open(result_file, "a", encoding="utf-8") as out: + for tf in timing_files: + tf_path = os.path.join(cl2_report_dir, tf) + try: + with open(tf_path, "r", encoding="utf-8") as tfh: + scaling_data = json.load(tfh) + except (OSError, json.JSONDecodeError) as e: + print( + f"[collect] WARN: failed to read {tf_path}: {e}", + file=sys.stderr, + ) + continue + row = json.loads(json.dumps(template)) + row["measurement"] = "HAConfigScalingTiming" + row["group"] = "ha-config" + row["result"] = {"data": scaling_data, "unit": "seconds"} + out.write(json.dumps(row) + "\n") + def main(): parser = argparse.ArgumentParser(description="ClusterMesh scale-test harness.") @@ -183,6 +1284,132 @@ def main(): pc.add_argument("--operation-timeout", type=str, default="15m") pc.add_argument("--cl2_override_file", type=str, required=True, help="Path to the overrides of CL2 config file") + # Phase 4a — Scenario #2 (Pod Churn Stress) knobs. Defaults match the + # pipeline matrix defaults so a configure invocation that doesn't pass + # these still writes valid overrides for both pod-churn-scale.yaml and + # pod-churn-kill.yaml. + pc.add_argument("--churn-cycles", type=int, default=5, + help="Number of scale-up/down cycles (pod-churn-scale).") + pc.add_argument("--churn-up-duration", type=str, default="60s", + help="Sleep between scale-up and next scale-down (pod-churn-scale).") + pc.add_argument("--churn-down-duration", type=str, default="60s", + help="Sleep between scale-down and next scale-up (pod-churn-scale).") + pc.add_argument("--kill-duration", type=str, default="10m", + help="Total kill-loop duration as a human string (logged only). " + "The runtime is bounded by --kill-duration-seconds.") + pc.add_argument("--kill-interval-seconds", type=int, default=10, + help="Seconds between successive kill rounds (pod-churn-kill).") + pc.add_argument("--kill-batch", type=int, default=5, + help="Pods deleted per round (pod-churn-kill).") + pc.add_argument("--kill-duration-seconds", type=int, default=600, + help="Killer Job script runtime in seconds (pod-churn-kill).") + pc.add_argument("--kill-job-deadline-seconds", type=int, default=660, + help="Killer Job activeDeadlineSeconds — defense-in-depth bound, " + "should be kill_duration_seconds plus a small buffer.") + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + pc.add_argument("--apiserver-kill-target-context", type=str, default="clustermesh-1", + help="kubectl context name of the cluster whose clustermesh-apiserver " + "to kill. Other clusters no-op (per-cluster CL2 with shared overrides).") + pc.add_argument("--apiserver-kill-recovery-timeout-seconds", type=int, default=240, + help="How long to wait for the replacement clustermesh-apiserver pod " + "to reach Ready after kill. AKS-managed Cilium can take " + "120-180s in our observed runs (image pull + ENI attach); " + "240s gives headroom. Killer fails soft on timeout — writes " + "timing JSON with recovered:false instead of erroring.") + pc.add_argument("--apiserver-kill-observation-seconds", type=int, default=60, + help="Sleep duration AFTER the kill returns, before measurement gather. " + "Lets peer clusters' Prometheus scrape the failure window and " + "the post-recovery backlog drain.") + # Phase 4b — Scenario #7 (HA Configuration Validation) knob. 
+ pc.add_argument("--ha-config-replicas", type=int, default=3, + help="Target replicas count for clustermesh-apiserver Deployment " + "during the ha-config scenario. Each cluster scales its own " + "Deployment to this count before measurements start, then back " + "to 1 after gather. Default 3 (standard k8s HA, etcd quorum-friendly).") + # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # CL2 templates that don't reference these silently ignore (same pattern + # as the apiserver / ha-config knobs). node-churner.sh consumes them via + # matrix-exported env vars in execute.yml — NOT via these overrides. + pc.add_argument("--node-churn-target-context", type=str, default="clustermesh-1", + help="kubectl context name of the cluster whose default nodepool " + "is scaled / replaced. Other clusters observe via CL2. " + "Reuses the apiserver-failure target convention.") + pc.add_argument("--node-churn-cycles", type=int, default=3, + help="Number of scale-up/down cycles in node-churn-scale. " + "Each cycle does ONE scale-up by --node-churn-delta then ONE " + "scale-down by the same delta with --node-churn-settle-seconds " + "between ops. 3 cycles × 2 ops × ~4min/op = ~24min wall.") + pc.add_argument("--node-churn-delta", type=int, default=5, + help="Per-half-cycle scale delta. +N on scale-up, -N on scale-down. " + "Default 5 → 20→25→20 cycles. Bounded above by AKS vCPU quota.") + pc.add_argument("--node-churn-settle-seconds", type=int, default=60, + help="Sleep between consecutive nodepool ops to let cilium " + "reconcile node identities + endpoints before next op.") + pc.add_argument("--node-churn-scale-duration-seconds", type=int, default=1800, + help="CL2-side sleep window for node-churn-scale.yaml. Must be " + "≥ expected churner wall time + settle margin. 1800s = 30min " + "covers 3-cycle scale at ~24min churner wall.") + pc.add_argument("--node-churn-replace-duration-seconds", type=int, default=1500, + help="CL2-side sleep window for node-churn-replace.yaml. " + "1500s = 25min covers VMSS-delete-and-replace of ~10 instances " + "in parallel (each drain+replace ~5-10min, parallelized).") + pc.add_argument("--node-churn-combined-duration-seconds", type=int, default=3300, + help="CL2-side sleep window for node-churn-combined.yaml " + "(scale phase + replace phase serially). Sum of the two " + "individual windows plus margin.") + pc.add_argument("--node-replace-batch-size", type=int, default=10, + help="Number of VMSS instances to drain+delete in the replace " + "scenario. AKS auto-replaces to restore the desired count, " + "yielding K new VMs with new IPs. 10 of 20 default nodes = " + "50%% pool replacement; bounded above by --max-surge fraction " + "Cilium can tolerate without endpoint floods saturating the mesh.") + pc.add_argument("--node-churn-ready-timeout-seconds", type=int, default=300, + help="How long node-churner.sh waits for per-cluster CL2 ready " + "sentinels before starting the first nodepool op. If quorum " + "(all clusters' sentinels) isn't reached within this window, " + "the churner aborts WITH cleanup (restores pool to original " + "node count) and marks scenario_valid=false in the timing JSON.") + # Phase 4b — Scenario #6 (Upper Bound / Saturation) knobs. + # Each upper-bound CL2 run sweeps through N rungs of progressively + # heavier load (QPS × restart count). The classifier in collect emits + # one SaturationRung row per rung tagging which signal tripped + # (clean | latency_spike | queue_unbounded | cpu_exhaust | + # mesh_failure_burst | etcd_tail). 
See SATURATION_THRESHOLDS at the + # top of this module + plan.md Scenario #6 section. + pc.add_argument("--saturation-qps-list", type=str, default="100,500,1500,4000,10000", + help="Comma-separated list of QPS values, one per saturation " + "rung. Length determines number of rungs; CL2's " + "upper-bound.yaml parses this via StringSplit. " + "Default is a 5-rung sweep (100, 500, 1500, 4000, 10000 " + "calls/sec) — bumped 2026-05-15 after build 67224 showed " + "all signals at 1-15%% of thresholds at the prior top rung " + "(qps=160, restarts=4). QPS above ~100 is effectively " + "uncapped for our 20-deployment workload (CL2 apply " + "throughput is the ceiling, not QPS itself); " + "saturation_restarts_list is the real load lever.") + pc.add_argument("--saturation-restarts-list", type=str, default="2,4,8,15,25", + help="Comma-separated list of restart counts, one per saturation " + "rung (length must match --saturation-qps-list). Each rung's " + "workload is restart-bursted this many times so cumulative " + "event volume scales with rung index even when CL2's " + "Deployment-apply QPS saturates. Restart count is the " + "primary load lever: each restart triggers ~200 pod recreates " + "(at n=2 with 200-pod workload), each emitting endpoint + " + "identity + service events through the mesh.") + pc.add_argument("--saturation-rung-duration-seconds", type=int, default=240, + help="Wall-clock duration each rung holds after its restart-burst " + "before measurements are gathered. Drives the per-rung " + "measurement window (CL2 substitutes %%v in queries with " + "wall time since the matching `start` action). Bumped " + "180s\u2192240s 2026-05-15 to give higher rungs time to " + "accumulate meaningful signal at the post-burst tail.") + pc.add_argument("--saturation-settle-seconds", type=int, default=90, + help="Sleep between rungs so kvstore queues from rung r drain " + "before rung r+1's measurement window opens. Insufficient " + "settle biases later rungs' verdicts toward `queue_unbounded` " + "even if the queue would have drained on its own. Bumped " + "60s\u219290s 2026-05-15 since higher restart bursts take " + "longer to fully drain queues.") # execute pe = subparsers.add_parser("execute", help="Run CL2 against a single cluster") @@ -192,6 +1419,39 @@ def main(): pe.add_argument("--cl2-config-file", type=str, required=True) pe.add_argument("--kubeconfig", type=str, required=True) pe.add_argument("--provider", type=str, required=True) + pe.add_argument("--tear-down-prometheus", action="store_true", + help="Tear down Prometheus stack at end of CL2 (set in share-infra " + "mode so the next scenario's CL2 can deploy a fresh Prom). 
" + "Default is to preserve Prom for failure-diagnostic dumping.") + + # execute-parallel — fan out CL2 across N clusters with bounded concurrency + pep = subparsers.add_parser( + "execute-parallel", + help="Run CL2 across multiple clusters with bounded concurrency", + ) + pep.add_argument("--clusters", type=str, required=True, + help="Path to JSON file containing array of cluster objects, " + "each with at least 'role' and 'kubeconfig' fields") + pep.add_argument("--max-concurrent", type=int, default=4, + help="Maximum number of CL2 invocations to run in parallel") + pep.add_argument("--worker-script", type=str, required=True, + help="Path to per-cluster bash worker (run-cl2-on-cluster.sh)") + pep.add_argument("--cl2-image", type=str, required=True) + pep.add_argument("--cl2-config-dir", type=str, required=True) + pep.add_argument("--cl2-config-file", type=str, required=True) + pep.add_argument("--cl2-report-dir-base", type=str, required=True, + help="Base directory; per-cluster reports land at //") + pep.add_argument("--provider", type=str, required=True) + pep.add_argument("--python-script-file", type=str, required=True, + help="Path to this scale.py — invoked by the worker script " + "via `python3 execute ...`") + pep.add_argument("--python-workdir", type=str, required=True, + help="Working dir for the nested python execute call " + "(typically modules/python so PYTHONPATH resolves)") + pep.add_argument("--tear-down-prometheus", action="store_true", + help="Pass through to each per-cluster CL2 invocation; used in " + "share-infra mode where multiple scenarios share infra and " + "each needs a clean Prometheus deploy.") # collect pco = subparsers.add_parser("collect", help="Collect results for one cluster") @@ -213,6 +1473,27 @@ def main(): pco.add_argument("--deployments-per-namespace", type=int, required=True) pco.add_argument("--replicas-per-deployment", type=int, required=True) pco.add_argument("--trigger_reason", type=str, default="") + # Phase 4a — pod-churn knobs recorded into the JSONL for historical + # comparison. Optional; default to 0/"" so non-churn test_types + # (event-throughput, default-config) don't need to set them. + pco.add_argument("--churn-cycles", type=int, default=0) + pco.add_argument("--churn-up-duration", type=str, default="") + pco.add_argument("--churn-down-duration", type=str, default="") + pco.add_argument("--kill-duration-seconds", type=int, default=0) + pco.add_argument("--kill-interval-seconds", type=int, default=0) + pco.add_argument("--kill-batch", type=int, default=0) + # Phase 4b — Scenario #6 (Upper Bound / Saturation) collect knobs. + # Optional; default to empty string so non-saturation test_types skip + # the classifier entirely (zero overhead). For upper-bound test_types, + # collect.yml plumbs the matrix-configured saturation_qps_list + + # saturation_restarts_list into these args so the classifier records + # the actual QPS and restart values that drove each rung. + pco.add_argument("--saturation-qps-list", type=str, default="", + help="Comma-separated QPS values from the upper-bound run. 
" + "Empty = not an upper-bound run; classifier is no-op.") + pco.add_argument("--saturation-restarts-list", type=str, default="", + help="Comma-separated restart counts from the upper-bound run " + "(length must match --saturation-qps-list).") args = parser.parse_args() @@ -223,6 +1504,31 @@ def main(): args.replicas_per_deployment, args.operation_timeout, args.cl2_override_file, + churn_cycles=args.churn_cycles, + churn_up_duration=args.churn_up_duration, + churn_down_duration=args.churn_down_duration, + kill_duration=args.kill_duration, + kill_interval_seconds=args.kill_interval_seconds, + kill_batch=args.kill_batch, + kill_duration_seconds=args.kill_duration_seconds, + kill_job_deadline_seconds=args.kill_job_deadline_seconds, + apiserver_kill_target_context=args.apiserver_kill_target_context, + apiserver_kill_recovery_timeout_seconds=args.apiserver_kill_recovery_timeout_seconds, + apiserver_kill_observation_seconds=args.apiserver_kill_observation_seconds, + ha_config_replicas=args.ha_config_replicas, + node_churn_target_context=args.node_churn_target_context, + node_churn_cycles=args.node_churn_cycles, + node_churn_delta=args.node_churn_delta, + node_churn_settle_seconds=args.node_churn_settle_seconds, + node_churn_scale_duration_seconds=args.node_churn_scale_duration_seconds, + node_churn_replace_duration_seconds=args.node_churn_replace_duration_seconds, + node_churn_combined_duration_seconds=args.node_churn_combined_duration_seconds, + node_replace_batch_size=args.node_replace_batch_size, + node_churn_ready_timeout_seconds=args.node_churn_ready_timeout_seconds, + saturation_qps_list=args.saturation_qps_list, + saturation_restarts_list=args.saturation_restarts_list, + saturation_rung_duration_seconds=args.saturation_rung_duration_seconds, + saturation_settle_seconds=args.saturation_settle_seconds, ) elif args.command == "execute": execute_clusterloader2( @@ -232,7 +1538,23 @@ def main(): args.cl2_config_file, args.kubeconfig, args.provider, + tear_down_prometheus=args.tear_down_prometheus, + ) + elif args.command == "execute-parallel": + rc = execute_parallel( + clusters_file=args.clusters, + max_concurrent=args.max_concurrent, + worker_script=args.worker_script, + cl2_image=args.cl2_image, + cl2_config_dir=args.cl2_config_dir, + cl2_config_file=args.cl2_config_file, + cl2_report_dir_base=args.cl2_report_dir_base, + provider=args.provider, + python_script_file=args.python_script_file, + python_workdir=args.python_workdir, + tear_down_prometheus=args.tear_down_prometheus, ) + sys.exit(rc) elif args.command == "collect": collect_clusterloader2( args.cl2_report_dir, @@ -249,6 +1571,14 @@ def main(): args.deployments_per_namespace, args.replicas_per_deployment, args.trigger_reason, + churn_cycles=args.churn_cycles, + churn_up_duration=args.churn_up_duration, + churn_down_duration=args.churn_down_duration, + kill_duration_seconds=args.kill_duration_seconds, + kill_interval_seconds=args.kill_interval_seconds, + kill_batch=args.kill_batch, + saturation_qps_list=args.saturation_qps_list, + saturation_restarts_list=args.saturation_restarts_list, ) else: parser.print_help() diff --git a/modules/python/tests/mock_data/.gitignore b/modules/python/tests/mock_data/.gitignore new file mode 100644 index 0000000000..49abfda49a --- /dev/null +++ b/modules/python/tests/mock_data/.gitignore @@ -0,0 +1,8 @@ +# Mock fixture log files are intentionally checked in (synthetic content, +# bytes-small) so test_clustermesh_scale's TestMockFixtureParity can verify +# the mock matches what run-cl2-on-cluster.sh 
produces in real runs. +# Without this exception the root *.log ignore strips them, the parity test +# fails locally on a fresh clone, and collect_clusterloader2 tests don't +# exercise the logs/-subdir-present shape — the exact gap that let an +# IsADirectoryError land in CI. +!*.log diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-agent.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-agent.log new file mode 100644 index 0000000000..ac2b9403b1 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-agent.log @@ -0,0 +1 @@ +# synthetic cilium-agent.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-operator.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-operator.log new file mode 100644 index 0000000000..2d665012b3 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/cilium-operator.log @@ -0,0 +1 @@ +# synthetic cilium-operator.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-apiserver.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-apiserver.log new file mode 100644 index 0000000000..786823cedc --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-apiserver.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-apiserver.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-etcd.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-etcd.log new file mode 100644 index 0000000000..620dc1d5e0 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-etcd.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-etcd.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-kvstoremesh.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-kvstoremesh.log new file mode 100644 index 0000000000..ae2fb8cd9c --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/logs/clustermesh-apiserver-kvstoremesh.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-kvstoremesh.log for mesh-1 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-agent.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-agent.log new file mode 100644 index 0000000000..2e0dda9c48 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-agent.log @@ -0,0 +1 @@ +# synthetic cilium-agent.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-operator.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-operator.log new file mode 100644 index 0000000000..e4b00b1cc9 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/cilium-operator.log @@ -0,0 +1 @@ +# synthetic cilium-operator.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-apiserver.log 
b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-apiserver.log new file mode 100644 index 0000000000..af21cefef0 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-apiserver.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-apiserver.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-etcd.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-etcd.log new file mode 100644 index 0000000000..5422124e72 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-etcd.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-etcd.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-kvstoremesh.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-kvstoremesh.log new file mode 100644 index 0000000000..279d5da2e5 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/logs/clustermesh-apiserver-kvstoremesh.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-kvstoremesh.log for mesh-2 (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-agent.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-agent.log new file mode 100644 index 0000000000..d5c76f10b4 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-agent.log @@ -0,0 +1 @@ +# synthetic cilium-agent.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-operator.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-operator.log new file mode 100644 index 0000000000..c404208c5c --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/cilium-operator.log @@ -0,0 +1 @@ +# synthetic cilium-operator.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-apiserver.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-apiserver.log new file mode 100644 index 0000000000..ab1ad57a6a --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-apiserver.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-apiserver.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-etcd.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-etcd.log new file mode 100644 index 0000000000..01e52d4c6d --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-etcd.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-etcd.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-kvstoremesh.log b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-kvstoremesh.log new file mode 100644 index 0000000000..6e347842d5 --- /dev/null +++ 
b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/logs/clustermesh-apiserver-kvstoremesh.log @@ -0,0 +1 @@ +# synthetic clustermesh-apiserver-kvstoremesh.log for mesh-fail (mock fixture) diff --git a/modules/python/tests/test_clustermesh_scale.py b/modules/python/tests/test_clustermesh_scale.py index 0b9dd7510e..afb42522ac 100644 --- a/modules/python/tests/test_clustermesh_scale.py +++ b/modules/python/tests/test_clustermesh_scale.py @@ -11,11 +11,17 @@ this, downstream Kusto queries cannot group/filter by cluster across the mesh. """ import importlib.util +import io import json import os +import shutil import sys import tempfile +import threading +import time import unittest +from contextlib import redirect_stdout +from glob import glob from pathlib import Path from unittest.mock import patch @@ -41,6 +47,71 @@ os.path.dirname(__file__), "mock_data", "clustermesh-scale", "report" ) +# Files/dirs that run-cl2-on-cluster.sh writes into every per-cluster +# $report_dir. Any new artifact added there MUST be mirrored in +# mock_data/clustermesh-scale/report/mesh-*/ so the local test suite +# exercises the same shape collect_clusterloader2 sees in real runs. +# The TestMockFixtureParity class below enforces this. +EXPECTED_PER_CLUSTER_ARTIFACTS = { + "files": ["junit.xml"], + "file_globs": ["*.json"], + "subdirs": ["logs"], + "logs_files": [ + "clustermesh-apiserver-apiserver.log", + "clustermesh-apiserver-etcd.log", + "clustermesh-apiserver-kvstoremesh.log", + "cilium-agent.log", + "cilium-operator.log", + ], +} + + +class TestMockFixtureParity(unittest.TestCase): + """Mock data must mirror the real run-cl2-on-cluster.sh output layout. + + Without this, collect_clusterloader2 tests can pass against a stale + mock while real runs crash on shapes the mock doesn't include — + exactly the IsADirectoryError on logs/ regression that triggered + adding this guard. 
+ """ + + def _assert_cluster_dir_shape(self, cluster_dir): + for fname in EXPECTED_PER_CLUSTER_ARTIFACTS["files"]: + self.assertTrue( + os.path.isfile(os.path.join(cluster_dir, fname)), + f"{cluster_dir}: missing required file {fname}", + ) + for pattern in EXPECTED_PER_CLUSTER_ARTIFACTS["file_globs"]: + self.assertTrue( + glob(os.path.join(cluster_dir, pattern)), + f"{cluster_dir}: no file matches {pattern}", + ) + for sd in EXPECTED_PER_CLUSTER_ARTIFACTS["subdirs"]: + self.assertTrue( + os.path.isdir(os.path.join(cluster_dir, sd)), + f"{cluster_dir}: missing required subdir {sd}/ " + f"(run-cl2-on-cluster.sh writes this; " + f"keep the mock in sync so collect tests stay realistic)", + ) + log_dir = os.path.join(cluster_dir, "logs") + for lf in EXPECTED_PER_CLUSTER_ARTIFACTS["logs_files"]: + self.assertTrue( + os.path.isfile(os.path.join(log_dir, lf)), + f"{log_dir}: missing log file {lf}", + ) + + def test_mesh_1_mock_matches_engine_output(self): + """mesh-1 mock has the same shape as a real per-cluster report dir.""" + self._assert_cluster_dir_shape(os.path.join(MOCK_REPORT_ROOT, "mesh-1")) + + def test_mesh_2_mock_matches_engine_output(self): + """mesh-2 mock has the same shape as a real per-cluster report dir.""" + self._assert_cluster_dir_shape(os.path.join(MOCK_REPORT_ROOT, "mesh-2")) + + def test_mesh_fail_mock_matches_engine_output(self): + """mesh-fail mock has the same shape as a real per-cluster report dir.""" + self._assert_cluster_dir_shape(os.path.join(MOCK_REPORT_ROOT, "mesh-fail")) + class TestConfigureClustermeshScale(unittest.TestCase): """configure_clusterloader2 writes the CL2 overrides file the pipeline expects.""" @@ -72,7 +143,7 @@ def test_overrides_file_contents(self): # Prometheus pod to the dedicated `prompool` node defined in # azure-2.tfvars (label prometheus=true). self.assertIn("CL2_PROMETHEUS_TOLERATE_MASTER: true", content) - self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi", content) + self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT: 12Gi", content) self.assertIn('CL2_PROMETHEUS_NODE_SELECTOR: "prometheus: \\"true\\""', content) self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true", content) self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true", content) @@ -110,6 +181,872 @@ def test_overrides_file_timeout_passthrough(self): finally: os.remove(tmp_path) + def test_overrides_file_emits_phase4a_pod_churn_defaults(self): + """Every CL2_* knob the pod-churn-{scale,kill}.yaml templates read must + be emitted by configure_clusterloader2, even when not passed explicitly — + so an event-throughput run that omits the churn args still produces + a valid overrides file that pod-churn templates would accept. + + Defaults must match the documented Phase 4a defaults in plan.md. + """ + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + # pod-churn-scale knobs. + self.assertIn("CL2_CHURN_CYCLES: 5", content) + self.assertIn("CL2_CHURN_UP_DURATION: 60s", content) + self.assertIn("CL2_CHURN_DOWN_DURATION: 60s", content) + # pod-churn-kill knobs. 
+ self.assertIn("CL2_KILL_DURATION: 10m", content) + self.assertIn("CL2_KILL_INTERVAL_SECONDS: 10", content) + self.assertIn("CL2_KILL_BATCH: 5", content) + self.assertIn("CL2_KILL_DURATION_SECONDS: 600", content) + # Job deadline must exceed kill_duration so the activeDeadlineSeconds + # safety net never fires before the killer's own time check. + self.assertIn("CL2_KILL_JOB_DEADLINE_SECONDS: 660", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_pod_churn_overrides_passthrough(self): + """Explicit churn args override the defaults in the overrides file.""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + operation_timeout="20m", + override_file=tmp_path, + churn_cycles=3, + churn_up_duration="30s", + churn_down_duration="45s", + kill_duration="5m", + kill_interval_seconds=15, + kill_batch=3, + kill_duration_seconds=300, + kill_job_deadline_seconds=360, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_CHURN_CYCLES: 3", content) + self.assertIn("CL2_CHURN_UP_DURATION: 30s", content) + self.assertIn("CL2_CHURN_DOWN_DURATION: 45s", content) + self.assertIn("CL2_KILL_DURATION: 5m", content) + self.assertIn("CL2_KILL_INTERVAL_SECONDS: 15", content) + self.assertIn("CL2_KILL_BATCH: 3", content) + self.assertIn("CL2_KILL_DURATION_SECONDS: 300", content) + self.assertIn("CL2_KILL_JOB_DEADLINE_SECONDS: 360", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_apiserver_failure_defaults(self): + """Phase 4b — Scenario #4 (APIServer Failure) knobs landed in overrides + with the documented defaults. + + Same unconditional-write pattern as churn knobs: every configure call + writes these keys so a future event-throughput run with this overrides + file still produces a valid (if unused) override set for the apiserver + templates. 
+ """ + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_APISERVER_KILL_TARGET_CONTEXT: clustermesh-1", content) + self.assertIn("CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: 240", content) + self.assertIn("CL2_APISERVER_KILL_OBSERVATION_SECONDS: 60", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_apiserver_failure_overrides_passthrough(self): + """Explicit apiserver-failure args override the defaults.""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + apiserver_kill_target_context="clustermesh-5", + apiserver_kill_recovery_timeout_seconds=180, + apiserver_kill_observation_seconds=90, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_APISERVER_KILL_TARGET_CONTEXT: clustermesh-5", content) + self.assertIn("CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS: 180", content) + self.assertIn("CL2_APISERVER_KILL_OBSERVATION_SECONDS: 90", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_ha_config_replicas_default(self): + """ha-config replicas default to 3 (standard k8s HA).""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_HA_CONFIG_REPLICAS: 3", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_ha_config_replicas_passthrough(self): + """Explicit ha_config_replicas overrides the default.""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ha_config_replicas=5, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_HA_CONFIG_REPLICAS: 5", content) + finally: + os.remove(tmp_path) + + +class TestApiserverFailureTimingPickup(unittest.TestCase): + """collect_clusterloader2 appends a row from ApiserverFailureTimings_*.json + if it finds one in the report dir. This is the Phase 4b mechanism for + surfacing the killer script's recorded timestamps into the JSONL — vanilla + process_cl2_reports() doesn't recognize the file pattern. + """ + + def test_timing_file_appends_row(self): + with tempfile.TemporaryDirectory() as tmp: + # Copy the mock report dir so we can add a timing file alongside. 
+ src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + timing_path = os.path.join( + report_dir, "ApiserverFailureTimings_clustermesh-1.json" + ) + with open(timing_path, "w", encoding="utf-8") as f: + json.dump({ + "target_context": "clustermesh-1", + "t0_kill_epoch": 1746000000, + "t1_recovered_epoch": 1746000035, + "recovery_duration_seconds": 35, + "recovered": True, + "killed_pod_name": "clustermesh-apiserver-abc", + "killed_pod_uid": "old-uid", + "replacement_pod_uid": "new-uid", + "note": "ok", + }, f) + + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="apf-test", + run_url="", + result_file=result_file, + test_type="apiserver-failure", + start_timestamp="2026-05-12T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n")] + # At least one ApiserverFailureRecoveryTiming row appended + timing_rows = [ + r for r in lines + if r.get("measurement") == "ApiserverFailureRecoveryTiming" + ] + self.assertEqual(len(timing_rows), 1) + tr = timing_rows[0] + self.assertEqual(tr["group"], "apiserver-failure") + self.assertEqual(tr["test_type"], "apiserver-failure") + self.assertEqual(tr["cluster"], "mesh-1") + self.assertEqual(tr["result"]["unit"], "seconds") + data = tr["result"]["data"] + self.assertEqual(data["target_context"], "clustermesh-1") + self.assertEqual(data["recovery_duration_seconds"], 35) + self.assertTrue(data["recovered"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_no_timing_file_means_no_extra_row(self): + """Non-target clusters skip writing the timing file; collect must not + emit any ApiserverFailureRecoveryTiming row for those clusters. + """ + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-2"), + cloud_info="", + run_id="apf-test-no-timing", + run_url="", + result_file=result_file, + test_type="apiserver-failure", + start_timestamp="2026-05-12T20:00:00Z", + cluster_name="mesh-2", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + timing_rows = [ + r for r in lines + if r.get("measurement") == "ApiserverFailureRecoveryTiming" + ] + self.assertEqual(len(timing_rows), 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestHAConfigScalingTimingPickup(unittest.TestCase): + """collect_clusterloader2 appends a row from HAConfigScalingTimings_*.json + if it finds one in the report dir. ha-config-scaler.sh writes the file + on every cluster (not just target) — mesh-wide HA scaling. 
+ """ + def test_scaling_file_appends_row(self): + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + scaling_path = os.path.join( + report_dir, "HAConfigScalingTimings_clustermesh-1.json" + ) + with open(scaling_path, "w", encoding="utf-8") as f: + json.dump({ + "context": "clustermesh-1", + "action": "scale-up", + "requested_replicas": 3, + "spec_replicas_after": 3, + "ready_replicas_after": 3, + "ha_replicas_honored": True, + "scale_duration_seconds": 42, + "note": "ok", + }, f) + + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="ha-test", + run_url="", + result_file=result_file, + test_type="ha-config", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n")] + scaling_rows = [ + r for r in lines + if r.get("measurement") == "HAConfigScalingTiming" + ] + self.assertEqual(len(scaling_rows), 1) + sr = scaling_rows[0] + self.assertEqual(sr["group"], "ha-config") + self.assertEqual(sr["test_type"], "ha-config") + self.assertEqual(sr["cluster"], "mesh-1") + self.assertEqual(sr["result"]["unit"], "seconds") + data = sr["result"]["data"] + self.assertEqual(data["requested_replicas"], 3) + self.assertEqual(data["spec_replicas_after"], 3) + self.assertTrue(data["ha_replicas_honored"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_no_scaling_file_means_no_extra_row(self): + """Without a scaling JSON, no HAConfigScalingTiming row is emitted + (covers the non-ha-config scenario case, where the scaler isn't run). + """ + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-2"), + cloud_info="", + run_id="ha-test-no-scaling", + run_url="", + result_file=result_file, + test_type="event-throughput", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-2", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + scaling_rows = [ + r for r in lines + if r.get("measurement") == "HAConfigScalingTiming" + ] + self.assertEqual(len(scaling_rows), 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestConfigureNodeChurnKnobs(unittest.TestCase): + """Phase 4b — Scenario #3 (Node Churn / IP Churn) overrides flow through + configure_clusterloader2 and land in the CL2 overrides file with the + expected CL2_NODE_CHURN_* keys. 
+ """ + + def test_node_churn_defaults_emitted(self): + """Defaults match scale.py argparse + node-churner.sh expectations.""" + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_NODE_CHURN_TARGET_CONTEXT: clustermesh-1", content) + self.assertIn("CL2_NODE_CHURN_CYCLES: 3", content) + self.assertIn("CL2_NODE_CHURN_DELTA: 5", content) + self.assertIn("CL2_NODE_CHURN_SETTLE_SECONDS: 60", content) + self.assertIn("CL2_NODE_CHURN_SCALE_DURATION_SECONDS: 1800", content) + self.assertIn("CL2_NODE_CHURN_REPLACE_DURATION_SECONDS: 1500", content) + self.assertIn("CL2_NODE_CHURN_COMBINED_DURATION_SECONDS: 3300", content) + self.assertIn("CL2_NODE_REPLACE_BATCH_SIZE: 10", content) + self.assertIn("CL2_NODE_CHURN_READY_TIMEOUT_SECONDS: 300", content) + finally: + os.remove(tmp_path) + + def test_node_churn_overrides_passthrough(self): + """Explicit kwargs override defaults; per-tier matrix overrides land.""" + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + node_churn_target_context="clustermesh-7", + node_churn_cycles=5, + node_churn_delta=3, + node_churn_settle_seconds=90, + node_churn_scale_duration_seconds=2400, + node_churn_replace_duration_seconds=2000, + node_churn_combined_duration_seconds=4500, + node_replace_batch_size=8, + node_churn_ready_timeout_seconds=180, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn("CL2_NODE_CHURN_TARGET_CONTEXT: clustermesh-7", content) + self.assertIn("CL2_NODE_CHURN_CYCLES: 5", content) + self.assertIn("CL2_NODE_CHURN_DELTA: 3", content) + self.assertIn("CL2_NODE_CHURN_SETTLE_SECONDS: 90", content) + self.assertIn("CL2_NODE_CHURN_SCALE_DURATION_SECONDS: 2400", content) + self.assertIn("CL2_NODE_CHURN_REPLACE_DURATION_SECONDS: 2000", content) + self.assertIn("CL2_NODE_CHURN_COMBINED_DURATION_SECONDS: 4500", content) + self.assertIn("CL2_NODE_REPLACE_BATCH_SIZE: 8", content) + self.assertIn("CL2_NODE_CHURN_READY_TIMEOUT_SECONDS: 180", content) + finally: + os.remove(tmp_path) + + +class TestNodeChurnTimingPickup(unittest.TestCase): + """collect_clusterloader2 appends one NodeChurnSummary row + one + NodeChurnOpTiming row per op from NodeChurnTimings_*.json. node-churner.sh + writes the file ONLY in the target cluster's report dir (the script runs + on the host, not inside CL2; the file lives in the target's per-cluster + report dir so the existing per-cluster collect pickup works). 
+ """ + + def _write_timing(self, report_dir, target_context, ops=None, + scenario="node-churn-combined", + ready_quorum_reached=True, + scenario_valid=True, cleanup_failed=False, + truncated=False): + ops = ops or [] + path = os.path.join(report_dir, f"NodeChurnTimings_{target_context}.json") + with open(path, "w", encoding="utf-8") as f: + json.dump({ + "scenario": scenario, + "target_context": target_context, + "target_cluster_name": target_context, + "target_resource_group": "test-rg", + "target_nodepool": "default", + "target_node_resource_group": f"MC_test-rg_{target_context}_eastus2", + "target_vmss": "aks-default-12345", + "original_node_count": 20, + "ready_quorum_reached": ready_quorum_reached, + "scenario_valid": scenario_valid, + "cleanup_failed": cleanup_failed, + "truncated": truncated, + "started_epoch": 1746000000, + "ended_epoch": 1746001500, + "duration_seconds": 1500, + "ops": ops, + }, f) + return path + + def test_timing_file_emits_summary_and_op_rows(self): + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + self._write_timing(report_dir, "clustermesh-1", ops=[ + { + "op_index": 1, "op_type": "scale_up", + "start_epoch": 1746000010, "end_epoch": 1746000200, + "duration_seconds": 190, "succeeded": True, + "observed_node_count": 25, + "pre_ip_set": [], "post_ip_set": [], "new_ip_count": 0, + "error": "", + }, + { + "op_index": 2, "op_type": "scale_down", + "start_epoch": 1746000260, "end_epoch": 1746000450, + "duration_seconds": 190, "succeeded": True, + "observed_node_count": 20, + "pre_ip_set": [], "post_ip_set": [], "new_ip_count": 0, + "error": "", + }, + { + "op_index": 3, "op_type": "replace_wait", + "start_epoch": 1746000500, "end_epoch": 1746001100, + "duration_seconds": 600, "succeeded": True, + "observed_node_count": 20, + "pre_ip_set": ["10.1.0.4", "10.1.0.19"], + "post_ip_set": ["10.1.0.4", "10.1.0.19"], + "pre_node_names": ["aks-default-vmss000004", "aks-default-vmss00000j"], + "post_node_names": ["aks-default-vmss000004", "aks-default-vmss00000k"], + "new_ip_count": 0, + "new_node_count": 1, + "error": "", + }, + ]) + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="nc-test", + run_url="", + result_file=result_file, + test_type="node-churn-combined", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(summary), 1) + self.assertEqual(len(ops), 3) + s = summary[0] + self.assertEqual(s["group"], "node-churn-combined") + self.assertEqual(s["test_type"], "node-churn-combined") + self.assertEqual(s["cluster"], "mesh-1") + self.assertEqual(s["result"]["data"]["op_count"], 3) + self.assertEqual(s["result"]["data"]["original_node_count"], 20) + self.assertTrue(s["result"]["data"]["ready_quorum_reached"]) + self.assertTrue(s["result"]["data"]["scenario_valid"]) + # ops sorted by op_index + op_types = [o["result"]["data"]["op_type"] for o in ops] + self.assertEqual(set(op_types), {"scale_up", "scale_down", 
"replace_wait"}) + # scenario-level context merged onto op rows + for op_row in ops: + self.assertEqual(op_row["result"]["data"]["scenario"], "node-churn-combined") + self.assertEqual(op_row["result"]["data"]["target_context"], "clustermesh-1") + # replace_wait op carries IP set + node name deltas. + # Build 67155: new_ip_count is informational (Azure can reuse IPs); + # new_node_count is the authoritative replacement signal. + replace = [o for o in ops if o["result"]["data"]["op_type"] == "replace_wait"][0] + self.assertEqual(replace["result"]["data"]["new_ip_count"], 0) + self.assertEqual(replace["result"]["data"]["new_node_count"], 1, + "node name delta is the authoritative replacement signal") + self.assertIn("aks-default-vmss00000k", + replace["result"]["data"]["post_node_names"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_timing_file_with_empty_ops_emits_summary_only(self): + """Ready-quorum-never-reached case: timing file exists with ops=[], + scenario_valid=false. Summary row still emitted so Kusto can detect + the aborted run; no op rows.""" + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + self._write_timing( + report_dir, "clustermesh-1", ops=[], + ready_quorum_reached=False, scenario_valid=False, + ) + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="nc-test-abort", + run_url="", + result_file=result_file, + test_type="node-churn-scale", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(summary), 1) + self.assertEqual(len(ops), 0) + self.assertFalse(summary[0]["result"]["data"]["ready_quorum_reached"]) + self.assertFalse(summary[0]["result"]["data"]["scenario_valid"]) + self.assertEqual(summary[0]["result"]["data"]["op_count"], 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_timing_file_with_cleanup_failed_marks_summary(self): + """If node-churner finalizer can't restore the pool, cleanup_failed=true. 
+ execute.yml uses this to break the share-infra loop; collect must still + emit the summary row with cleanup_failed=true visible.""" + with tempfile.TemporaryDirectory() as tmp: + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + self._write_timing( + report_dir, "clustermesh-1", + ops=[{ + "op_index": 1, "op_type": "scale_up", + "start_epoch": 1746000010, "end_epoch": 1746000200, + "duration_seconds": 190, "succeeded": False, + "observed_node_count": 0, + "pre_ip_set": [], "post_ip_set": [], "new_ip_count": 0, + "error": "OperationNotAllowed", + }], + cleanup_failed=True, scenario_valid=False, + ) + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info="", + run_id="nc-test-cleanup", + run_url="", + result_file=result_file, + test_type="node-churn-combined", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + self.assertEqual(len(summary), 1) + self.assertTrue(summary[0]["result"]["data"]["cleanup_failed"]) + # failed op still surfaces with succeeded=false + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(ops), 1) + self.assertFalse(ops[0]["result"]["data"]["succeeded"]) + self.assertIn("OperationNotAllowed", ops[0]["result"]["data"]["error"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_no_timing_file_means_no_node_churn_rows(self): + """Non-target clusters (and non-node-churn scenarios) skip writing + the timing file → no NodeChurnSummary / NodeChurnOpTiming rows.""" + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-2"), + cloud_info="", + run_id="nc-test-no-timing", + run_url="", + result_file=result_file, + test_type="node-churn-scale", + start_timestamp="2026-05-13T20:00:00Z", + cluster_name="mesh-2", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + summary = [r for r in lines if r.get("measurement") == "NodeChurnSummary"] + ops = [r for r in lines if r.get("measurement") == "NodeChurnOpTiming"] + self.assertEqual(len(summary), 0) + self.assertEqual(len(ops), 0) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestWriteReadySentinelScript(unittest.TestCase): + """write-ready-sentinel.sh derives a unique context per CL2 invocation + and writes a non-empty sentinel filename. Build 67114 regression: the + original inline `bash -c` Method:Exec returned an empty context name, + causing both clusters to write the same path (ready-) and one to + overwrite the other → barrier saw 1/2 → scenario aborted. + + The fix relies on parsing /root/.kube/config directly (CL2 bind-mounts + the per-cluster kubeconfig there). 
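+ A minimal Python sketch of that parse step (illustrative only, not
+ the script's literal implementation):
+ ctx = next((l.split(":", 1)[1].strip() for l in open(kubeconfig)
+ if l.startswith("current-context:")), "")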
These tests confirm the resolution + chain (kubeconfig-parse > kubectl-PATH > kubectl-prestaged > server-hash + > hostname > pid-fallback) and that the sentinel filename always has + a non-empty suffix. + """ + + SCRIPT_PATH = ( + Path(__file__).resolve().parents[1] + / "clusterloader2" / "clustermesh-scale" / "config" / "write-ready-sentinel.sh" + ) + + def _run_with_kubeconfig(self, kubeconfig_content, td): + import subprocess + kubeconfig = os.path.join(td, "kubeconfig") + with open(kubeconfig, "w", encoding="utf-8") as f: + f.write(kubeconfig_content) + sentinel_dir = os.path.join(td, "sentinels") + os.makedirs(sentinel_dir, exist_ok=True) + env = os.environ.copy() + env["KUBECONFIG"] = kubeconfig + result = subprocess.run( + ["bash", str(self.SCRIPT_PATH), sentinel_dir], + capture_output=True, text=True, env=env, check=False, + timeout=10, + ) + return result, sentinel_dir + + def test_kubeconfig_parse_resolves_current_context(self): + kc = ( + "apiVersion: v1\n" + "clusters:\n" + "- cluster:\n" + " server: https://test1.example.com:443\n" + " name: clustermesh-1\n" + "contexts:\n" + "- context:\n" + " cluster: clustermesh-1\n" + " name: clustermesh-1\n" + "current-context: clustermesh-1\n" + ) + with tempfile.TemporaryDirectory() as td: + result, sentinel_dir = self._run_with_kubeconfig(kc, td) + self.assertEqual(result.returncode, 0, f"stderr={result.stderr}") + files = os.listdir(sentinel_dir) + self.assertEqual(files, ["ready-clustermesh-1"]) + self.assertIn("via kubeconfig-parse", result.stderr) + + def test_different_kubeconfigs_yield_distinct_sentinels(self): + """Build 67114 regression: two clusters MUST NOT write the same + sentinel path (otherwise the second's write silently overwrites + the first, breaking the quorum count).""" + kc1 = "current-context: clustermesh-1\n" + kc2 = "current-context: clustermesh-2\n" + with tempfile.TemporaryDirectory() as td1, tempfile.TemporaryDirectory() as td2: + r1, sd1 = self._run_with_kubeconfig(kc1, td1) + r2, sd2 = self._run_with_kubeconfig(kc2, td2) + self.assertEqual(r1.returncode, 0) + self.assertEqual(r2.returncode, 0) + self.assertEqual(os.listdir(sd1), ["ready-clustermesh-1"]) + self.assertEqual(os.listdir(sd2), ["ready-clustermesh-2"]) + + def test_empty_current_context_falls_back_to_server_hash(self): + """If current-context line is missing/blank, fall back to a hash of + the server URL. 
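+ (The digest algorithm itself is deliberately unpinned — the
+ assertions only require distinct, non-empty suffixes for distinct
+ server URLs.)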
Two different servers MUST yield different hashes.""" + kc1 = ( + "apiVersion: v1\n" + "clusters:\n" + "- cluster:\n" + " server: https://serverA.example.com:443\n" + " name: foo\n" + ) + kc2 = ( + "apiVersion: v1\n" + "clusters:\n" + "- cluster:\n" + " server: https://serverB.example.com:443\n" + " name: foo\n" + ) + with tempfile.TemporaryDirectory() as td1, tempfile.TemporaryDirectory() as td2: + r1, sd1 = self._run_with_kubeconfig(kc1, td1) + r2, sd2 = self._run_with_kubeconfig(kc2, td2) + self.assertEqual(r1.returncode, 0) + self.assertEqual(r2.returncode, 0) + f1 = os.listdir(sd1)[0] + f2 = os.listdir(sd2)[0] + self.assertNotEqual(f1, f2, + f"server-hash collision: {f1} == {f2}") + + def test_sentinel_filename_always_non_empty_suffix(self): + """Whatever the resolution path, the sentinel filename suffix is + never empty (avoids the build 67114 path-collision regression).""" + kc = "" + with tempfile.TemporaryDirectory() as td: + r, sd = self._run_with_kubeconfig(kc, td) + self.assertEqual(r.returncode, 0, f"stderr={r.stderr}") + files = os.listdir(sd) + self.assertEqual(len(files), 1) + self.assertNotEqual(files[0], "ready-", + "sentinel filename has empty suffix — build 67114 regression") + self.assertTrue(files[0].startswith("ready-")) + self.assertGreater(len(files[0]), len("ready-")) + + +class TestNodeChurnerScript(unittest.TestCase): + """node-churner.sh smoke tests — bash -n syntax + arg validation. The + script's full Azure CLI behavior cannot be unit-tested without mocking + the cloud, but its argparse-equivalent + missing-binary fail-soft path + can. + """ + + SCRIPT_PATH = ( + Path(__file__).resolve().parents[1] + / "clusterloader2" / "clustermesh-scale" / "config" / "node-churner.sh" + ) + + def test_script_exists_and_is_executable(self): + self.assertTrue(self.SCRIPT_PATH.exists(), + f"{self.SCRIPT_PATH} should exist") + self.assertTrue( + os.access(self.SCRIPT_PATH, os.X_OK), + f"{self.SCRIPT_PATH} must be executable", + ) + + def test_script_bash_syntax(self): + import subprocess + result = subprocess.run( + ["bash", "-n", str(self.SCRIPT_PATH)], + capture_output=True, text=True, check=False, + ) + self.assertEqual(result.returncode, 0, + f"bash -n failed: stderr={result.stderr}") + + def test_script_aborts_softly_when_az_missing(self): + """When `az` CLI isn't on PATH, the script writes a timing file with + scenario_valid=false instead of erroring out (so execute.yml's + share-infra loop continues to subsequent scenarios with clean data). + """ + import subprocess + with tempfile.TemporaryDirectory() as tmp: + report_dir = os.path.join(tmp, "report") + sentinel_dir = os.path.join(tmp, "sentinels") + os.makedirs(report_dir, exist_ok=True) + os.makedirs(sentinel_dir, exist_ok=True) + env = os.environ.copy() + env["PATH"] = "/usr/bin:/bin" # strip out any az + result = subprocess.run( + [ + "bash", str(self.SCRIPT_PATH), + "node-churn-scale", # scenario + "clustermesh-1", # target cluster name + "test-rg", # target rg + "default", # target nodepool + report_dir, # report dir + sentinel_dir, # sentinel dir + "2", # cluster count + "1", "1", "1", "1", "30", "60", # remaining knobs + ], + capture_output=True, text=True, env=env, check=False, + timeout=30, + ) + # Soft-fail contract: exit 0 even when az is missing. 
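+ # The three checks below pin that contract end-to-end: rc==0, the
+ # timing file still written, and scenario_valid=false so collect
+ # records the aborted attempt as invalid rather than missing data.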
+ self.assertEqual(result.returncode, 0, + f"expected soft-fail (rc=0); got rc={result.returncode}, " + f"stderr={result.stderr}") + timing_file = os.path.join(report_dir, "NodeChurnTimings_clustermesh-1.json") + self.assertTrue(os.path.exists(timing_file), + "timing file should still be written on soft-fail") + with open(timing_file, "r", encoding="utf-8") as f: + data = json.load(f) + self.assertFalse(data["scenario_valid"], + "scenario_valid must be false when az is missing") + class TestCollectSingleCluster(unittest.TestCase): """collect_clusterloader2 emits one JSONL row per call, tagged with cluster identity.""" @@ -221,6 +1158,135 @@ def test_collect_propagates_test_type(self): if os.path.exists(result_file): os.remove(result_file) + def test_collect_records_pod_churn_knobs(self): + """Phase 4a — pod-churn scenarios record churn knobs on every row. + + Spec line 67 ("CPU/memory growth over time") requires historical + comparison across runs with potentially-different churn parameters. + Recording the knobs on the row means a future query for + ``churn_cycles==5 AND kill_batch==5`` returns only directly-comparable + rows. Non-churn test_types default to 0/"" — Kusto-friendly nulls. + """ + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-1"), + cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}), + run_id="test-run-churn", + run_url="http://example.com/runchurn", + result_file=result_file, + test_type="pod-churn-scale", + start_timestamp="2026-04-28T15:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + churn_cycles=5, + churn_up_duration="60s", + churn_down_duration="60s", + kill_duration_seconds=600, + kill_interval_seconds=10, + kill_batch=5, + ) + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + # Top-level fields — Kusto column convenience. + self.assertEqual(row["churn_cycles"], 5) + self.assertEqual(row["kill_duration_seconds"], 600) + self.assertEqual(row["kill_interval_seconds"], 10) + self.assertEqual(row["kill_batch"], 5) + # Nested in test_details for richer queries. + details = row["test_details"] + self.assertEqual(details["churn_cycles"], 5) + self.assertEqual(details["churn_up_duration"], "60s") + self.assertEqual(details["churn_down_duration"], "60s") + self.assertEqual(details["kill_duration_seconds"], 600) + self.assertEqual(details["kill_interval_seconds"], 10) + self.assertEqual(details["kill_batch"], 5) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_pod_churn_knobs_default_to_zero_for_non_churn_runs(self): + """Non-churn collect calls omit the churn knobs; defaults must be 0/"" + so the JSONL row is still schema-stable for Kusto (no missing fields). 
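+ e.g. an event-throughput row still carries churn_cycles=0,
+ kill_batch=0 and churn_up_duration="" rather than omitting the keys.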
+ """ + result_file = self._collect(cluster_name="mesh-1", test_type="event-throughput") + try: + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + self.assertEqual(row["churn_cycles"], 0) + self.assertEqual(row["kill_duration_seconds"], 0) + self.assertEqual(row["kill_interval_seconds"], 0) + self.assertEqual(row["kill_batch"], 0) + self.assertEqual(row["test_details"]["churn_up_duration"], "") + self.assertEqual(row["test_details"]["churn_down_duration"], "") + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_skips_any_subdir_under_report_dir(self): + """process_cl2_reports open()s every dir entry, so ANY subdir trips it. + + Today only logs/ exists (pod log capture from run-cl2-on-cluster.sh). + Tomorrow could be phase-logs/ from a CL2 version bump, additional + diag dumps, etc. collect_clusterloader2 must stash every subdir + outside the report dir during the parse and restore each one + afterward so the pipeline-level artifact publish still picks them up. + """ + src = os.path.join(MOCK_REPORT_ROOT, "mesh-1") + with tempfile.TemporaryDirectory() as tmp: + report_dir = os.path.join(tmp, "mesh-1") + shutil.copytree(src, report_dir) + # mesh-1 fixture already ships logs/; add two more synthetic + # subdirs to lock in the "skip ALL subdirs" contract. + extra_subdirs = { + "phase-logs": "phase-0.log", + "diag-dump": "events.txt", + } + for sd, fname in extra_subdirs.items(): + sd_path = os.path.join(report_dir, sd) + os.makedirs(sd_path, exist_ok=True) + with open(os.path.join(sd_path, fname), "w", encoding="utf-8") as f: + f.write(f"synthetic {sd}/{fname}\n") + + result_file = tempfile.mktemp(suffix=".jsonl") + try: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}), + run_id="test-run-subdirs", + run_url="http://example.com/runsubdirs", + result_file=result_file, + test_type="unit-test", + start_timestamp="2026-04-28T15:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + trigger_reason="Manual", + ) + self.assertTrue(os.path.exists(result_file)) + with open(result_file, "r", encoding="utf-8") as f: + self.assertGreater(len(f.read()), 0) + # All three subdirs (mock logs/ + 2 synthetic) restored + # at original location with contents intact. + self.assertTrue(os.path.isdir(os.path.join(report_dir, "logs"))) + for sd, fname in extra_subdirs.items(): + self.assertTrue(os.path.isdir(os.path.join(report_dir, sd)), + f"{sd}/ missing after collect") + nested = os.path.join(report_dir, sd, fname) + self.assertTrue(os.path.isfile(nested), + f"{nested} missing after collect") + finally: + if os.path.exists(result_file): + os.remove(result_file) + class TestCollectMultiCluster(unittest.TestCase): """The multi-cluster aggregation invariant — the reason this scenario exists. 
@@ -339,7 +1405,34 @@ def test_configure_command_parsing(self, mock_configure): ] with patch.object(sys, "argv", test_args): main() - mock_configure.assert_called_once_with(2, 3, 4, "20m", "/tmp/overrides.yaml") + mock_configure.assert_called_once_with( + 2, 3, 4, "20m", "/tmp/overrides.yaml", + churn_cycles=5, + churn_up_duration="60s", + churn_down_duration="60s", + kill_duration="10m", + kill_interval_seconds=10, + kill_batch=5, + kill_duration_seconds=600, + kill_job_deadline_seconds=660, + apiserver_kill_target_context="clustermesh-1", + apiserver_kill_recovery_timeout_seconds=240, + apiserver_kill_observation_seconds=60, + ha_config_replicas=3, + node_churn_target_context="clustermesh-1", + node_churn_cycles=3, + node_churn_delta=5, + node_churn_settle_seconds=60, + node_churn_scale_duration_seconds=1800, + node_churn_replace_duration_seconds=1500, + node_churn_combined_duration_seconds=3300, + node_replace_batch_size=10, + node_churn_ready_timeout_seconds=300, + saturation_qps_list="100,500,1500,4000,10000", + saturation_restarts_list="2,4,8,15,25", + saturation_rung_duration_seconds=240, + saturation_settle_seconds=90, + ) @patch.object(clustermesh_scale_module, "execute_clusterloader2") def test_execute_command_parsing(self, mock_execute): @@ -363,6 +1456,7 @@ def test_execute_command_parsing(self, mock_execute): "config.yaml", "/path/to/kubeconfig", "aks", + tear_down_prometheus=False, ) @patch.object(clustermesh_scale_module, "collect_clusterloader2") @@ -403,7 +1497,1182 @@ def test_collect_command_parsing(self, mock_collect): 1, 1, "Manual", + churn_cycles=0, + churn_up_duration="", + churn_down_duration="", + kill_duration_seconds=0, + kill_interval_seconds=0, + kill_batch=0, + saturation_qps_list="", + saturation_restarts_list="", + ) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_command_parsing(self, mock_exec_parallel): + """`execute-parallel` subcommand wires CLI args through and exits with returned rc.""" + mock_exec_parallel.return_value = 0 + test_args = [ + "clustermesh-scale/scale.py", + "execute-parallel", + "--clusters", "/tmp/clusters.json", + "--max-concurrent", "3", + "--worker-script", "/path/to/run-cl2-on-cluster.sh", + "--cl2-image", "ghcr.io/azure/clusterloader2:v20250513", + "--cl2-config-dir", "/path/to/config", + "--cl2-config-file", "config.yaml", + "--cl2-report-dir-base", "/path/to/results", + "--provider", "aks", + "--python-script-file", "/path/to/scale.py", + "--python-workdir", "/path/to/modules/python", + ] + with patch.object(sys, "argv", test_args): + with self.assertRaises(SystemExit) as cm: + main() + self.assertEqual(cm.exception.code, 0) + mock_exec_parallel.assert_called_once_with( + clusters_file="/tmp/clusters.json", + max_concurrent=3, + worker_script="/path/to/run-cl2-on-cluster.sh", + cl2_image="ghcr.io/azure/clusterloader2:v20250513", + cl2_config_dir="/path/to/config", + cl2_config_file="config.yaml", + cl2_report_dir_base="/path/to/results", + provider="aks", + python_script_file="/path/to/scale.py", + python_workdir="/path/to/modules/python", + tear_down_prometheus=False, + ) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_default_max_concurrent_is_4(self, mock_exec_parallel): + """Default --max-concurrent matches the plan.md Phase 3 spec value (4).""" + mock_exec_parallel.return_value = 0 + test_args = [ + "clustermesh-scale/scale.py", + "execute-parallel", + "--clusters", "/tmp/c.json", + "--worker-script", "/w.sh", + "--cl2-image", "img", + 
"--cl2-config-dir", "/cfg", + "--cl2-config-file", "config.yaml", + "--cl2-report-dir-base", "/r", + "--provider", "aks", + "--python-script-file", "/s.py", + "--python-workdir", "/wd", + ] + with patch.object(sys, "argv", test_args): + with self.assertRaises(SystemExit): + main() + self.assertEqual(mock_exec_parallel.call_args.kwargs["max_concurrent"], 4) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_propagates_nonzero_exit(self, mock_exec_parallel): + """If execute_parallel returns nonzero, main() exits nonzero so the AzDO step fails.""" + mock_exec_parallel.return_value = 1 + test_args = [ + "clustermesh-scale/scale.py", + "execute-parallel", + "--clusters", "/tmp/c.json", + "--worker-script", "/w.sh", + "--cl2-image", "img", + "--cl2-config-dir", "/cfg", + "--cl2-config-file", "config.yaml", + "--cl2-report-dir-base", "/r", + "--provider", "aks", + "--python-script-file", "/s.py", + "--python-workdir", "/wd", + ] + with patch.object(sys, "argv", test_args): + with self.assertRaises(SystemExit) as cm: + main() + self.assertEqual(cm.exception.code, 1) + + @patch.object(clustermesh_scale_module, "execute_parallel") + def test_execute_parallel_tear_down_prometheus_flag(self, mock_exec_parallel): + """--tear-down-prometheus flag flows through to execute_parallel. + + Used by share-infra mode (multiple scenarios per provision/destroy + lifecycle) so each scenario's CL2 invocation deploys a fresh + Prometheus stack rather than colliding with the previous scenario's + leftover Prom resources. + """ + mock_exec_parallel.return_value = 0 + test_args_off = [ + "clustermesh-scale/scale.py", "execute-parallel", + "--clusters", "/tmp/c.json", "--worker-script", "/w.sh", + "--cl2-image", "img", "--cl2-config-dir", "/cfg", + "--cl2-config-file", "config.yaml", "--cl2-report-dir-base", "/r", + "--provider", "aks", "--python-script-file", "/s.py", "--python-workdir", "/wd", + ] + with patch.object(sys, "argv", test_args_off): + with self.assertRaises(SystemExit): + main() + self.assertEqual( + mock_exec_parallel.call_args.kwargs["tear_down_prometheus"], False) + + mock_exec_parallel.reset_mock() + with patch.object(sys, "argv", test_args_off + ["--tear-down-prometheus"]): + with self.assertRaises(SystemExit): + main() + self.assertEqual( + mock_exec_parallel.call_args.kwargs["tear_down_prometheus"], True) + + +class _FakePopen: + """Test double for subprocess.Popen used in execute_parallel tests. + + Records construction args, fakes a streamable stdout, sleeps inside wait() + to force temporal overlap (so concurrency tests can observe max_active), + and decrements an active counter on wait so the parent observes correct + in-flight counts. + + Class attributes (lock, counters, instances) are intentionally public — + the class itself is "private" via the leading underscore, and tests + inspect this state directly to assert concurrency invariants. + """ + + # Class-level state mutated across instances by the test runner. 
+ lock = threading.Lock() + active_now = 0 + max_active = 0 + instances = [] # list of FakePopen instances created + wait_seconds = 0.05 # how long each fake CL2 "runs" in wait() + # Per-role configuration: role -> (stdout_lines, exit_code) + role_config = {} + default_exit = 0 + default_stdout = [] + + @classmethod + def reset(cls, *, wait_seconds=0.05, role_config=None, + default_stdout=None, default_exit=0): + cls.active_now = 0 + cls.max_active = 0 + cls.instances = [] + cls.wait_seconds = wait_seconds + cls.role_config = role_config or {} + cls.default_stdout = default_stdout or [] + cls.default_exit = default_exit + + def __init__(self, args, **kwargs): + # args is e.g. ["bash", worker_script, role, kubeconfig, ...] + self.args = args + self.kwargs = kwargs + self.returncode = None + self.role = args[2] if len(args) >= 3 else None + lines, exit_code = self.__class__.role_config.get( + self.role, (self.__class__.default_stdout, self.__class__.default_exit) + ) + # Provide an iterator over the staged lines so `for line in proc.stdout` + # in _run_one_cluster yields them once. + self.stdout = iter(lines) + self.exit_code = exit_code + with self.__class__.lock: + self.__class__.instances.append(self) + self.__class__.active_now += 1 + self.__class__.max_active = max( + self.__class__.max_active, self.__class__.active_now + ) + + def wait(self, timeout=None): # pylint: disable=unused-argument + # Sleep so peer workers have a chance to enter wait() concurrently. + # Without this overlap window, the test couldn't distinguish parallel + # execution from sequential. + time.sleep(self.__class__.wait_seconds) + with self.__class__.lock: + self.__class__.active_now -= 1 + self.returncode = self.exit_code + return self.exit_code + + def terminate(self): + # No-op for tests — execute_parallel only terminates on signal, + # which we don't trigger from these tests. + pass + + +class TestExecuteParallel(unittest.TestCase): + """execute_parallel fans out CL2 across N clusters with bounded concurrency. + + Validates the contract per plan.md Phase 3: bounded concurrent CL2 + invocations, per-cluster pass/fail aggregation, AzDO ##vso service + messages preserved without [role] prefix, sensible validation errors. + """ + + def setUp(self): + # Replace signal install with a no-op — installing real handlers in + # unit tests can interact badly with pytest's signal handling. 
+ self._signal_patcher = patch.object( + clustermesh_scale_module, "_install_parallel_signal_handlers", lambda: None + ) + self._signal_patcher.start() + + def tearDown(self): + self._signal_patcher.stop() + + def _write_clusters(self, clusters): + path = tempfile.mktemp(suffix=".json") + with open(path, "w", encoding="utf-8") as f: + json.dump(clusters, f) + return path + + def _call_execute_parallel(self, clusters_file, max_concurrent=4): + return clustermesh_scale_module.execute_parallel( + clusters_file=clusters_file, + max_concurrent=max_concurrent, + worker_script="/path/to/run-cl2-on-cluster.sh", + cl2_image="img", + cl2_config_dir="/cfg", + cl2_config_file="config.yaml", + cl2_report_dir_base="/r", + provider="aks", + python_script_file="/scale.py", + python_workdir="/wd", + ) + + def test_dispatches_one_subprocess_per_cluster(self): + """N clusters → N Popen calls, each carrying that cluster's role + kubeconfig.""" + clusters = [ + {"role": "mesh-1", "kubeconfig": "/home/.kube/mesh-1.config"}, + {"role": "mesh-2", "kubeconfig": "/home/.kube/mesh-2.config"}, + {"role": "mesh-3", "kubeconfig": "/home/.kube/mesh-3.config"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + self.assertEqual(len(_FakePopen.instances), 3) + # Each invocation passes role + kubeconfig in the bash worker arg + # vector. args layout: ["bash", worker_script, role, kubeconfig, + # report_dir, cl2_image, cl2_config_dir, cl2_config_file, provider, + # python_script_file, python_workdir] + roles_seen = {p.args[2] for p in _FakePopen.instances} + self.assertEqual(roles_seen, {"mesh-1", "mesh-2", "mesh-3"}) + for p in _FakePopen.instances: + role = p.args[2] + self.assertEqual(p.args[3], f"/home/.kube/{role}.config") + # report_dir is base/role + self.assertEqual(p.args[4], f"/r/{role}") + finally: + os.remove(cf) + + def test_all_zero_exit_codes_yield_overall_success(self): + """If every per-cluster worker exits 0, execute_parallel returns 0.""" + clusters = [ + {"role": "mesh-1", "kubeconfig": "/k1"}, + {"role": "mesh-2", "kubeconfig": "/k2"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0, default_exit=0) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + finally: + os.remove(cf) + + def test_any_nonzero_exit_yields_overall_failure(self): + """If ANY per-cluster worker exits non-zero, execute_parallel returns 1. + + Mirrors the sequential bash behavior (`if failures > 0; exit 1`) so + the AzDO step's pass/fail signal is unchanged from before parallel + fan-out. Other clusters still complete (no early cancellation). + """ + clusters = [ + {"role": "mesh-1", "kubeconfig": "/k1"}, + {"role": "mesh-2", "kubeconfig": "/k2"}, + {"role": "mesh-3", "kubeconfig": "/k3"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset( + wait_seconds=0, + role_config={ + "mesh-1": ([], 0), + "mesh-2": ([], 1), # this one fails + "mesh-3": ([], 0), + }, + ) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 1) + # All three workers ran — failure of one does NOT cancel the others. 
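+ # (role_config staged mesh-2 with exit code 1 while mesh-1/mesh-3
+ # exit 0, so rc==1 above proves the OR-style aggregation.)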
+ self.assertEqual(len(_FakePopen.instances), 3) + finally: + os.remove(cf) + + def test_respects_max_concurrent_bound(self): + """No more than max_concurrent workers are in-flight simultaneously. + + Uses a barrier-free approach: each FakePopen sleeps in wait(); we + observe the running max_active count maintained inside FakePopen. + Asserts max_active <= max_concurrent regardless of timing — no + ordering or wall-clock assertion (which would be flaky under CI load). + """ + clusters = [{"role": f"mesh-{i}", "kubeconfig": f"/k{i}"} for i in range(8)] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0.05) # 50ms per "CL2 run" + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf, max_concurrent=3) + self.assertEqual(rc, 0) + self.assertEqual(len(_FakePopen.instances), 8) + # The bound is the contract: never more than 3 concurrent CL2 + # docker containers from this orchestrator at once. + self.assertLessEqual(_FakePopen.max_active, 3) + # Sanity: with 8 work items and 50ms each, we WILL see >1 in + # flight — otherwise the test would pass trivially with a + # single-threaded executor. + self.assertGreater(_FakePopen.max_active, 1) + finally: + os.remove(cf) + + def test_prefixes_role_but_preserves_vso_service_messages(self): + """Worker stdout lines get [role] prefix; ##vso AzDO messages stay verbatim. + + AzDO recognizes ##vso[...] service messages only at column 0 — a + [role] prefix would silently drop the structured annotation + (warnings, errors, set-variable). Regression-guard: if the prefix + logic ever changes, this test breaks loudly. + """ + clusters = [{"role": "mesh-1", "kubeconfig": "/k1"}] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset( + wait_seconds=0, + role_config={ + "mesh-1": ([ + "hello world\n", + "##vso[task.logissue type=warning;]something\n", + "more text\n", + ], 0), + }, + ) + buf = io.StringIO() + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + with redirect_stdout(buf): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + captured = buf.getvalue() + # Non-vso lines are prefixed with [role]. + self.assertIn("[mesh-1] hello world", captured) + self.assertIn("[mesh-1] more text", captured) + # vso line MUST NOT be prefixed. + self.assertIn("##vso[task.logissue type=warning;]something", captured) + self.assertNotIn("[mesh-1] ##vso", captured) + finally: + os.remove(cf) + + def test_empty_clusters_file_raises(self): + """A clusters file with [] is invalid — fail fast, don't silently no-op.""" + cf = self._write_clusters([]) + try: + with self.assertRaises(ValueError): + self._call_execute_parallel(cf) + finally: + os.remove(cf) + + def test_cluster_missing_kubeconfig_raises(self): + """Each cluster object must carry both 'role' and 'kubeconfig'.""" + cf = self._write_clusters([{"role": "mesh-1"}]) + try: + with self.assertRaises(ValueError): + self._call_execute_parallel(cf) + finally: + os.remove(cf) + + def test_max_concurrent_zero_raises(self): + """max_concurrent < 1 is meaningless and would deadlock the executor.""" + cf = self._write_clusters([{"role": "mesh-1", "kubeconfig": "/k1"}]) + try: + with self.assertRaises(ValueError): + self._call_execute_parallel(cf, max_concurrent=0) + finally: + os.remove(cf) + + def test_extra_fields_in_cluster_object_are_ignored(self): + """Pipeline writes name/rg/kubeconfig/role; execute_parallel must tolerate extras. 
+ + Same JSON file is consumed by collect.yml (which uses name/rg/role), + so execute_parallel must NOT reject the additional fields. + """ + clusters = [ + {"role": "mesh-1", "kubeconfig": "/k1", "name": "aks-1", "rg": "rg-1"}, + {"role": "mesh-2", "kubeconfig": "/k2", "name": "aks-2", "rg": "rg-2"}, + ] + cf = self._write_clusters(clusters) + try: + _FakePopen.reset(wait_seconds=0) + with patch.object(clustermesh_scale_module.subprocess, "Popen", _FakePopen): + rc = self._call_execute_parallel(cf) + self.assertEqual(rc, 0) + self.assertEqual(len(_FakePopen.instances), 2) + finally: + os.remove(cf) + + +# ============================================================================ +# Phase 4b — Scenario #6 (Upper Bound / Saturation) tests +# ============================================================================ + + +SATURATION_THRESHOLDS = clustermesh_scale_module.SATURATION_THRESHOLDS +SATURATION_CLASSIFIER_VERSION = clustermesh_scale_module.SATURATION_CLASSIFIER_VERSION + + +def _write_metric_file(report_dir, metric_name, suffix, metrics, fmt="prod", shape="cl2"): + """Write a CL2-shaped GenericPrometheusQuery JSON. + + Two AXES of variation: + + **Filename format** (`fmt`): + "prod" — build 67211+ production filename format: + `GenericPrometheusQuery __.json` + "compact" — legacy/mock filename with no spaces: + `GenericPrometheusQuery___.json` + + **Content shape** (`shape`): + "cl2" — build 67224 verified — one dataItem with named metric keys + in `data`, scalar values: + {"dataItems": [{"data": {"Max": 0, "Perc99": 0.5}, "unit": "#"}]} + "labels" — legacy / PodStartupLatency-style — one dataItem per + metric label, with `data.value` carrying the scalar: + {"dataItems": [{"labels": {"Metric": "Perc99"}, "data": {"value": 0.5}}]} + + Defaults to fmt="prod", shape="cl2" — what real CL2 emits today. + """ + if fmt == "prod": + fname = ( + f"GenericPrometheusQuery {metric_name} {suffix}_" + f"saturation-test_2026-05-14T00:00:00Z.json" + ) + elif fmt == "compact": + compact = metric_name.replace(" ", "") + fname = ( + f"GenericPrometheusQuery_{compact}{suffix}_" + f"saturation-test_2026-05-14T00:00:00Z.json" + ) + else: + raise ValueError(f"unknown fmt: {fmt!r}") + if shape == "cl2": + data_items = [{"data": dict(metrics), "unit": "#"}] + elif shape == "labels": + data_items = [ + {"labels": {"Metric": label}, "data": {"value": value}} + for label, value in metrics.items() + ] + else: + raise ValueError(f"unknown shape: {shape!r}") + path = os.path.join(report_dir, fname) + with open(path, "w", encoding="utf-8") as f: + json.dump({"version": "v1", "dataItems": data_items}, f) + return path + + +class TestConfigureSaturationKnobs(unittest.TestCase): + """Phase 4b — Scenario #6 saturation overrides flow through + configure_clusterloader2 and land in the CL2 overrides file with the + expected CL2_SATURATION_* keys. 
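+ e.g. the defaults are expected to serialize as:
+ CL2_SATURATION_QPS_LIST: "100,500,1500,4000,10000"
+ CL2_SATURATION_RUNG_DURATION_SECONDS: 240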
+ """ + + def test_saturation_defaults_emitted(self): + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn('CL2_SATURATION_QPS_LIST: "100,500,1500,4000,10000"', content) + self.assertIn('CL2_SATURATION_RESTARTS_LIST: "2,4,8,15,25"', content) + self.assertIn("CL2_SATURATION_RUNG_DURATION_SECONDS: 240", content) + self.assertIn("CL2_SATURATION_SETTLE_SECONDS: 90", content) + finally: + os.remove(tmp_path) + + def test_saturation_overrides_passthrough(self): + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="15m", + override_file=tmp_path, + saturation_qps_list="50,100,200,400,800", + saturation_restarts_list="1,1,2,3,5", + saturation_rung_duration_seconds=240, + saturation_settle_seconds=90, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + self.assertIn('CL2_SATURATION_QPS_LIST: "50,100,200,400,800"', content) + self.assertIn('CL2_SATURATION_RESTARTS_LIST: "1,1,2,3,5"', content) + self.assertIn("CL2_SATURATION_RUNG_DURATION_SECONDS: 240", content) + self.assertIn("CL2_SATURATION_SETTLE_SECONDS: 90", content) + finally: + os.remove(tmp_path) + + def test_saturation_classifier_constants_exposed(self): + """SATURATION_THRESHOLDS + SATURATION_CLASSIFIER_VERSION must be + importable so dashboards (and these tests) can reference them. If + the schema changes, the version string must change too.""" + self.assertEqual(SATURATION_CLASSIFIER_VERSION, "saturation-v1") + for k in ( + "latency_p99_ms", "queue_size_perc99", "apiserver_max_cpu_cores", + "mesh_failure_rate_max", "etcd_commit_p99_ms", + ): + self.assertIn(k, SATURATION_THRESHOLDS) + self.assertGreater(SATURATION_THRESHOLDS[k], 0) + + +class TestSaturationClassifier(unittest.TestCase): + """Phase 4b — Scenario #6 classifier emits per-rung verdicts + + per-cluster summary rows. Synthetic per-rung mock data exercises + each verdict path. 
+ """ + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + self.report_dir = os.path.join(self.tmpdir, "mesh-1") + shutil.copytree(os.path.join(MOCK_REPORT_ROOT, "mesh-1"), self.report_dir) + self.result_file = tempfile.mktemp(suffix=".jsonl") + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + if os.path.exists(self.result_file): + os.remove(self.result_file) + + def _write_clean_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.020}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 5, "Perc99": 3}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.01}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.005}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 15}, + ) + + def _write_latency_tripped_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.900}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 10, "Perc99": 5}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.4, "TotalMax": 0.4, "TotalAvg": 0.3}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.02}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.010}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 50}, + ) + + def _write_queue_unbounded_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.100}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 8000, "Perc99": 5000}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.5, "TotalMax": 0.5, "TotalAvg": 0.4}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.02}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.020}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 200}, + ) + + def _write_cpu_exhaust_rung(self, rung): + suffix = f"Rung{rung}" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.200}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 50, "Perc99": 30}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 2.5, "TotalMax": 2.5, "TotalAvg": 2.0}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.05}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.050}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 80}, + ) + + def 
_run_collect(self, qps_list, restarts_list=None): + if restarts_list is None: + restarts_list = ",".join(["1"] * len(qps_list.split(","))) + collect_clusterloader2( + cl2_report_dir=self.report_dir, + cloud_info="", + run_id="sat-test", + run_url="", + result_file=self.result_file, + test_type="upper-bound", + start_timestamp="2026-05-14T00:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + saturation_qps_list=qps_list, + saturation_restarts_list=restarts_list, + ) + with open(self.result_file, "r", encoding="utf-8") as f: + return [json.loads(l) for l in f.read().strip().split("\n") if l] + + def test_classifier_no_op_when_qps_list_empty(self): + """Non-upper-bound runs leave saturation_qps_list empty → no + SaturationRung / SaturationSummary rows.""" + collect_clusterloader2( + cl2_report_dir=self.report_dir, + cloud_info="", + run_id="sat-noop", + run_url="", + result_file=self.result_file, + test_type="event-throughput", + start_timestamp="2026-05-14T00:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, + namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + ) + with open(self.result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + rungs = [r for r in lines if r.get("measurement") == "SaturationRung"] + summaries = [r for r in lines if r.get("measurement") == "SaturationSummary"] + self.assertEqual(len(rungs), 0) + self.assertEqual(len(summaries), 0) + + def test_all_clean_rungs_max_clean_qps_is_highest(self): + for r in range(3): + self._write_clean_rung(r) + lines = self._run_collect("20,40,80") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"] + self.assertEqual(len(rungs), 3) + self.assertEqual(len(summary), 1) + for r in rungs: + self.assertEqual(r["result"]["data"]["verdict"], "clean") + self.assertTrue(r["result"]["data"]["rung_completed"]) + self.assertEqual(r["result"]["data"]["measurement_missing"], []) + s = summary[0]["result"]["data"] + self.assertEqual(s["max_clean_qps"], 80) + self.assertEqual(s["rungs_completed"], 3) + self.assertEqual(s["rungs_configured"], 3) + self.assertIsNone(s["first_failure_rung_index"]) + self.assertIsNone(s["first_failure_mode"]) + self.assertEqual(s["classifier_version"], SATURATION_CLASSIFIER_VERSION) + + def test_latency_spike_verdict(self): + self._write_clean_rung(0) + self._write_latency_tripped_rung(1) + lines = self._run_collect("20,40") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + self.assertEqual(rungs[0]["result"]["data"]["verdict"], "clean") + self.assertEqual(rungs[1]["result"]["data"]["verdict"], "latency_spike") + self.assertAlmostEqual( + rungs[1]["result"]["data"]["dominant_signal_ratio"], 1.8, places=2, + ) + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["max_clean_qps"], 20) + self.assertEqual(s["first_failure_rung_index"], 1) + self.assertEqual(s["first_failure_qps"], 40) + self.assertEqual(s["first_failure_mode"], "latency_spike") + self.assertIsNone(s["second_failure_mode"]) + + def test_queue_unbounded_verdict(self): + 
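+ """Queue Perc99 5000 dominates — the asserted ratio of 5.0 implies
+ a 1000-entry queue_size_perc99 threshold."""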
self._write_clean_rung(0) + self._write_queue_unbounded_rung(1) + lines = self._run_collect("20,40") + rung1 = next( + r for r in lines + if r.get("measurement") == "SaturationRung" + and r["result"]["data"]["rung_index"] == 1 + ) + self.assertEqual(rung1["result"]["data"]["verdict"], "queue_unbounded") + self.assertAlmostEqual( + rung1["result"]["data"]["dominant_signal_ratio"], 5.0, places=2, + ) + + def test_cpu_exhaust_verdict(self): + self._write_clean_rung(0) + self._write_cpu_exhaust_rung(1) + lines = self._run_collect("20,40") + rung1 = next( + r for r in lines + if r.get("measurement") == "SaturationRung" + and r["result"]["data"]["rung_index"] == 1 + ) + self.assertEqual(rung1["result"]["data"]["verdict"], "cpu_exhaust") + self.assertAlmostEqual( + rung1["result"]["data"]["dominant_signal_ratio"], 2.5 / 1.5, + places=2, + ) + + def test_second_failure_mode_tracking(self): + """Rung 0 clean, rung 1 latency, rung 2 cpu_exhaust → first=latency_spike, + second=cpu_exhaust. Same-mode subsequent failures don't overwrite second.""" + self._write_clean_rung(0) + self._write_latency_tripped_rung(1) + self._write_cpu_exhaust_rung(2) + lines = self._run_collect("20,40,80") + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["first_failure_mode"], "latency_spike") + self.assertEqual(s["second_failure_mode"], "cpu_exhaust") + self.assertEqual(s["first_failure_qps"], 40) + + def test_max_clean_qps_is_contiguous_prefix(self): + """If a non-clean rung lands then a later 'clean' rung shows up, + max_clean_qps does NOT extend past the first failure.""" + self._write_clean_rung(0) + self._write_clean_rung(1) + self._write_latency_tripped_rung(2) + self._write_clean_rung(3) + lines = self._run_collect("20,40,80,160") + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["max_clean_qps"], 40) + self.assertEqual(s["first_failure_rung_index"], 2) + self.assertEqual(s["first_failure_mode"], "latency_spike") + + def test_missing_measurements_flag_incomplete_rung(self): + """If a rung's measurement files are missing, measurement_missing + lists the gaps. 
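+ (Only the latency file is written below; the queue, CPU,
+ failure-rate and etcd signals are deliberately absent and must
+ each be reported.)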
Latency present → rung_completed still true.""" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + "Rung0", {"Perc99": 0.020}, + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertTrue(d["rung_completed"]) + self.assertIn("queue_size_perc99", d["measurement_missing"]) + self.assertIn("apiserver_max_cpu_cores", d["measurement_missing"]) + self.assertIn("mesh_failure_rate_max", d["measurement_missing"]) + self.assertIn("etcd_commit_p99_ms", d["measurement_missing"]) + + def test_rung_completed_false_when_latency_missing(self): + """Latency is the gating signal — without it, rung is incomplete + regardless of how many other signals landed.""" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + "Rung0", {"Max": 5, "Perc99": 3}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + "Rung0", {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + "Rung0", {"Max": 0.01}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + "Rung0", {"Perc99": 0.005}, + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + "Rung0", {"Perc99": 15}, + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + self.assertFalse(rung["result"]["data"]["rung_completed"]) + self.assertIn("latency_p99_ms", rung["result"]["data"]["measurement_missing"]) + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + self.assertEqual(summary["result"]["data"]["rungs_completed"], 0) + + def test_summary_carries_classifier_metadata(self): + """SaturationSummary records classifier_version + thresholds so + dashboards can recompute verdicts post-hoc.""" + self._write_clean_rung(0) + lines = self._run_collect("20") + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["classifier_version"], SATURATION_CLASSIFIER_VERSION) + self.assertEqual(s["thresholds"], SATURATION_THRESHOLDS) + self.assertEqual(s["configured_qps_list"], [20]) + self.assertEqual(s["configured_restarts_list"], [1]) + + def test_rung_row_carries_raw_signal_values(self): + """SaturationRung records raw signal values + all per-criterion + ratios so the classifier can be re-run post-hoc at different + thresholds without re-collecting from CL2.""" + self._write_latency_tripped_rung(0) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 900.0, places=1) + self.assertAlmostEqual(d["signals"]["apiserver_max_cpu_cores"], 0.4, places=2) + self.assertIn("latency_spike", d["all_verdicts"]) + self.assertIn("cpu_exhaust", d["all_verdicts"]) + + def test_malformed_qps_list_skips_classifier_gracefully(self): + """Malformed CL2_SATURATION_QPS_LIST should not crash collect; the + classifier logs a warning and emits zero saturation rows.""" + self._write_latency_tripped_rung(0) + collect_clusterloader2( + cl2_report_dir=self.report_dir, + cloud_info="", + run_id="sat-malformed", + run_url="", + result_file=self.result_file, + test_type="upper-bound", + start_timestamp="2026-05-14T00:00:00Z", + cluster_name="mesh-1", + cluster_count=2, + mesh_size=2, 
+ namespaces=5, + deployments_per_namespace=4, + replicas_per_deployment=10, + trigger_reason="Manual", + saturation_qps_list="20,not-a-number,80", + saturation_restarts_list="1,2,3", + ) + with open(self.result_file, "r", encoding="utf-8") as f: + lines = [json.loads(l) for l in f.read().strip().split("\n") if l] + rungs = [r for r in lines if r.get("measurement") == "SaturationRung"] + summaries = [r for r in lines if r.get("measurement") == "SaturationSummary"] + self.assertEqual(len(rungs), 0) + self.assertEqual(len(summaries), 0) + + def test_restarts_list_padded_when_shorter_than_qps(self): + """If restarts_list is shorter than qps_list, missing entries + default to 1 so the classifier doesn't crash.""" + self._write_clean_rung(0) + self._write_clean_rung(1) + self._write_clean_rung(2) + lines = self._run_collect("20,40,80", restarts_list="1,2") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + self.assertEqual(rungs[0]["result"]["data"]["configured_restarts"], 1) + self.assertEqual(rungs[1]["result"]["data"]["configured_restarts"], 2) + self.assertEqual(rungs[2]["result"]["data"]["configured_restarts"], 1) + + def test_monitoring_oom_verdict_when_prom_dies_mid_run(self): + """Phase 4b — Scenario #6 monitoring_oom verdict (added 2026-05-15 + after build 67279). When an earlier rung successfully completed but + a later rung has zero signals, the most likely explanation is the + Prometheus stack OOM'ed under load. That IS a saturation finding + per spec line 113 ('Resource exhaustion occurs') so we record it + as verdict=monitoring_oom rather than silently leaving it as + verdict=clean rung_completed=False (which underclaims the failure). + """ + # Rung 0: clean (Prom alive, all signals land) + self._write_clean_rung(0) + # Rung 1: NOTHING — Prom crashed mid-run before its gather phase + # (no files written for this rung). Classifier should detect + # "previous rung had signals, this one doesn't → monitoring_oom". + lines = self._run_collect("20,40") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + self.assertEqual(rungs[0]["result"]["data"]["verdict"], "clean") + self.assertEqual(rungs[1]["result"]["data"]["verdict"], "monitoring_oom") + self.assertEqual(rungs[1]["result"]["data"]["dominant_signal_ratio"], 999.0) + self.assertFalse(rungs[1]["result"]["data"]["rung_completed"]) + # Summary records monitoring_oom as the first failure mode. + summary = [r for r in lines if r.get("measurement") == "SaturationSummary"][0] + s = summary["result"]["data"] + self.assertEqual(s["max_clean_qps"], 20) + self.assertEqual(s["first_failure_mode"], "monitoring_oom") + self.assertEqual(s["first_failure_qps"], 40) + + def test_monitoring_oom_not_emitted_when_no_prior_rung_completed(self): + """If even Rung 0 has zero signals, that's NOT monitoring_oom — + it's an upstream config / deployment problem (Prom never came up, + or scale.py was misconfigured). Stay at verdict=clean + rung_completed=False so postmortem investigates the right layer.""" + # Don't write any files. Every rung will have zero signals. + lines = self._run_collect("20,40") + rungs = sorted( + [r for r in lines if r.get("measurement") == "SaturationRung"], + key=lambda r: r["result"]["data"]["rung_index"], + ) + # Both rungs should be clean (not monitoring_oom) because no + # earlier rung established that Prom WAS working. 
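
Taken together, the two monitoring_oom tests pin down a small decision rule for all-signals-missing rungs. A minimal sketch of that rule follows; the function and argument names are assumptions for illustration, since the shipped logic lives in scale.py's collect path:

```python
# Sketch only (not the shipped scale.py classifier) of the rule the two
# monitoring_oom tests above pin down. `completed_by_rung` is the ordered
# list of rung_completed booleans for earlier rungs in this run.
def classify_empty_rung(rung_index, completed_by_rung):
    """Verdict for a rung whose signal dict came back completely empty."""
    if any(completed_by_rung[:rung_index]):
        # Prometheus demonstrably worked earlier in this run, so a later
        # zero-signal rung is most plausibly the monitoring stack OOMing
        # under load; that is itself a saturation finding.
        return {"verdict": "monitoring_oom",
                "dominant_signal_ratio": 999.0,
                "rung_completed": False}
    # No rung ever produced signals: point the postmortem at the
    # deployment/config layer instead of claiming saturation.
    return {"verdict": "clean", "rung_completed": False}
```

The assertion loop below exercises the no-prior-success branch on both rungs.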
+ for r in rungs: + self.assertNotEqual(r["result"]["data"]["verdict"], "monitoring_oom", + f"rung {r['result']['data']['rung_index']}: " + f"monitoring_oom should only fire after a " + f"prior rung completed") + self.assertEqual(r["result"]["data"]["verdict"], "clean") + self.assertFalse(r["result"]["data"]["rung_completed"]) + + def test_classifier_matches_build_67211_production_filename_format(self): + """REGRESSION: build 67211 (first n=2 upper-bound smoke 2026-05-14) + emitted measurement files in the format + 'GenericPrometheusQuery __.json' + but the classifier was matching the legacy compact format + 'GenericPrometheusQuery___.json' + → 0 files found, all 4 rungs classified as `clean` with 0 signals + despite all 20 signal files (5 signals × 4 rungs) being present on + disk. This test pins the production format so a future regression + fails locally instead of silently in CI. + """ + # Use fmt="prod" — production format with spaces. Default in + # _write_metric_file is also "prod" but explicit here for clarity. + suffix = "Rung0" + # Latency: 600ms p99 (above 500ms threshold) → should trip latency_spike + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.600}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 50, "Perc99": 30}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.5, "TotalMax": 0.5, "TotalAvg": 0.4}, + fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.05}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.020}, fmt="prod", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 30}, fmt="prod", + ) + # Verify the file on disk matches the build-67211 pattern exactly. + on_disk = sorted(os.listdir(self.report_dir)) + prod_pattern_files = [ + f for f in on_disk + if f.startswith("GenericPrometheusQuery ClusterMesh ") + and "Rung0_" in f + ] + self.assertGreaterEqual( + len(prod_pattern_files), 6, + f"production-format files not on disk; got: {prod_pattern_files}", + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + # Classifier must FIND the files (production format) and apply the + # verdict. Pre-fix: all signals would be `None`, verdict=`clean`, + # rung_completed=False. Post-fix: latency value lands → latency_spike. + self.assertTrue(d["rung_completed"], + f"rung must be completed; missing={d['measurement_missing']}") + self.assertEqual(d["measurement_missing"], [], + f"all 7 signals should land; missing={d['measurement_missing']}") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 600.0, places=1) + self.assertEqual(d["verdict"], "latency_spike") + + def test_classifier_accepts_legacy_compact_filename_format(self): + """The classifier supports BOTH production (space) and legacy + (compact-underscore) filename formats so test mocks/older CL2 + emissions don't silently fail. Pin both with this test.""" + suffix = "Rung0" + # Write the same set in COMPACT format (no spaces, underscore after + # GenericPrometheusQuery). 
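
For reference, a sketch of filename matching that tolerates both families; `find_metric_files` and its exact prefix handling are assumptions for illustration, since the shipped globbing lives in scale.py:

```python
import os

# Sketch: accept both observed filename families for one metric + rung
# suffix. Production (build 67211+) uses spaces after GenericPrometheusQuery;
# the legacy format joins everything with underscores.
def find_metric_files(report_dir, metric_name, suffix):
    prod = f"GenericPrometheusQuery {metric_name}"
    compact = "GenericPrometheusQuery_" + metric_name.replace(" ", "_")
    return [
        f for f in sorted(os.listdir(report_dir))
        if f.endswith(".json") and f"{suffix}_" in f
        and (f.startswith(prod) or f.startswith(compact))
    ]
```

The compact-format writes for the legacy test follow.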
+ _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.020}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 5, "Perc99": 3}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.01}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.005}, fmt="compact", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 15}, fmt="compact", + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertTrue(d["rung_completed"]) + self.assertEqual(d["verdict"], "clean") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 20.0, places=1) + + def test_classifier_reads_build_67224_cl2_content_shape(self): + """REGRESSION: build 67224 (2nd n=2 upper-bound smoke 2026-05-15) + emitted measurement file content in the CL2 GenericPrometheusQuery + shape — one dataItem with query results as named keys in `data`: + {"dataItems": [{"data": {"Max": 0, "Perc99": 0.5}, "unit": "#"}]} + not the legacy labels shape + {"dataItems": [{"labels": {"Metric": "Perc99"}, "data": {"value": 0.5}}]} + The classifier was reading via labels.Metric, missing every value. + Pin BOTH content shapes here so the bug can't regress. + """ + # shape="cl2" mirrors the actual on-disk content from build 67224. + suffix = "Rung0" + # Latency 600ms p99 (above 500ms threshold) → should trip latency_spike + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc50": 0.020, "Perc90": 0.300, "Perc99": 0.600}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 50, "Perc50": 10, "Perc99": 30}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"TotalMax": 0.5, "TotalAvg": 0.3, "PerPodMax": 0.5}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.05, "Perc50": 0.01}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc50": 0.003, "Perc90": 0.005, "Perc99": 0.020}, + fmt="prod", shape="cl2", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc50": 0, "Perc90": 5, "Perc99": 30, "TotalIncrease": 3000}, + fmt="prod", shape="cl2", + ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + # Pre-fix (build 67224): all signals returned None → verdict=clean + # rung_completed=False signals_found=0/7. Post-fix: every signal + # lands, latency trips threshold. 
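
Both content shapes are quoted verbatim in the docstring above; a shape-tolerant reader might look like the following sketch (the helper name is assumed, the shipped reader is in scale.py):

```python
import json

# Sketch: return the value for `key` (e.g. "Perc99") from either content
# shape, or None when absent.
def read_metric_value(path, key):
    with open(path, "r", encoding="utf-8") as f:
        doc = json.load(f)
    for item in doc.get("dataItems", []):
        data = item.get("data", {})
        # CL2 GenericPrometheusQuery shape: named keys directly in `data`.
        if key in data:
            return data[key]
        # Legacy labels shape: one dataItem per metric, value under "value".
        if item.get("labels", {}).get("Metric") == key:
            return data.get("value")
    return None
```

The assertions that follow verify exactly this: every signal lands and the latency value trips the threshold.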
+ self.assertTrue(d["rung_completed"], + f"rung must be completed; missing={d['measurement_missing']}") + self.assertEqual(d["measurement_missing"], [], + f"all 7 signals should land; missing={d['measurement_missing']}") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 600.0, places=1) + self.assertAlmostEqual(d["signals"]["queue_size_perc99"], 30.0, places=1) + self.assertAlmostEqual(d["signals"]["apiserver_max_cpu_cores"], 0.5, places=2) + self.assertAlmostEqual(d["signals"]["mesh_failure_rate_max"], 0.05, places=3) + self.assertEqual(d["verdict"], "latency_spike") + + def test_classifier_reads_legacy_labels_content_shape(self): + """Backward-compat: even though build 67224 uses the cl2 shape, + legacy mocks (and PodStartupLatency-format files) use a + per-metric-labels shape. The classifier must still read those so + existing mock fixtures don't break.""" + suffix = "Rung0" + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Operation Duration", + suffix, {"Perc99": 0.020}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Sync Queue Size", + suffix, {"Max": 5, "Perc99": 3}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh APIServer Pod CPU", + suffix, {"PerPodMax": 0.3, "TotalMax": 0.3, "TotalAvg": 0.2}, + fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Remote Cluster Failure Rate", + suffix, {"Max": 0.01}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Etcd Backend Write Duration", + suffix, {"Perc99": 0.005}, fmt="prod", shape="labels", + ) + _write_metric_file( + self.report_dir, "ClusterMesh Kvstore Events Rate", + suffix, {"Perc99": 15}, fmt="prod", shape="labels", ) + lines = self._run_collect("20") + rung = next(r for r in lines if r.get("measurement") == "SaturationRung") + d = rung["result"]["data"] + self.assertTrue(d["rung_completed"]) + self.assertEqual(d["verdict"], "clean") + self.assertAlmostEqual(d["signals"]["latency_p99_ms"], 20.0, places=1) + self.assertAlmostEqual(d["signals"]["queue_size_perc99"], 3.0, places=1) if __name__ == "__main__": diff --git a/modules/terraform/azure/aks-cli/main.tf b/modules/terraform/azure/aks-cli/main.tf index 687ca04e5b..2cf3016845 100644 --- a/modules/terraform/azure/aks-cli/main.tf +++ b/modules/terraform/azure/aks-cli/main.tf @@ -11,6 +11,41 @@ locals { pool.name => pool } + # Pre-built `az aks nodepool add` command per extra pool. Pulled into a + # local so the terraform_data.aks_nodepool_cli heredoc body stays readable + # (avoids a multi-line interpolation inside the bash retry-loop heredoc, + # which `terraform fmt` otherwise mangles). + extra_pool_commands = { + for pool in var.aks_cli_config.extra_node_pool : pool.name => join(" ", [ + "az", + "aks", + "nodepool", + "add", + "-g", var.resource_group_name, + "--cluster-name", var.aks_cli_config.aks_name, + "--nodepool-name", pool.name, + "--node-count", pool.node_count, + "--node-vm-size", pool.vm_size, + "--vm-set-type", pool.vm_set_type, + "--node-osdisk-type", pool.os_disk_type, + local.aks_custom_headers_flags, + # If the default pool uses --pod-subnet-id (Azure CNI dynamic IP + # allocation), AKS requires ALL agent pools to set it (or none). + # Without this, `az aks nodepool add` on extra pools fails with + # `InvalidParameter: All or none of the agentpools should set + # podsubnet`. Reuse the same pod subnet as the default pool — extra + # pools (e.g. 
prompool) host non-workload pods so the per-pool pod + # IP separation isn't meaningful here. + local.pod_subnet_id_parameter, + length(pool.optional_parameters) == 0 ? + "" : + join(" ", [ + for param in pool.optional_parameters : + format("--%s %s", param.name, param.value) + ]), + ]) + } + key_management_service = ( var.aks_cli_config.kms_config != null ) ? { @@ -333,34 +368,111 @@ resource "terraform_data" "aks_cli" { } } +# Gate any subsequent `az aks ...` operations (extra node pools, post-create +# updates) on the cluster reaching a stable provisioningState=Succeeded. +# +# Why this exists: `az aks create --enable-acns` (and similar addon flags +# like --enable-azure-monitor-metrics) kicks off a PutExtensionAddonHandler +# PUT operation that runs ASYNCHRONOUSLY after `az aks create` returns. While +# that operation is in flight, any downstream `az aks nodepool add` (e.g. our +# extra_node_pool / prompool) fails with: +# ERROR: (OperationNotAllowed) Operation is not allowed because there's an +# in progress PutExtensionAddonHandler.PUT operation ... Please wait for it +# to finish before starting a new operation. +# The race is timing-dependent and rarely manifests with 1-2 concurrent +# cluster creates, but is deterministic at N>=5 (regional AKS RP queues the +# extension installs and the slowest cluster's PUT lags `az aks create` return +# by several minutes — observed in the clustermesh-scale n5 tier). +# +# Polling logic: require 3 consecutive Succeeded readings 20s apart, with a +# 60s initial buffer so any queued extension install has time to transition +# the cluster into Updating. The consecutive requirement defends against the +# brief Succeeded window between create-finish and extension-start. Total +# budget ~20m. +resource "terraform_data" "aks_wait_succeeded" { + count = var.aks_cli_config.dry_run ? 0 : 1 + + depends_on = [terraform_data.aks_cli] + + input = { + resource_group_name = var.resource_group_name + aks_name = var.aks_cli_config.aks_name + } + + provisioner "local-exec" { + # local-exec defaults to /bin/sh which on Ubuntu agents is dash; dash + # rejects `set -o pipefail` (bash-only). Explicitly select bash so the + # script's safety options work as written. + interpreter = ["bash", "-c"] + command = <<-EOT + set -eo pipefail + rg="${self.input.resource_group_name}" + name="${self.input.aks_name}" + echo "Waiting for AKS $name to reach a stable Succeeded state..." + sleep 60 + required=3 + got=0 + for i in $(seq 1 60); do + state=$(az aks show -g "$rg" -n "$name" --query provisioningState -o tsv 2>/dev/null || echo "Unknown") + if [ "$state" = "Succeeded" ]; then + got=$((got + 1)) + if [ "$got" -ge "$required" ]; then + echo "AKS $name stable in Succeeded ($got consecutive checks). Continuing." + exit 0 + fi + else + if [ "$got" -gt 0 ]; then + echo "AKS $name re-entered '$state' after Succeeded streak; resetting counter" + fi + got=0 + fi + echo "AKS $name provisioningState=$state (Succeeded streak=$got/$required)" + sleep 20 + done + echo "Timeout: AKS $name did not reach sustained Succeeded after ~20m" + exit 1 + EOT + } +} + resource "terraform_data" "aks_nodepool_cli" { depends_on = [ - terraform_data.aks_cli + terraform_data.aks_cli, + terraform_data.aks_wait_succeeded, ] for_each = local.extra_pool_map + # Wrap the underlying `az aks nodepool add` (built in locals.extra_pool_commands) + # in a bash retry loop that handles the OperationNotAllowed / AnotherOperationInProgress + # AKS RP race window. 
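
A Python port of the aks_wait_succeeded polling loop above, for readers who want the consecutive-Succeeded logic at a glance (illustrative only; the resource runs the bash heredoc, not this):

```python
import subprocess
import time

# Mirrors the bash poll: 60s initial buffer, then up to 60 reads 20s apart,
# requiring 3 consecutive Succeeded readings before declaring stability.
def wait_for_stable_succeeded(rg, name, required=3, polls=60, interval=20):
    time.sleep(60)  # let a queued extension PUT flip the cluster into
                    # Updating before we start counting.
    streak = 0
    for _ in range(polls):
        try:
            state = subprocess.run(
                ["az", "aks", "show", "-g", rg, "-n", name,
                 "--query", "provisioningState", "-o", "tsv"],
                capture_output=True, text=True, check=True,
            ).stdout.strip()
        except subprocess.CalledProcessError:
            state = "Unknown"
        streak = streak + 1 if state == "Succeeded" else 0
        if streak >= required:
            return True   # stable: 3 consecutive Succeeded, 20s apart
        time.sleep(interval)
    return False  # ~20m budget exhausted
```

The retry-loop rationale continues below.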
Even with terraform_data.aks_wait_succeeded gating + # this on a stable cluster Succeeded state, the AKS RP can lazily start + # post-create extension PUTs (e.g. --enable-acns) AFTER the wait exits — + # observed at N>=5 cluster create concurrency where the regional RP queues + # addon installs minutes behind the parent cluster create. The retry catches + # that race; keeping the wait avoids noisy first-attempt failures in the + # common (non-lazy) case. 30 retries * 30s = 15min budget. provisioner "local-exec" { - command = join(" ", [ - "az", - "aks", - "nodepool", - "add", - "-g", var.resource_group_name, - "--cluster-name", var.aks_cli_config.aks_name, - "--nodepool-name", each.value.name, - "--node-count", each.value.node_count, - "--node-vm-size", each.value.vm_size, - "--vm-set-type", each.value.vm_set_type, - "--node-osdisk-type", each.value.os_disk_type, - local.aks_custom_headers_flags, - length(each.value.optional_parameters) == 0 ? - "" : - join(" ", [ - for param in each.value.optional_parameters : - format("--%s %s", param.name, param.value) - ]), - ]) + interpreter = ["bash", "-c"] + command = <<-EOT + set -eo pipefail + cmd=${jsonencode(local.extra_pool_commands[each.key])} + pool="${each.value.name}" + cluster="${var.aks_cli_config.aks_name}" + for i in $(seq 1 30); do + out=$(eval "$cmd" 2>&1) && { echo "$out"; exit 0; } + if echo "$out" | grep -qE "OperationNotAllowed|AnotherOperationInProgress"; then + echo "[retry $i/30] $cluster nodepool $pool create blocked by in-progress AKS RP operation; sleeping 30s" + sleep 30 + continue + fi + # Some other failure (quota, invalid args, etc.) — fail fast. + echo "$out" >&2 + exit 1 + done + echo "Timeout: $cluster nodepool $pool create still blocked after 30 retries (~15m)" >&2 + exit 1 + EOT } } diff --git a/modules/terraform/azure/aks-cli/variables.tf b/modules/terraform/azure/aks-cli/variables.tf index 3fb1c427f1..2a0384c03b 100644 --- a/modules/terraform/azure/aks-cli/variables.tf +++ b/modules/terraform/azure/aks-cli/variables.tf @@ -73,10 +73,20 @@ variable "bootstrap_container_registry_resource_id" { variable "aks_cli_config" { type = object({ - role = string - aks_name = string - sku_tier = string - subnet_name = optional(string, null) + role = string + aks_name = string + sku_tier = string + subnet_name = optional(string, null) + # Pod subnet for Azure CNI dynamic IP allocation (--pod-subnet-id). + # When set, AKS pulls pod IPs from this subnet instead of co-tenanting + # them on the node subnet (legacy CNI). Required at scale since legacy + # mode pre-allocates `1 + max-pods` IPs per node on the node subnet — + # at 20 nodes × max-pods=110 that's 2,220 IPs, vastly exceeding a typical + # /24 node subnet. The aks-cli main.tf reads this via local.pod_subnet_id + # and emits --pod-subnet-id when non-null. Originally referenced in + # main.tf without being declared here — silently fell back to legacy + # CNI for ALL callers regardless of tfvars. Added 2026-05-09. 
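
The IP math that comment cites is easy to sanity-check; a back-of-envelope sketch (the 5-address reservation per subnet is standard Azure behavior, the rest follows from the comment):

```python
import ipaddress

# Illustrative arithmetic, not part of the module: legacy Azure CNI reserves
# 1 node IP + max_pods pod IPs per node, all carved from the node subnet.
def legacy_cni_ip_demand(nodes, max_pods):
    return nodes * (1 + max_pods)

node_subnet = ipaddress.ip_network("10.1.0.0/24")  # per-cluster node subnet
usable = node_subnet.num_addresses - 5             # Azure reserves 5 per subnet
demand = legacy_cni_ip_demand(nodes=20, max_pods=110)
assert demand == 2_220 and demand > usable         # hence --pod-subnet-id
```

The optional declaration itself follows.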
+ pod_subnet_name = optional(string, null) managed_identity_name = optional(string, null) kubernetes_version = optional(string, null) aks_custom_headers = optional(list(string), []) diff --git a/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml index caaedc0ea0..e7dabb189f 100644 --- a/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml +++ b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml @@ -59,6 +59,163 @@ stages: restart_count: 1 api_server_calls_per_second: 20 trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). + # Each matrix entry runs the full provision → execute → destroy + # lifecycle independently (matrix entries do NOT share Fleet/RG); + # enable selectively in the AzDO UI to control per-run cost. + n2_pod_churn_scale: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + # 5 cycles × (60s up + 60s down) ≈ 10 min sustained churn — + # spec line 67 "CPU/memory growth over time" measurement window. + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n2_pod_churn_kill: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + # In-cluster killer Job loops for kill_duration_seconds, deleting + # kill_batch random workload pods every kill_interval_seconds. + # kill_job_deadline_seconds is the Job activeDeadlineSeconds — + # defense-in-depth bound; must exceed kill_duration_seconds. + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn). The stimulus + # (az aks nodepool scale / VMSS instance delete) runs OUTSIDE + # CL2 from steps/engine/clusterloader2/clustermesh-scale/execute.yml + # in a background subshell; CL2 deploys a baseline workload on + # every cluster and observes via measurements (node-churn.yaml). + # See modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh + # for the script header. mesh_size-wide concurrency override + # forced in execute.yml (needs_mesh_wide_concurrency). + n2_node_churn_scale: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + # Node-churn knobs — see scale.py configure for semantics. Defaults + # in execute.yml fill in when matrix entry omits them, but we set + # them explicitly for traceability. 
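
Before the node-churn knobs below, one cross-knob invariant from the pod-churn entries above is worth a pre-flight check: the Job deadline must exceed the kill window. A hypothetical validator, with the matrix entry modeled as a plain dict:

```python
# Hypothetical pre-flight check for the churn/kill knob invariants called
# out in the matrix comments above.
def validate_kill_knobs(entry):
    errors = []
    if entry["kill_job_deadline_seconds"] <= entry["kill_duration_seconds"]:
        errors.append("kill_job_deadline_seconds must exceed "
                      "kill_duration_seconds (activeDeadlineSeconds is the "
                      "defense-in-depth bound)")
    if entry["kill_interval_seconds"] > entry["kill_duration_seconds"]:
        errors.append("kill_interval_seconds longer than the kill window "
                      "means at most one kill batch fires")
    return errors

assert validate_kill_knobs({"kill_duration_seconds": 600,
                            "kill_interval_seconds": 10,
                            "kill_batch": 5,
                            "kill_job_deadline_seconds": 660}) == []
```

The node-churn knobs follow.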
+            node_churn_target_context: clustermesh-1
+            node_churn_cycles: 3
+            node_churn_delta: 5
+            node_churn_settle_seconds: 60
+            node_churn_scale_duration_seconds: 1800
+            node_churn_ready_timeout_seconds: 300
+            trigger_reason: ${{ variables['Build.Reason'] }}
+          n2_node_churn_replace:
+            cluster_count: 2
+            mesh_size: 2
+            cl2_config_file: node-churn-replace.yaml
+            test_type: node-churn-replace
+            namespaces: 5
+            deployments_per_namespace: 4
+            replicas_per_deployment: 10
+            hold_duration: 2m
+            warmup_duration: 30s
+            restart_count: 0
+            api_server_calls_per_second: 20
+            node_churn_target_context: clustermesh-1
+            node_churn_settle_seconds: 60
+            node_churn_replace_duration_seconds: 1500
+            # node_replace_batch_size: 10 default; bounded above by original
+            # pool size (20) so 10 = 50% replacement is the sweet spot for
+            # mesh propagation pressure without saturating Cilium endpoint
+            # reconcile under our DSv3 budget.
+            node_replace_batch_size: 10
+            node_churn_ready_timeout_seconds: 300
+            trigger_reason: ${{ variables['Build.Reason'] }}
+          n2_node_churn_combined:
+            cluster_count: 2
+            mesh_size: 2
+            cl2_config_file: node-churn-combined.yaml
+            test_type: node-churn-combined
+            namespaces: 5
+            deployments_per_namespace: 4
+            replicas_per_deployment: 10
+            hold_duration: 2m
+            warmup_duration: 30s
+            restart_count: 0
+            api_server_calls_per_second: 20
+            node_churn_target_context: clustermesh-1
+            node_churn_cycles: 3
+            node_churn_delta: 5
+            node_churn_settle_seconds: 60
+            node_churn_combined_duration_seconds: 3300
+            node_replace_batch_size: 10
+            node_churn_ready_timeout_seconds: 300
+            trigger_reason: ${{ variables['Build.Reason'] }}
+          # Phase 4b — Scenario #6 (Upper Bound / Saturation Testing).
+          # In-run rung loop sweeps QPS across the configured list; each
+          # rung restart-bursts the workload at that QPS for
+          # saturation_rung_duration_seconds. scale.py collect's
+          # classifier tags each rung with the dominant signal
+          # (clean | latency_spike | queue_unbounded | cpu_exhaust |
+          # mesh_failure_burst | etcd_tail) — see SATURATION_THRESHOLDS
+          # in scale.py + plan.md Scenario #6 section.
+          #
+          # Mesh-wide concurrency forced in execute.yml
+          # (needs_mesh_wide_concurrency) so every cluster's CL2 runs
+          # simultaneously — per-cluster saturation point is meaningless
+          # if peers aren't also loaded.
+          #
+          # NOT share-infra-eligible in v1: a tripped rung can leave
+          # queue/memory residue that would contaminate following
+          # scenarios. Standalone matrix entry only until baseline data
+          # justifies share-infra positioning.
+          n2_upper_bound:
+            cluster_count: 2
+            mesh_size: 2
+            cl2_config_file: upper-bound.yaml
+            test_type: upper-bound
+            namespaces: 5
+            deployments_per_namespace: 4
+            replicas_per_deployment: 10
+            hold_duration: 2m
+            warmup_duration: 30s
+            restart_count: 0
+            # Baseline QPS used by the workload-create phase (gentle,
+            # fixed). Per-rung QPS comes from saturation_qps_list.
+            api_server_calls_per_second: 20
+            # 5-rung sweep (one rung per saturation_qps_list entry). The
+            # same sweep runs at every tier so per-tier saturation points
+            # are directly comparable; recalibrate classifier thresholds
+            # once the first n=2 + n=20 runs are green.
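
How these two list knobs are consumed is pinned by the collect tests earlier in this diff: a malformed QPS list logs a warning and emits zero saturation rows, and a short restarts list is padded with 1. A sketch with assumed helper names:

```python
import logging

logger = logging.getLogger(__name__)

# Sketch of the knob parsing the saturation tests pin down; the real logic
# lives in scale.py's collect path.
def parse_saturation_lists(qps_csv, restarts_csv):
    try:
        qps = [int(x) for x in qps_csv.split(",")]
    except ValueError:
        # Malformed list: warn and emit zero SaturationRung/Summary rows
        # rather than crashing collect.
        logger.warning("malformed saturation_qps_list %r; skipping classifier",
                       qps_csv)
        return None
    try:
        restarts = [int(x) for x in restarts_csv.split(",")] if restarts_csv else []
    except ValueError:
        restarts = []
    # Pad with 1 so a short restarts list can't crash the rung loop
    # (truncate if it is longer than the QPS list).
    restarts += [1] * (len(qps) - len(restarts))
    return list(zip(qps, restarts[: len(qps)]))
```

The configured values follow.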
+ saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} max_parallel: 1 timeout_in_minutes: 120 credential_type: service_connection @@ -66,4 +223,508 @@ stages: # Iteration-only: skip uploading results to the telescope blob while # we're still stabilizing the clustermesh-scale pipeline. Flip to # false (or remove) once results are meaningful. - skip_publish: true + skip_publish: false + + # Phase 3 — 5-cluster tier. Separate stage (not a matrix entry) because + # `terraform_input_file_mapping` is set at the job level, so different + # cluster counts require different stages bound to different tfvars files. + - stage: azure_eastus2euap_n5 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars" + matrix: + n5_event_throughput: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). + n5_pod_churn_scale: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n5_pod_churn_kill: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn). See n2 entry + # for the full design rationale; only mesh_size differs at this tier. 
+ n5_node_churn_scale: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_scale_duration_seconds: 1800 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n5_node_churn_replace: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: node-churn-replace.yaml + test_type: node-churn-replace + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_settle_seconds: 60 + node_churn_replace_duration_seconds: 1500 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n5_node_churn_combined: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: node-churn-combined.yaml + test_type: node-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_combined_duration_seconds: 3300 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation). See n2 + # entry for the full design rationale; only mesh_size differs + # at this tier. Same QPS sweep at every tier so the per-tier + # saturation point is directly comparable across clusters axis. + n5_upper_bound: + cluster_count: 5 + mesh_size: 5 + cl2_config_file: upper-bound.yaml + test_type: upper-bound + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + # 5-cluster provision adds ~10-15 min vs n2 (more terraform + fleet + # member creates + RBAC propagation); CL2 fan-out itself stays + # bounded at concurrency 4 so per-cluster wall-clock is unchanged. + timeout_in_minutes: 180 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false + + # Phase 3 — 10-cluster tier. Per-cluster sizing identical to n2/n5; + # only mesh size scales. Quota footprint per run: ~120 vCPU + # (10x default-pool D4s_v5 + 10x prompool D8s_v3). 90 VNet peerings. + - stage: azure_eastus2euap_n10 + dependsOn: [] + # See dev pipeline (pipelines/system/new-pipeline-test.yml) for the + # full rationale on TF_CLI_ARGS_apply=-parallelism=4: at default + # parallelism=10 the regional AKS RP throttles severely on 10 + # simultaneous `az aks create` calls. 
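
A rough wall-clock model behind that parallelism choice, using the ~10 min per create wave observed in the dev-pipeline n10 notes (illustrative arithmetic only):

```python
import math

# Rough apply-time model: clusters are created in waves of `parallelism`.
def apply_waves(clusters, parallelism, minutes_per_wave=10):
    waves = math.ceil(clusters / parallelism)
    return waves, waves * minutes_per_wave

print(apply_waves(10, 4))  # (3, 30): ~30 min apply instead of 4h+ throttled
print(apply_waves(20, 8))  # (3, 30): the n20 split-the-difference bet
```

The variables block it motivates is next.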
+ variables: + TF_CLI_ARGS_apply: "-parallelism=4" + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars" + matrix: + n10_event_throughput: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). + n10_pod_churn_scale: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n10_pod_churn_kill: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn). 
+ n10_node_churn_scale: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_scale_duration_seconds: 1800 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n10_node_churn_replace: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: node-churn-replace.yaml + test_type: node-churn-replace + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_settle_seconds: 60 + node_churn_replace_duration_seconds: 1500 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n10_node_churn_combined: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: node-churn-combined.yaml + test_type: node-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_combined_duration_seconds: 3300 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation) at n=10. + n10_upper_bound: + cluster_count: 10 + mesh_size: 10 + cl2_config_file: upper-bound.yaml + test_type: upper-bound + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + # 10-cluster provision adds ~10-15 min vs n5 (more terraform + + # fleet member creates + ARM throughput); CL2 fan-out itself + # stays bounded at concurrency 4 (10/4 batches sequentially). + timeout_in_minutes: 240 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false + + # Phase 3 — 20-cluster tier (final scale-test point per spec line 25). + # Per-cluster sizing identical to lower tiers; only mesh size scales. + # Quota footprint: ~320 vCPU (20x D4s_v5 + 20x D8s_v3). 380 VNet peerings. + # See dev pipeline n20 stage for full rationale on TF_CLI_ARGS_apply. 
+ - stage: azure_eastus2euap_n20 + dependsOn: [] + variables: + TF_CLI_ARGS_apply: "-parallelism=8" + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars" + matrix: + n20_event_throughput: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4a — Scenario #2 (Pod Churn Stress). Each entry is a + # separate full lifecycle (~6h at n20). Enable selectively. + n20_pod_churn_scale: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: pod-churn-scale.yaml + test_type: pod-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + trigger_reason: ${{ variables['Build.Reason'] }} + n20_pod_churn_kill: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: pod-churn-kill.yaml + test_type: pod-churn-kill + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Combined scale-cycle + kill in one CL2 invocation per cluster. + # Maximizes signal per (expensive) n20 provision/destroy lifecycle. + # Kill phase uses Method: Exec → kubectl from inside the CL2 + # container (no in-cluster Job, no AcrPull dependency). If kubectl + # is unavailable in the CL2 image, the kill measurement is marked + # failed but scale-phase data still lands cleanly. + n20_pod_churn_combined: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: pod-churn-combined.yaml + test_type: pod-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + churn_cycles: 5 + churn_up_duration: 60s + churn_down_duration: 60s + kill_duration: 10m + kill_duration_seconds: 600 + kill_interval_seconds: 10 + kill_batch: 5 + kill_job_deadline_seconds: 660 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #3 (Node Churn / IP Churn) at n=20. + # Each entry is a separate provision/destroy lifecycle (~6.5h + # at n=20 including the ~30-55min node-churn window itself). + # Enable selectively in AzDO UI. 
+ n20_node_churn_scale: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: node-churn-scale.yaml + test_type: node-churn-scale + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_scale_duration_seconds: 1800 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n20_node_churn_replace: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: node-churn-replace.yaml + test_type: node-churn-replace + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_settle_seconds: 60 + node_churn_replace_duration_seconds: 1500 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + n20_node_churn_combined: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: node-churn-combined.yaml + test_type: node-churn-combined + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + node_churn_target_context: clustermesh-1 + node_churn_cycles: 3 + node_churn_delta: 5 + node_churn_settle_seconds: 60 + node_churn_combined_duration_seconds: 3300 + node_replace_batch_size: 10 + node_churn_ready_timeout_seconds: 300 + trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation) at n=20. + # Highest mesh-pressure tier. Default thresholds calibrated on + # lower tiers; expect more rungs to trip at n=20 (more peers + # to propagate to per event). First n=20 run is the + # ground-truth calibration data point. + n20_upper_bound: + cluster_count: 20 + mesh_size: 20 + cl2_config_file: upper-bound.yaml + test_type: upper-bound + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 0 + api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + timeout_in_minutes: 480 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 38ea068658..8f88419935 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -16,6 +16,10 @@ variables: OWNER: aks stages: + # 2026-05-13: Phase 4b smoke at n=2 to validate Option B++ fix + # (execute always exit 0 + SucceededWithIssues marker) + soft-fail + # killer + 240s recovery timeout. Re-disable n=2 + enable n=20 once + # this lands clean. - stage: azure_eastus2euap dependsOn: [] jobs: @@ -42,25 +46,463 @@ stages: # entry. We don't run it in dev — n2_event_throughput already exercises # the full plumbing and per-run cost (full Fleet/AKS lifecycle ~15-20 min) # makes a second axis expensive during iteration. - n2_event_throughput: + # SMOKE-ONLY 2026-05-11: Phase 4a n=2 smoke runs ONLY the combined + # entry. 
The other 3 entries (event_throughput, pod_churn_scale, + # pod_churn_kill) are commented out so a triggered run doesn't + # spend 4× the lifecycle cost. Uncomment after n=2 smoke is green + # to restore full coverage (each entry is one provision/destroy). + # n2_event_throughput: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: event-throughput.yaml + # test_type: event-throughput + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 1 + # api_server_calls_per_second: 20 + # trigger_reason: ${{ variables['Build.Reason'] }} + # n2_pod_churn_scale: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: pod-churn-scale.yaml + # test_type: pod-churn-scale + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 0 + # api_server_calls_per_second: 20 + # churn_cycles: 5 + # churn_up_duration: 60s + # churn_down_duration: 60s + # trigger_reason: ${{ variables['Build.Reason'] }} + # n2_pod_churn_kill: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: pod-churn-kill.yaml + # test_type: pod-churn-kill + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 0 + # api_server_calls_per_second: 20 + # kill_duration: 10m + # kill_duration_seconds: 600 + # kill_interval_seconds: 10 + # kill_batch: 5 + # kill_job_deadline_seconds: 660 + # trigger_reason: ${{ variables['Build.Reason'] }} + # Combined scale-cycle + kill in one CL2 invocation per cluster. + # Kill phase uses Method: Exec → kubectl from inside the CL2 + # container (no in-cluster Job, no AcrPull dependency). + # SMOKE-ONLY 2026-05-12: commented out for n=2 share-infra smoke; + # uncomment for solo-scenario iteration. + # n2_pod_churn_combined: + # cluster_count: 2 + # mesh_size: 2 + # cl2_config_file: pod-churn-combined.yaml + # test_type: pod-churn-combined + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 0 + # api_server_calls_per_second: 20 + # churn_cycles: 5 + # churn_up_duration: 60s + # churn_down_duration: 60s + # kill_duration: 10m + # kill_duration_seconds: 600 + # kill_interval_seconds: 10 + # kill_batch: 5 + # kill_job_deadline_seconds: 660 + # trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b share-infra: ONE matrix entry runs BOTH scenarios + # sequentially against the same provisioned clusters. The + # share_infra_scenarios env var (auto-exported as + # SHARE_INFRA_SCENARIOS by AzDO) triggers the multi-scenario + # path in execute.yml + collect.yml. Per-row test_type + # attribution preserved in the JSONL. Single provision/destroy + # = ~92% time reduction vs running two matrix entries. + # + # ITER-ONLY 2026-05-14: commented out for scenario #6 smoke. + # n2_shared was previously narrowed to "node-churn-combined" + # for #3 iteration; #3 is now green at K=10 (build 67185) so + # there's no need to re-run it alongside the #6 first smoke. 
+ # Restore + widen this entry to the 5-scenario share-infra + # list AFTER #6 lands (planned post-#6 work per SETTLED DESIGN): + # share_infra_scenarios: "event-throughput,pod-churn-combined,apiserver-failure,ha-config,isolation" + # n2_shared: + # cluster_count: 2 + # mesh_size: 2 + # # Phase 4b — 5-scenario share-infra validation: + # # event-throughput (#1), pod-churn-combined (#2), + # # apiserver-failure (#4), ha-config (#7), isolation (#5), + # # node-churn-combined (#3). + # # ha-config is BEFORE isolation so its scale-down restores + # # the apiserver Deployment to 1 replica before isolation's + # # heavy pod-churn loop runs on the target cluster. + # # node-churn-combined is LAST per rubber-duck design review + # # #11 — node ops can leave the target cluster in a half- + # # scaled state if the finalizer can't restore. Putting + # # node-churn last means contamination affects no further + # # scenarios in the share-infra lifecycle. + # share_infra_scenarios: "node-churn-combined" + # cl2_config_file: "" # unused when share_infra_scenarios is set + # test_type: shared # row-level test_type comes from each scenario at collect time + # namespaces: 5 + # deployments_per_namespace: 4 + # replicas_per_deployment: 10 + # hold_duration: 2m + # warmup_duration: 30s + # restart_count: 1 + # api_server_calls_per_second: 20 + # churn_cycles: 5 + # churn_up_duration: 60s + # churn_down_duration: 60s + # kill_duration: 10m + # kill_duration_seconds: 600 + # kill_interval_seconds: 10 + # kill_batch: 5 + # kill_job_deadline_seconds: 660 + # # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + # apiserver_kill_target_context: clustermesh-1 + # apiserver_kill_recovery_timeout_seconds: 240 + # apiserver_kill_observation_seconds: 60 + # # Phase 4b — Scenario #7 (HA Configuration Validation) knob. + # ha_config_replicas: 3 + # # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # node_churn_target_context: clustermesh-1 + # node_churn_cycles: 2 + # node_churn_delta: 3 + # node_churn_settle_seconds: 60 + # node_churn_scale_duration_seconds: 1500 + # node_churn_replace_duration_seconds: 1500 + # node_churn_combined_duration_seconds: 2700 + # node_replace_batch_size: 10 + # node_churn_ready_timeout_seconds: 300 + # trigger_reason: ${{ variables['Build.Reason'] }} + # Phase 4b — Scenario #6 (Upper Bound / Saturation) standalone + # smoke entry. Per SETTLED DESIGN in plan.md (line ~126), we do + # NOT widen n2_shared to include #6 — the share-infra-list + # rollup happens AFTER #6 lands. CL2 image, tfvars, and timeout + # budget are identical to the prod pipeline so signals are + # directly comparable. + n2_upper_bound: cluster_count: 2 mesh_size: 2 - cl2_config_file: event-throughput.yaml - test_type: event-throughput + cl2_config_file: upper-bound.yaml + test_type: upper-bound namespaces: 5 deployments_per_namespace: 4 replicas_per_deployment: 10 hold_duration: 2m warmup_duration: 30s - restart_count: 1 + restart_count: 0 api_server_calls_per_second: 20 + saturation_qps_list: "100,500,1500,4000,10000" + saturation_restarts_list: "2,4,8,15,25" + saturation_rung_duration_seconds: 240 + saturation_settle_seconds: 90 trigger_reason: ${{ variables['Build.Reason'] }} max_parallel: 1 - timeout_in_minutes: 120 + # n=2 share-infra (5 scenarios): provision (~15min) + validate (~5min) + # + 5 × CL2 (~25min each, with 60s settle between) + destroy (~15min) + # ≈ ~170min. Buffer to 360 for LB-tail / apply retries. 
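
That estimate decomposes as below (quick arithmetic check; the 60 s settles between the five CL2 phases contribute about 4 min):

```python
# Budget check for the ~170 min share-infra estimate above (all minutes).
provision, validate, destroy = 15, 5, 15
cl2_phases, per_phase, settle = 5, 25, 1
total = (provision + validate + destroy
         + cl2_phases * per_phase + (cl2_phases - 1) * settle)
assert total == 164  # ≈ ~170 min; timeout_in_minutes: 360 leaves ~2x headroom
```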
+            # The n2_upper_bound entry runs the same provision/destroy
+            # lifecycle but its CL2 phase is ~16min (4 rungs × 240s); the
+            # same 360min budget covers both with headroom.
+            timeout_in_minutes: 360
             credential_type: service_connection
             ssh_key_enabled: false
-            # Iteration-only: skip uploading results to the telescope blob while
-            # we're still stabilizing the clustermesh-scale pipeline. Mirrors the
-            # same flag in pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml.
-            # Flip to false (or remove) once results are meaningful.
-            skip_publish: true
+            # Publish results to the telescope blob. This was iteration-only
+            # (skip_publish: true) while the clustermesh-scale pipeline was
+            # being stabilized; mirrors the same flag in
+            # pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml.
+            skip_publish: false
+
+  # Phase 3 — 5-cluster tier. Separate stage (not a matrix entry) because
+  # `terraform_input_file_mapping` is set at the job level, so different
+  # cluster counts require different stages bound to different tfvars files.
+  # Runs in parallel with the n2 stage when pool capacity allows; comment
+  # out either stage during iteration if the dual cost matters.
+  - stage: azure_eastus2euap_n5
+    dependsOn: []
+    # ITER-DISABLED 2026-05-08 (inline comments on `condition:` are unsafe —
+    # AzDO doesn't always strip them, leaving the truthy string
+    # "false # ..." as the expression. Keep the marker on its own line.)
+    condition: false
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2euap
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          install: false
+          operation_timeout: 15m
+          topology: clustermesh-scale
+          terraform_input_file_mapping:
+            - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars"
+          matrix:
+            n5_event_throughput:
+              cluster_count: 5
+              mesh_size: 5
+              cl2_config_file: event-throughput.yaml
+              test_type: event-throughput
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 1
+              api_server_calls_per_second: 20
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            # Phase 4a — Scenario #2 (Pod Churn Stress).
+            n5_pod_churn_scale:
+              cluster_count: 5
+              mesh_size: 5
+              cl2_config_file: pod-churn-scale.yaml
+              test_type: pod-churn-scale
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              churn_cycles: 5
+              churn_up_duration: 60s
+              churn_down_duration: 60s
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            n5_pod_churn_kill:
+              cluster_count: 5
+              mesh_size: 5
+              cl2_config_file: pod-churn-kill.yaml
+              test_type: pod-churn-kill
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              kill_duration: 10m
+              kill_duration_seconds: 600
+              kill_interval_seconds: 10
+              kill_batch: 5
+              kill_job_deadline_seconds: 660
+              trigger_reason: ${{ variables['Build.Reason'] }}
+          max_parallel: 1
+          timeout_in_minutes: 180
+          credential_type: service_connection
+          ssh_key_enabled: false
+          skip_publish: false
+
+  # Phase 3 — 10-cluster tier. Per-cluster node counts match n2/n5
+  # (20-node default pool + 1-node prompool); only mesh size scales.
+  # Quota footprint per run: ~880 vCPU (10 clusters x 20 D4s_v3
+  # default-pool nodes + 10 D8s_v3 prompool nodes; see the
+  # azure-10.tfvars header for the math). 90 VNet peerings.
+  - stage: azure_eastus2euap_n10
+    dependsOn: []
+    # ITER-DISABLED 2026-05-08
+    condition: false
+    # Lower terraform apply parallelism from the default 10 to 4. At
+    # default, all 10 `az aks create` calls fire simultaneously and the
+    # regional AKS RP throttles severely — the first N=10 run had every
+    # cluster stuck in `aks_cli: Still creating` for 190+ min (vs. the
+    # normal 5-10 min). Parallelism=4 lets the RP process creates in
+    # batches: roughly a 4-create wave (~10 min), a second 4-create wave,
+    # then a 2-create wave → ~30 min total apply instead of 4hr+. CL2
+    # fan-out parallelism (max_concurrent=4) is a SEPARATE knob and stays
+    # unchanged. Destroy is unaffected (we set TF_CLI_ARGS_apply, which
+    # scopes to `terraform apply` only, rather than the global TF_CLI_ARGS).
+    variables:
+      TF_CLI_ARGS_apply: "-parallelism=4"
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2euap
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          install: false
+          operation_timeout: 15m
+          topology: clustermesh-scale
+          terraform_input_file_mapping:
+            - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars"
+          matrix:
+            n10_event_throughput:
+              cluster_count: 10
+              mesh_size: 10
+              cl2_config_file: event-throughput.yaml
+              test_type: event-throughput
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 1
+              api_server_calls_per_second: 20
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            # Phase 4a — Scenario #2 (Pod Churn Stress).
+            n10_pod_churn_scale:
+              cluster_count: 10
+              mesh_size: 10
+              cl2_config_file: pod-churn-scale.yaml
+              test_type: pod-churn-scale
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              churn_cycles: 5
+              churn_up_duration: 60s
+              churn_down_duration: 60s
+              trigger_reason: ${{ variables['Build.Reason'] }}
+            n10_pod_churn_kill:
+              cluster_count: 10
+              mesh_size: 10
+              cl2_config_file: pod-churn-kill.yaml
+              test_type: pod-churn-kill
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 0
+              api_server_calls_per_second: 20
+              kill_duration: 10m
+              kill_duration_seconds: 600
+              kill_interval_seconds: 10
+              kill_batch: 5
+              kill_job_deadline_seconds: 660
+              trigger_reason: ${{ variables['Build.Reason'] }}
+          max_parallel: 1
+          # 10-cluster provision adds ~10-15 min vs n5 (more terraform +
+          # fleet member creates + ARM throughput); CL2 fan-out itself
+          # stays bounded at concurrency 4 (ceil(10/4) = 3 batches run
+          # sequentially).
+          timeout_in_minutes: 240
+          credential_type: service_connection
+          ssh_key_enabled: false
+          skip_publish: false
+
+  # Phase 3 — 20-cluster tier (final scale-test point per spec line 25).
+  # Per-cluster node counts match the lower tiers; only mesh size scales.
+  # Quota footprint per run: ~1760 vCPU (20 clusters x 20 D4s_v3
+  # default-pool nodes + 20 D8s_v3 prompool nodes; see the azure-20.tfvars
+  # header for the math; eastus2euap had 78k vCPU of headroom when checked
+  # 2026-05-08). 380 VNet peering links (N*(N-1) at separate-VNet mode).
+  # 20 Fleet members.
+  #
+  # TF_CLI_ARGS_apply tuning history at this tier:
+  #   - default parallelism=10 (aks-cli implicit): cluster-create RP throttle,
+  #     all 20 stuck "Still creating" for hours.
+  #   - parallelism=4 (first n20 attempt 2026-05-09): apply 219 min (3.65 hr).
+  #     The real bottleneck shifts from the AKS RP to terraform graph
+  #     traversal of 520+ resources (380 peerings + 20 fleet members +
+  #     per-cluster waits).
+  #   - parallelism=8 (this run): split the difference. Cluster-creates still
+  #     batch (20/8 = ~3 batches), but graph traversal of peerings/members is
+  #     2x faster than at parallelism=4. Risk: the AKS RP could throttle
+  #     harder than at parallelism=4. Fallback if this fails: drop back to
+  #     parallelism=4.
+  - stage: azure_eastus2euap_n20
+    dependsOn: []
+    # ITER-DISABLED 2026-05-13: Phase 4b smoke at n=2 first to validate
+    # the Option B++ exit-0+SucceededWithIssues fix. Re-enable when
+    # ready to promote.
+    condition: false
+    variables:
+      TF_CLI_ARGS_apply: "-parallelism=8"
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2euap
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          install: false
+          operation_timeout: 15m
+          topology: clustermesh-scale
+          terraform_input_file_mapping:
+            - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars"
+          matrix:
+            # Phase 4b — n=20 share-infra overnight run.
+            # Runs 6 scenarios in ONE provision/destroy lifecycle:
+            #   1. event-throughput (scenario #1 baseline with CFP-39876 fix)
+            #   2. pod-churn-combined (scenario #2 scale + kill phases)
+            #   3. apiserver-failure (scenario #4 — Phase 4b's new scenario)
+            #   4. ha-config
+            #   5. isolation (scenario #5, Multi-Cluster Failure Isolation)
+            #   6. node-churn-combined (scenario #3; kept last, see the
+            #      node-churn knob comment below)
+            # Compresses what would be 6 separate ~6h lifecycles (~36h)
+            # into one ~8-9h shared run.
+            #
+            # cl2_max_concurrent=8: bumped from the default 4 so more peer
+            # clusters' Prometheus instances are running during scenario
+            # #4's kill window. At the default 4, only 3 of 19 peers would
+            # be in flight when mesh-1 is killed; at 8, ~7 peers. Marginal
+            # agent memory increase, much better peer coverage.
+            #
+            # SMOKE-ONLY: solo-scenario matrix entries below commented out
+            # so this overnight run produces exactly one results blob from
+            # the shared lifecycle. Uncomment for solo iteration.
+            # n20_event_throughput: ...
+            # n20_pod_churn_combined: ...
+            n20_shared:
+              cluster_count: 20
+              mesh_size: 20
+              share_infra_scenarios: "event-throughput,pod-churn-combined,apiserver-failure,ha-config,isolation,node-churn-combined"
+              cl2_config_file: ""  # unused in share-infra mode
+              test_type: shared  # row-level test_type comes from each scenario
+              cl2_max_concurrent: 8
+              namespaces: 5
+              deployments_per_namespace: 4
+              replicas_per_deployment: 10
+              hold_duration: 2m
+              warmup_duration: 30s
+              restart_count: 1
+              api_server_calls_per_second: 20
+              churn_cycles: 5
+              churn_up_duration: 60s
+              churn_down_duration: 60s
+              kill_duration: 10m
+              kill_duration_seconds: 600
+              kill_interval_seconds: 10
+              kill_batch: 5
+              kill_job_deadline_seconds: 660
+              apiserver_kill_target_context: clustermesh-1
+              apiserver_kill_recovery_timeout_seconds: 240
+              apiserver_kill_observation_seconds: 60
+              ha_config_replicas: 3
+              # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs at n=20.
+              # Positioned LAST in share_infra_scenarios per rubber-duck
+              # design review #11 (node ops can leave the target half-scaled
+              # if the finalizer can't restore it; putting it last contains
+              # the blast radius).
+              node_churn_target_context: clustermesh-1
+              node_churn_cycles: 3
+              node_churn_delta: 5
+              node_churn_settle_seconds: 60
+              node_churn_scale_duration_seconds: 1800
+              node_churn_replace_duration_seconds: 1500
+              node_churn_combined_duration_seconds: 3300
+              node_replace_batch_size: 10
+              node_churn_ready_timeout_seconds: 300
+              trigger_reason: ${{ variables['Build.Reason'] }}
+          max_parallel: 1
+          # n=20 share-infra (6 scenarios): provision (~4h) + validate (~30min)
+          # + 6 × CL2 (~25min each, with 60s settle between) + destroy (~1.5h)
+          # ≈ ~8.5h baseline.
Phase 4a's last n=20 hit 480 min during destroy + # so we go to 720 (12h) for safe overnight headroom. + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false + skip_publish: false diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars new file mode 100644 index 0000000000..90e6c7e542 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-10.tfvars @@ -0,0 +1,579 @@ +scenario_type = "perf-eval" +scenario_name = "clustermesh-scale" +deletion_delay = "4h" +owner = "aks" + +# ============================================================================= +# ClusterMesh Scale Test — 10 cluster tier +# +# Same shape as azure-2.tfvars (see that file for full sizing rationale on +# pod CIDR, max-pods, prompool, etc.). This file scales the cluster count +# only; per-cluster sizing is identical to the n2 tier so cluster-count is +# the only variable when comparing tier results. +# +# Generated topology: +# - 10 VNets (one per cluster) at 10..0.0/16, id=1..10 +# - 10 AKS clusters (Cilium+ACNS, Azure CNI w/ pod subnet) +# - 90 VNet peering links (N*(N-1) at separate-VNet mode) +# - 10 Fleet members (label mesh=true) + 1 clustermeshprofile +# +# Subscription footprint per run (20-node baseline per spec line 24): +# - default pool: 10 clusters x 20 nodes x D4s_v3 (4 vCPU) = 800 vCPU (DSv3 family) +# - prompool: 10 clusters x 1 node x D8s_v3 (8 vCPU) = 80 vCPU (DSv3 family) +# - total DSv3 compute: 880 vCPU +# Verify region quota before first run (DSv3 limit is typically 5000 vCPU +# in eastus2euap; check `az vm list-usage --location eastus2euap`). +# ============================================================================= + +network_config_list = [ + { + role = "mesh-1" + vnet_name = "clustermesh-1-vnet" + vnet_address_space = "10.1.0.0/16" + subnet = [ + { + name = "clustermesh-1-node" + address_prefix = "10.1.0.0/24" + }, + { + name = "clustermesh-1-pod" + address_prefix = "10.1.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-2" + vnet_name = "clustermesh-2-vnet" + vnet_address_space = "10.2.0.0/16" + subnet = [ + { + name = "clustermesh-2-node" + address_prefix = "10.2.0.0/24" + }, + { + name = "clustermesh-2-pod" + address_prefix = "10.2.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-3" + vnet_name = "clustermesh-3-vnet" + vnet_address_space = "10.3.0.0/16" + subnet = [ + { + name = "clustermesh-3-node" + address_prefix = "10.3.0.0/24" + }, + { + name = "clustermesh-3-pod" + address_prefix = "10.3.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-4" + vnet_name = "clustermesh-4-vnet" + vnet_address_space = "10.4.0.0/16" + subnet = [ + { + name = "clustermesh-4-node" + address_prefix = "10.4.0.0/24" + }, + { + name = "clustermesh-4-pod" + address_prefix = "10.4.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-5" + vnet_name = "clustermesh-5-vnet" + vnet_address_space = "10.5.0.0/16" + subnet = [ + { + name = "clustermesh-5-node" + address_prefix = "10.5.0.0/24" + }, + { + name = "clustermesh-5-pod" + address_prefix = "10.5.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + 
role = "mesh-6" + vnet_name = "clustermesh-6-vnet" + vnet_address_space = "10.6.0.0/16" + subnet = [ + { + name = "clustermesh-6-node" + address_prefix = "10.6.0.0/24" + }, + { + name = "clustermesh-6-pod" + address_prefix = "10.6.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-7" + vnet_name = "clustermesh-7-vnet" + vnet_address_space = "10.7.0.0/16" + subnet = [ + { + name = "clustermesh-7-node" + address_prefix = "10.7.0.0/24" + }, + { + name = "clustermesh-7-pod" + address_prefix = "10.7.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-8" + vnet_name = "clustermesh-8-vnet" + vnet_address_space = "10.8.0.0/16" + subnet = [ + { + name = "clustermesh-8-node" + address_prefix = "10.8.0.0/24" + }, + { + name = "clustermesh-8-pod" + address_prefix = "10.8.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-9" + vnet_name = "clustermesh-9-vnet" + vnet_address_space = "10.9.0.0/16" + subnet = [ + { + name = "clustermesh-9-node" + address_prefix = "10.9.0.0/24" + }, + { + name = "clustermesh-9-pod" + address_prefix = "10.9.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-10" + vnet_name = "clustermesh-10-vnet" + vnet_address_space = "10.10.0.0/16" + subnet = [ + { + name = "clustermesh-10-node" + address_prefix = "10.10.0.0/24" + }, + { + name = "clustermesh-10-pod" + address_prefix = "10.10.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_cli_config_list = [ + { + role = "mesh-1" + aks_name = "clustermesh-1" + sku_tier = "Standard" + subnet_name = "clustermesh-1-node" + pod_subnet_name = "clustermesh-1-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-2" + aks_name = "clustermesh-2" + sku_tier = "Standard" + subnet_name = "clustermesh-2-node" + pod_subnet_name = "clustermesh-2-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-3" + aks_name = "clustermesh-3" + sku_tier = "Standard" + subnet_name = "clustermesh-3-node" + pod_subnet_name = "clustermesh-3-pod" + use_aks_preview_cli_extension = true + + optional_parameters 
= [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-4" + aks_name = "clustermesh-4" + sku_tier = "Standard" + subnet_name = "clustermesh-4-node" + pod_subnet_name = "clustermesh-4-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-5" + aks_name = "clustermesh-5" + sku_tier = "Standard" + subnet_name = "clustermesh-5-node" + pod_subnet_name = "clustermesh-5-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-6" + aks_name = "clustermesh-6" + sku_tier = "Standard" + subnet_name = "clustermesh-6-node" + pod_subnet_name = "clustermesh-6-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-7" + aks_name = "clustermesh-7" + sku_tier = "Standard" + subnet_name = "clustermesh-7-node" + pod_subnet_name = "clustermesh-7-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + 
extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-8" + aks_name = "clustermesh-8" + sku_tier = "Standard" + subnet_name = "clustermesh-8-node" + pod_subnet_name = "clustermesh-8-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-9" + aks_name = "clustermesh-9" + sku_tier = "Standard" + subnet_name = "clustermesh-9-node" + pod_subnet_name = "clustermesh-9-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-10" + aks_name = "clustermesh-10" + sku_tier = "Standard" + subnet_name = "clustermesh-10-node" + pod_subnet_name = "clustermesh-10-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + } +] + +# ============================================================================= +# Fleet + ClusterMesh +# ============================================================================= +vnet_peering_config = { + enabled = true +} + +fleet_config = { + enabled = true + fleet_name = "clustermesh-flt" + cmp_name = "clustermesh-cmp" + member_label_key = "mesh" + member_label_value = "true" + members = [ + { member_name = "mesh-1", aks_role = "mesh-1" }, + { member_name = "mesh-2", aks_role = "mesh-2" }, + { member_name = "mesh-3", aks_role = "mesh-3" }, + { member_name = "mesh-4", aks_role = "mesh-4" }, + { member_name = "mesh-5", aks_role = "mesh-5" }, + { member_name = "mesh-6", aks_role = "mesh-6" }, + { member_name = "mesh-7", aks_role = "mesh-7" }, + { member_name = "mesh-8", aks_role = "mesh-8" }, + { member_name = "mesh-9", aks_role = "mesh-9" }, + { member_name = "mesh-10", aks_role = "mesh-10" } + ] +} diff --git 
a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
index 535bdba5a7..fcc90c2bb9 100644
--- a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
+++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
@@ -91,14 +91,28 @@ aks_cli_config_list = [
     { name = "max-pods", value = "110" },
   ]

-  # Default pool sizing: D4s_v5 (4 vCPU / 16GB) is enough for the workload
-  # pods alone. Prometheus is pinned to prompool below — without that
-  # split, Prometheus's 1Gi+ memory request co-tenanting on default-pool
-  # nodes caused per-node CPU overcommit (~160% allocatable) and left
-  # workload pods stuck Pending.
+  # Default pool sizing: 20 nodes × D4s_v5 (4 vCPU / 16GB).
+  #
+  # 20 nodes per cluster is the spec baseline (scale testing.txt line 24:
+  # "20-node clusters as the baseline unit"). The workload sits on this
+  # pool; Prometheus is pinned to prompool below to avoid the per-node CPU
+  # overcommit + Pending pods we hit when Prometheus co-tenanted with the
+  # workload at smaller node counts.
+  #
+  # SKU choice — D4s_v5 (narrowed during the scenario #6 smoke iteration,
+  # 2026-05-15 subscription switch): 4 vCPU / 16GB / Premium SSD, Ice Lake
+  # v5 generation. Switched from D4ds_v4 because we moved this pipeline to
+  # subscription 37deca37-... ("Azure Network Agent - Standalone Test")
+  # to dodge RG-count quota pressure on the original 9b8218f9-...
+  # subscription. On 37deca37 the DDSv4 family has only 100 vCPU quota
+  # (need 160+ at n=2), but DSv5 has a 1000 vCPU quota with 920 free, so
+  # D4s_v5/D8s_v5 fits with headroom. Larger tiers (n5/n10/n20) still
+  # need quota planning on the new sub before promotion.
+  # Performance for our workload (mostly idle pause pods + cilium-agent
+  # + the CL2 measurement client) is not bound by CPU generation.
   default_node_pool = {
     name                 = "default"
-    node_count           = 2
+    node_count           = 20
     auto_scaling_enabled = false
     vm_size              = "Standard_D4s_v5"
   }
@@ -108,15 +122,15 @@ aks_cli_config_list = [
   # only on this label, so it doesn't compete with workload pods. Mirrors
   # the `prompool` pattern from
   # scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars.
-  # D8s_v3 (8 vCPU / 32GB) is sized for our 1Gi-request Prometheus with
-  # ample headroom — much smaller than #1053's D32s_v5 because our
-  # workload spec is also much smaller.
+  # D8s_v5 (8 vCPU / 32GB) is sized for our 1Gi-request Prometheus with
+  # ample headroom; matches the family swap of the default pool (the DSv5
+  # quota of 1000 vCPU on subscription 37deca37 fits n=2 with margin).
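+  # (Worked math for "need 160+ at n=2": 2 clusters × 20 nodes × 4 vCPU
+  #  = 160 default-pool vCPU, plus 2 × 8 = 16 prompool vCPU, for 176 vCPU
+  #  total on the DSv5 family against the 920 free.)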
  extra_node_pool = [
    {
      name                 = "prompool"
      node_count           = 1
      auto_scaling_enabled = false
-     vm_size              = "Standard_D8s_v3"
+     vm_size              = "Standard_D8s_v5"
      optional_parameters = [
        { name = "labels", value = "prometheus=true" },
      ]
@@ -141,7 +155,7 @@ aks_cli_config_list = [

   default_node_pool = {
     name                 = "default"
-    node_count           = 2
+    node_count           = 20
     auto_scaling_enabled = false
     vm_size              = "Standard_D4s_v5"
   }
@@ -150,7 +164,7 @@ aks_cli_config_list = [
      name                 = "prompool"
      node_count           = 1
      auto_scaling_enabled = false
-     vm_size              = "Standard_D8s_v3"
+     vm_size              = "Standard_D8s_v5"
      optional_parameters = [
        { name = "labels", value = "prometheus=true" },
      ]
diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars
new file mode 100644
index 0000000000..26a94dbabd
--- /dev/null
+++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-20.tfvars
@@ -0,0 +1,1109 @@
+scenario_type  = "perf-eval"
+scenario_name  = "clustermesh-scale"
+deletion_delay = "4h"
+owner          = "aks"
+
+# =============================================================================
+# ClusterMesh Scale Test — 20 cluster tier
+#
+# Same shape as azure-2.tfvars (see that file for full sizing rationale on
+# pod CIDR, max-pods, prompool, etc.). This file scales the cluster count
+# only; per-cluster node counts match the n2 tier, so cluster count is the
+# dominant variable when comparing tier results. (n2 itself has since moved
+# to DSv5 SKUs for subscription-quota reasons; this tier stays on DSv3
+# pending quota planning on the new sub; see the azure-2.tfvars comments.)
+#
+# Generated topology:
+#   - 20 VNets (one per cluster) at 10.<id>.0.0/16, id=1..20
+#   - 20 AKS clusters (Cilium+ACNS, Azure CNI w/ pod subnet)
+#   - 380 VNet peering links (N*(N-1) at separate-VNet mode)
+#   - 20 Fleet members (label mesh=true) + 1 clustermeshprofile
+#
+# Subscription footprint per run (20-node baseline per spec line 24):
+#   - default pool: 20 clusters x 20 nodes x D4s_v3 (4 vCPU) = 1600 vCPU (DSv3 family)
+#   - prompool:     20 clusters x 1 node  x D8s_v3 (8 vCPU) =  160 vCPU (DSv3 family)
+#   - total DSv3 compute: 1760 vCPU
+# Verify region quota before first run (the DSv3 limit is typically 5000
+# vCPU in eastus2euap; check `az vm list-usage --location eastus2euap`).
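+# A concrete spot-check (illustrative; the JMESPath filter and the exact
+# family name string are assumptions, so adjust against your CLI output):
+#   az vm list-usage --location eastus2euap -o table \
+#     --query "[?contains(name.value, 'standardDSv3Family')]"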
+# ============================================================================= + +network_config_list = [ + { + role = "mesh-1" + vnet_name = "clustermesh-1-vnet" + vnet_address_space = "10.1.0.0/16" + subnet = [ + { + name = "clustermesh-1-node" + address_prefix = "10.1.0.0/24" + }, + { + name = "clustermesh-1-pod" + address_prefix = "10.1.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-2" + vnet_name = "clustermesh-2-vnet" + vnet_address_space = "10.2.0.0/16" + subnet = [ + { + name = "clustermesh-2-node" + address_prefix = "10.2.0.0/24" + }, + { + name = "clustermesh-2-pod" + address_prefix = "10.2.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-3" + vnet_name = "clustermesh-3-vnet" + vnet_address_space = "10.3.0.0/16" + subnet = [ + { + name = "clustermesh-3-node" + address_prefix = "10.3.0.0/24" + }, + { + name = "clustermesh-3-pod" + address_prefix = "10.3.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-4" + vnet_name = "clustermesh-4-vnet" + vnet_address_space = "10.4.0.0/16" + subnet = [ + { + name = "clustermesh-4-node" + address_prefix = "10.4.0.0/24" + }, + { + name = "clustermesh-4-pod" + address_prefix = "10.4.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-5" + vnet_name = "clustermesh-5-vnet" + vnet_address_space = "10.5.0.0/16" + subnet = [ + { + name = "clustermesh-5-node" + address_prefix = "10.5.0.0/24" + }, + { + name = "clustermesh-5-pod" + address_prefix = "10.5.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-6" + vnet_name = "clustermesh-6-vnet" + vnet_address_space = "10.6.0.0/16" + subnet = [ + { + name = "clustermesh-6-node" + address_prefix = "10.6.0.0/24" + }, + { + name = "clustermesh-6-pod" + address_prefix = "10.6.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-7" + vnet_name = "clustermesh-7-vnet" + vnet_address_space = "10.7.0.0/16" + subnet = [ + { + name = "clustermesh-7-node" + address_prefix = "10.7.0.0/24" + }, + { + name = "clustermesh-7-pod" + address_prefix = "10.7.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-8" + vnet_name = "clustermesh-8-vnet" + vnet_address_space = "10.8.0.0/16" + subnet = [ + { + name = "clustermesh-8-node" + address_prefix = "10.8.0.0/24" + }, + { + name = "clustermesh-8-pod" + address_prefix = "10.8.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-9" + vnet_name = "clustermesh-9-vnet" + vnet_address_space = "10.9.0.0/16" + subnet = [ + { + name = "clustermesh-9-node" + address_prefix = "10.9.0.0/24" + }, + { + name = "clustermesh-9-pod" + address_prefix = "10.9.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-10" + vnet_name = "clustermesh-10-vnet" + vnet_address_space = "10.10.0.0/16" + subnet = [ + { + name = "clustermesh-10-node" + address_prefix = "10.10.0.0/24" + }, + { + name = "clustermesh-10-pod" + address_prefix = "10.10.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, 
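+  # (mesh-11..mesh-20 below repeat the same stanza. When a field has to
+  # change across all 20, a throwaway generator is less error-prone than
+  # hand-editing. A minimal bash sketch, illustrative only and not part of
+  # the pipeline; strip the leading "# " to run:
+  #
+  # for i in $(seq 1 20); do
+  # cat <<EOF
+  #   {
+  #     role                        = "mesh-${i}"
+  #     vnet_name                   = "clustermesh-${i}-vnet"
+  #     vnet_address_space          = "10.${i}.0.0/16"
+  #     subnet = [
+  #       { name = "clustermesh-${i}-node", address_prefix = "10.${i}.0.0/24" },
+  #       { name = "clustermesh-${i}-pod",  address_prefix = "10.${i}.4.0/22" }
+  #     ]
+  #     network_security_group_name = ""
+  #     nic_public_ip_associations  = []
+  #     nsr_rules                   = []
+  #   },
+  # EOF
+  # done
+  # )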
+ { + role = "mesh-11" + vnet_name = "clustermesh-11-vnet" + vnet_address_space = "10.11.0.0/16" + subnet = [ + { + name = "clustermesh-11-node" + address_prefix = "10.11.0.0/24" + }, + { + name = "clustermesh-11-pod" + address_prefix = "10.11.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-12" + vnet_name = "clustermesh-12-vnet" + vnet_address_space = "10.12.0.0/16" + subnet = [ + { + name = "clustermesh-12-node" + address_prefix = "10.12.0.0/24" + }, + { + name = "clustermesh-12-pod" + address_prefix = "10.12.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-13" + vnet_name = "clustermesh-13-vnet" + vnet_address_space = "10.13.0.0/16" + subnet = [ + { + name = "clustermesh-13-node" + address_prefix = "10.13.0.0/24" + }, + { + name = "clustermesh-13-pod" + address_prefix = "10.13.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-14" + vnet_name = "clustermesh-14-vnet" + vnet_address_space = "10.14.0.0/16" + subnet = [ + { + name = "clustermesh-14-node" + address_prefix = "10.14.0.0/24" + }, + { + name = "clustermesh-14-pod" + address_prefix = "10.14.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-15" + vnet_name = "clustermesh-15-vnet" + vnet_address_space = "10.15.0.0/16" + subnet = [ + { + name = "clustermesh-15-node" + address_prefix = "10.15.0.0/24" + }, + { + name = "clustermesh-15-pod" + address_prefix = "10.15.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-16" + vnet_name = "clustermesh-16-vnet" + vnet_address_space = "10.16.0.0/16" + subnet = [ + { + name = "clustermesh-16-node" + address_prefix = "10.16.0.0/24" + }, + { + name = "clustermesh-16-pod" + address_prefix = "10.16.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-17" + vnet_name = "clustermesh-17-vnet" + vnet_address_space = "10.17.0.0/16" + subnet = [ + { + name = "clustermesh-17-node" + address_prefix = "10.17.0.0/24" + }, + { + name = "clustermesh-17-pod" + address_prefix = "10.17.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-18" + vnet_name = "clustermesh-18-vnet" + vnet_address_space = "10.18.0.0/16" + subnet = [ + { + name = "clustermesh-18-node" + address_prefix = "10.18.0.0/24" + }, + { + name = "clustermesh-18-pod" + address_prefix = "10.18.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-19" + vnet_name = "clustermesh-19-vnet" + vnet_address_space = "10.19.0.0/16" + subnet = [ + { + name = "clustermesh-19-node" + address_prefix = "10.19.0.0/24" + }, + { + name = "clustermesh-19-pod" + address_prefix = "10.19.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-20" + vnet_name = "clustermesh-20-vnet" + vnet_address_space = "10.20.0.0/16" + subnet = [ + { + name = "clustermesh-20-node" + address_prefix = "10.20.0.0/24" + }, + { + name = "clustermesh-20-pod" + address_prefix = "10.20.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_cli_config_list = [ + { + role = 
"mesh-1" + aks_name = "clustermesh-1" + sku_tier = "Standard" + subnet_name = "clustermesh-1-node" + pod_subnet_name = "clustermesh-1-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-2" + aks_name = "clustermesh-2" + sku_tier = "Standard" + subnet_name = "clustermesh-2-node" + pod_subnet_name = "clustermesh-2-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-3" + aks_name = "clustermesh-3" + sku_tier = "Standard" + subnet_name = "clustermesh-3-node" + pod_subnet_name = "clustermesh-3-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-4" + aks_name = "clustermesh-4" + sku_tier = "Standard" + subnet_name = "clustermesh-4-node" + pod_subnet_name = "clustermesh-4-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-5" + aks_name = "clustermesh-5" + sku_tier = "Standard" + subnet_name = "clustermesh-5-node" + pod_subnet_name = "clustermesh-5-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = 
"enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-6" + aks_name = "clustermesh-6" + sku_tier = "Standard" + subnet_name = "clustermesh-6-node" + pod_subnet_name = "clustermesh-6-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-7" + aks_name = "clustermesh-7" + sku_tier = "Standard" + subnet_name = "clustermesh-7-node" + pod_subnet_name = "clustermesh-7-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-8" + aks_name = "clustermesh-8" + sku_tier = "Standard" + subnet_name = "clustermesh-8-node" + pod_subnet_name = "clustermesh-8-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-9" + aks_name = "clustermesh-9" + sku_tier = "Standard" + subnet_name = "clustermesh-9-node" + pod_subnet_name = "clustermesh-9-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", 
value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-10" + aks_name = "clustermesh-10" + sku_tier = "Standard" + subnet_name = "clustermesh-10-node" + pod_subnet_name = "clustermesh-10-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-11" + aks_name = "clustermesh-11" + sku_tier = "Standard" + subnet_name = "clustermesh-11-node" + pod_subnet_name = "clustermesh-11-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-12" + aks_name = "clustermesh-12" + sku_tier = "Standard" + subnet_name = "clustermesh-12-node" + pod_subnet_name = "clustermesh-12-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-13" + aks_name = "clustermesh-13" + sku_tier = "Standard" + subnet_name = "clustermesh-13-node" + pod_subnet_name = "clustermesh-13-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-14" + aks_name = "clustermesh-14" + sku_tier = "Standard" + subnet_name = "clustermesh-14-node" + pod_subnet_name = "clustermesh-14-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = 
"azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-15" + aks_name = "clustermesh-15" + sku_tier = "Standard" + subnet_name = "clustermesh-15-node" + pod_subnet_name = "clustermesh-15-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-16" + aks_name = "clustermesh-16" + sku_tier = "Standard" + subnet_name = "clustermesh-16-node" + pod_subnet_name = "clustermesh-16-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-17" + aks_name = "clustermesh-17" + sku_tier = "Standard" + subnet_name = "clustermesh-17-node" + pod_subnet_name = "clustermesh-17-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-18" + aks_name = "clustermesh-18" + sku_tier = "Standard" + subnet_name = "clustermesh-18-node" + pod_subnet_name = "clustermesh-18-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + 
auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-19" + aks_name = "clustermesh-19" + sku_tier = "Standard" + subnet_name = "clustermesh-19-node" + pod_subnet_name = "clustermesh-19-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-20" + aks_name = "clustermesh-20" + sku_tier = "Standard" + subnet_name = "clustermesh-20-node" + pod_subnet_name = "clustermesh-20-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + } +] + +# ============================================================================= +# Fleet + ClusterMesh +# ============================================================================= +vnet_peering_config = { + enabled = true +} + +fleet_config = { + enabled = true + fleet_name = "clustermesh-flt" + cmp_name = "clustermesh-cmp" + member_label_key = "mesh" + member_label_value = "true" + members = [ + { member_name = "mesh-1", aks_role = "mesh-1" }, + { member_name = "mesh-2", aks_role = "mesh-2" }, + { member_name = "mesh-3", aks_role = "mesh-3" }, + { member_name = "mesh-4", aks_role = "mesh-4" }, + { member_name = "mesh-5", aks_role = "mesh-5" }, + { member_name = "mesh-6", aks_role = "mesh-6" }, + { member_name = "mesh-7", aks_role = "mesh-7" }, + { member_name = "mesh-8", aks_role = "mesh-8" }, + { member_name = "mesh-9", aks_role = "mesh-9" }, + { member_name = "mesh-10", aks_role = "mesh-10" }, + { member_name = "mesh-11", aks_role = "mesh-11" }, + { member_name = "mesh-12", aks_role = "mesh-12" }, + { member_name = "mesh-13", aks_role = "mesh-13" }, + { member_name = "mesh-14", aks_role = "mesh-14" }, + { member_name = "mesh-15", aks_role = "mesh-15" }, + { member_name = "mesh-16", aks_role = "mesh-16" }, + { member_name = "mesh-17", aks_role = "mesh-17" }, + { member_name = "mesh-18", aks_role = "mesh-18" }, + { member_name = "mesh-19", aks_role = "mesh-19" }, + { member_name = "mesh-20", aks_role = "mesh-20" } + ] +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars new file mode 100644 index 0000000000..d36788938a --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-5.tfvars @@ -0,0 +1,314 @@ +scenario_type = "perf-eval" 
+scenario_name = "clustermesh-scale" +deletion_delay = "4h" +owner = "aks" + +# ============================================================================= +# ClusterMesh Scale Test — 5 cluster tier +# +# Same shape as azure-2.tfvars (see that file for full sizing rationale on +# pod CIDR, max-pods, prompool, etc.). This file scales the cluster count +# only; per-cluster sizing is identical to the n2 tier so cluster-count is +# the only variable when comparing tier results. +# +# Generated topology: +# - 5 VNets (one per cluster) at 10..0.0/16, id=1..5 +# - 5 AKS clusters (Cilium+ACNS, Azure CNI w/ pod subnet) +# - 20 VNet peering links (N*(N-1) at separate-VNet mode) +# - 5 Fleet members (label mesh=true) + 1 clustermeshprofile +# +# Subscription footprint per run (20-node baseline per spec line 24): +# - default pool: 5 clusters x 20 nodes x D4s_v3 (4 vCPU) = 400 vCPU (DSv3 family) +# - prompool: 5 clusters x 1 node x D8s_v3 (8 vCPU) = 40 vCPU (DSv3 family) +# - total DSv3 compute: 440 vCPU +# Verify region quota before first run (DSv3 limit is typically 5000 vCPU +# in eastus2euap; check `az vm list-usage --location eastus2euap`). +# ============================================================================= + +network_config_list = [ + { + role = "mesh-1" + vnet_name = "clustermesh-1-vnet" + vnet_address_space = "10.1.0.0/16" + subnet = [ + { + name = "clustermesh-1-node" + address_prefix = "10.1.0.0/24" + }, + { + name = "clustermesh-1-pod" + address_prefix = "10.1.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-2" + vnet_name = "clustermesh-2-vnet" + vnet_address_space = "10.2.0.0/16" + subnet = [ + { + name = "clustermesh-2-node" + address_prefix = "10.2.0.0/24" + }, + { + name = "clustermesh-2-pod" + address_prefix = "10.2.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-3" + vnet_name = "clustermesh-3-vnet" + vnet_address_space = "10.3.0.0/16" + subnet = [ + { + name = "clustermesh-3-node" + address_prefix = "10.3.0.0/24" + }, + { + name = "clustermesh-3-pod" + address_prefix = "10.3.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-4" + vnet_name = "clustermesh-4-vnet" + vnet_address_space = "10.4.0.0/16" + subnet = [ + { + name = "clustermesh-4-node" + address_prefix = "10.4.0.0/24" + }, + { + name = "clustermesh-4-pod" + address_prefix = "10.4.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-5" + vnet_name = "clustermesh-5-vnet" + vnet_address_space = "10.5.0.0/16" + subnet = [ + { + name = "clustermesh-5-node" + address_prefix = "10.5.0.0/24" + }, + { + name = "clustermesh-5-pod" + address_prefix = "10.5.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_cli_config_list = [ + { + role = "mesh-1" + aks_name = "clustermesh-1" + sku_tier = "Standard" + subnet_name = "clustermesh-1-node" + pod_subnet_name = "clustermesh-1-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false 
+ vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-2" + aks_name = "clustermesh-2" + sku_tier = "Standard" + subnet_name = "clustermesh-2-node" + pod_subnet_name = "clustermesh-2-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-3" + aks_name = "clustermesh-3" + sku_tier = "Standard" + subnet_name = "clustermesh-3-node" + pod_subnet_name = "clustermesh-3-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-4" + aks_name = "clustermesh-4" + sku_tier = "Standard" + subnet_name = "clustermesh-4-node" + pod_subnet_name = "clustermesh-4-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-5" + aks_name = "clustermesh-5" + sku_tier = "Standard" + subnet_name = "clustermesh-5-node" + pod_subnet_name = "clustermesh-5-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 20 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + } +] + +# ============================================================================= +# Fleet + ClusterMesh +# 
============================================================================= +vnet_peering_config = { + enabled = true +} + +fleet_config = { + enabled = true + fleet_name = "clustermesh-flt" + cmp_name = "clustermesh-cmp" + member_label_key = "mesh" + member_label_value = "true" + members = [ + { member_name = "mesh-1", aks_role = "mesh-1" }, + { member_name = "mesh-2", aks_role = "mesh-2" }, + { member_name = "mesh-3", aks_role = "mesh-3" }, + { member_name = "mesh-4", aks_role = "mesh-4" }, + { member_name = "mesh-5", aks_role = "mesh-5" } + ] +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-10.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-10.json new file mode 100644 index 0000000000..0e2fd02aef --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-10.json @@ -0,0 +1,4 @@ +{ + "run_id": "cmesh10test", + "region": "westus2" +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-20.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-20.json new file mode 100644 index 0000000000..fab49e54a0 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-20.json @@ -0,0 +1,4 @@ +{ + "run_id": "cmesh20test", + "region": "westus2" +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-5.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-5.json new file mode 100644 index 0000000000..6604113763 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-5.json @@ -0,0 +1,4 @@ +{ + "run_id": "cmesh5test", + "region": "westus2" +} diff --git a/steps/engine/clusterloader2/clustermesh-scale/collect.yml b/steps/engine/clusterloader2/clustermesh-scale/collect.yml index 6a879a2c58..f6684d297c 100644 --- a/steps/engine/clusterloader2/clustermesh-scale/collect.yml +++ b/steps/engine/clusterloader2/clustermesh-scale/collect.yml @@ -26,55 +26,213 @@ steps: export MESH_SIZE="${MESH_SIZE:-$CLUSTERMESH_COUNT}" export TEST_TYPE="${TEST_TYPE:-default-config}" export TRIGGER_REASON="${TRIGGER_REASON:-$BUILD_REASON}" + # Phase 4a — pod-churn knobs recorded in each JSONL row so Kusto can + # filter/group on the exact stressor parameters. Non-churn matrix + # entries leave these unset → fall back to 0/"" defaults that + # scale.py collect treats as "not a churn run". + export CL2_CHURN_CYCLES="${CHURN_CYCLES:-0}" + export CL2_CHURN_UP_DURATION="${CHURN_UP_DURATION:-}" + export CL2_CHURN_DOWN_DURATION="${CHURN_DOWN_DURATION:-}" + export CL2_KILL_DURATION_SECONDS="${KILL_DURATION_SECONDS:-0}" + export CL2_KILL_INTERVAL_SECONDS="${KILL_INTERVAL_SECONDS:-0}" + export CL2_KILL_BATCH="${KILL_BATCH:-0}" + # Phase 4b — Scenario #5 (Multi-Cluster Failure Isolation) target context. + # Reused from scenario #4 by convention; used here to special-case the + # per-cluster churn knobs (only the target row carries non-zero kill + # values; peer rows carry zeros even though the share-infra scenario + # was configured with churn knobs). + export CL2_APISERVER_KILL_TARGET_CONTEXT="${APISERVER_KILL_TARGET_CONTEXT:-clustermesh-1}" + # Phase 4b — Scenario #6 (Upper Bound / Saturation) collect knobs. + # Default to empty string so non-saturation test_types skip the + # classifier entirely (zero overhead). For upper-bound test_types, + # the matrix sets these → scale.py collect emits SaturationRung + + # SaturationSummary rows tagging which signal tripped per rung. 
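+          # (Hypothetical shape, for orientation only; the real values come
+          # from the matrix. Something like SATURATION_QPS_LIST="5,10,20,40"
+          # paired with SATURATION_RESTARTS_LIST="0,0,1,3" would describe
+          # four rungs, with the restart signal tripping from rung 3 on.)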
+ export CL2_SATURATION_QPS_LIST="${SATURATION_QPS_LIST:-}" + export CL2_SATURATION_RESTARTS_LIST="${SATURATION_RESTARTS_LIST:-}" clusters=$(cat "$HOME/.kube/clustermesh-clusters.json") cluster_count=$(echo "$clusters" | jq 'length') # Aggregate every per-cluster JSONL into a single TEST_RESULTS_FILE. - # Each line carries `cluster: ` so downstream Kusto queries can - # group/filter by cluster across the mesh. + # Each line carries `cluster: ` and `test_type: ` so + # downstream Kusto queries can group/filter by cluster AND scenario + # across the mesh. mkdir -p "$(dirname "$TEST_RESULTS_FILE")" : > "$TEST_RESULTS_FILE" - for row in $(echo "$clusters" | jq -c '.[]'); do - role=$(echo "$row" | jq -r '.role') - report_dir="${CL2_REPORT_DIR}/${role}" - - if [ ! -d "$report_dir" ]; then - echo "##vso[task.logissue type=warning;] $role: missing report dir $report_dir, skipping" - continue + # Helper: collect one (scenario, cluster) pair. Args: + # $1 scenario name (also used as test_type) + # $2 cluster role + # $3 per-cluster CL2 report dir (already includes scenario subdir + # in share-infra mode; just / in single + # scenario mode) + # $4 result file path + # $5 churn_cycles value (0 to record "not a churn scenario") + # $6 churn_up_duration value ("" to record "not a churn scenario") + # $7 churn_down_duration value + # $8 kill_duration_seconds value + # $9 kill_interval_seconds value + # $10 kill_batch value + # $11 scenario_start_timestamp value + # $12 saturation_qps_list value ("" for non-saturation scenarios) + # $13 saturation_restarts_list value ("" for non-saturation scenarios) + collect_one() { + local _scen="$1" _role="$2" _report="$3" _out="$4" + local _cc="$5" _cu="$6" _cd="$7" _kds="$8" _kis="$9" _kb="${10}" _st="${11}" + local _sqps="${12:-}" _sres="${13:-}" + if [ ! -d "$_report" ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: missing report dir $_report, skipping" + return 1 fi - - # If CL2 errored out before producing junit.xml (e.g. prometheus stack - # setup timeout), skip aggregation for this cluster — scale.py collect - # would crash on the missing file. The execute step already logged a - # warning per-cluster; we don't want to also abort the whole pipeline - # at collect time when partial data may be useful. - if [ ! -f "$report_dir/junit.xml" ]; then - echo "##vso[task.logissue type=warning;] $role: $report_dir/junit.xml not found (CL2 likely failed); skipping collect for this cluster" - continue + if [ ! 
-f "$_report/junit.xml" ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: $_report/junit.xml not found (CL2 likely failed); skipping collect" + return 1 fi - - per_cluster_result="${TEST_RESULTS_FILE%.*}.${role}.${TEST_RESULTS_FILE##*.}" - + local _rc=0 PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ - --cl2_report_dir "$report_dir" \ + --cl2_report_dir "$_report" \ --cloud_info "${CLOUD_INFO:-}" \ --run_id "$RUN_ID" \ --run_url "$RUN_URL" \ - --result_file "$per_cluster_result" \ - --start_timestamp "$START_TIME" \ - --cluster-name "$role" \ + --result_file "$_out" \ + --start_timestamp "$_st" \ + --cluster-name "$_role" \ --cluster-count "$cluster_count" \ --mesh-size "$MESH_SIZE" \ - --test_type "$TEST_TYPE" \ + --test_type "$_scen" \ --namespaces "$CL2_NAMESPACES" \ --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \ --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \ - --trigger_reason "${TRIGGER_REASON:-}" + --churn-cycles "$_cc" \ + --churn-up-duration "$_cu" \ + --churn-down-duration "$_cd" \ + --kill-duration-seconds "$_kds" \ + --kill-interval-seconds "$_kis" \ + --kill-batch "$_kb" \ + --saturation-qps-list "$_sqps" \ + --saturation-restarts-list "$_sres" \ + --trigger_reason "${TRIGGER_REASON:-}" || _rc=$? + if [ "$_rc" -ne 0 ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: scale.py collect exited $_rc; skipping aggregation" + return 1 + fi + if [ ! -f "$_out" ]; then + echo "##vso[task.logissue type=warning;] $_scen/$_role: per-cluster result file $_out missing after collect; skipping" + return 1 + fi + return 0 + } + + # Helper: set the 7 collect arg vars (cc/cu/cd/kds/kis/kb/st) for a + # given scenario name. For pod-churn-* scenarios, use the matrix-exported + # CL2_CHURN_* / CL2_KILL_* values directly. For non-churn scenarios + # (event-throughput, default-config), emit zeros/empties so the JSONL + # doesn't mis-tag those rows. + # + # Implementation note: an earlier version used `IFS=$'\t' read` to parse + # tab-separated values from a printf string. That was buggy because tab + # is whitespace-IFS and bash collapses consecutive tabs into a single + # delimiter — non-churn scenarios (which had empty cu/cd fields) ended + # up with shifted values. Direct assignment avoids that pitfall. + # + # Also sets sqps/sres for upper-bound (Scenario #6). These vars are + # passed to collect_one as $12/$13; saturation classifier in scale.py + # collect skips when sqps is empty (non-upper-bound scenarios). + set_churn_args_for_scenario() { + local _scen="$1" _st="$2" + case "$_scen" in + pod-churn-*) + cc="$CL2_CHURN_CYCLES" + cu="$CL2_CHURN_UP_DURATION" + cd_v="$CL2_CHURN_DOWN_DURATION" + kds="$CL2_KILL_DURATION_SECONDS" + kis="$CL2_KILL_INTERVAL_SECONDS" + kb="$CL2_KILL_BATCH" + sqps="" + sres="" + ;; + upper-bound) + cc=0 + cu="" + cd_v="" + kds=0 + kis=0 + kb=0 + sqps="$CL2_SATURATION_QPS_LIST" + sres="$CL2_SATURATION_RESTARTS_LIST" + ;; + *) + cc=0 + cu="" + cd_v="" + kds=0 + kis=0 + kb=0 + sqps="" + sres="" + ;; + esac + st="$_st" + } - cat "$per_cluster_result" >> "$TEST_RESULTS_FILE" - done + # Share-infra mode: SHARE_INFRA_META is a JSON array of + # {scenario, start_timestamp} produced by execute.yml. Iterate + # per-scenario × per-cluster, aggregating ALL rows into one blob with + # per-row test_type attribution. 
+ if [ -n "${SHARE_INFRA_META:-}" ] && [ -f "$SHARE_INFRA_META" ]; then + echo "Share-infra collect: reading scenarios from $SHARE_INFRA_META" + scenarios_json=$(cat "$SHARE_INFRA_META") + for sn in $(echo "$scenarios_json" | jq -c '.[]'); do + SCENARIO=$(echo "$sn" | jq -r '.scenario') + SCENARIO_START=$(echo "$sn" | jq -r '.start_timestamp') + echo "----- collecting scenario: $SCENARIO (start=$SCENARIO_START) -----" + set_churn_args_for_scenario "$SCENARIO" "$SCENARIO_START" + for row in $(echo "$clusters" | jq -c '.[]'); do + role=$(echo "$row" | jq -r '.role') + name=$(echo "$row" | jq -r '.name') + report_dir="${CL2_REPORT_DIR}/${SCENARIO}/${role}" + per_cluster_result="${TEST_RESULTS_FILE%.*}.${SCENARIO}.${role}.${TEST_RESULTS_FILE##*.}" + # Phase 4b — Scenario #5 (Isolation) per-cluster churn-knob + # override: only the TARGET cluster's row gets actual kill knobs; + # peer rows stay at zeros (default). This honestly represents + # "kill duration/interval/batch describe what THIS cluster did", + # not "what the scenario was configured to do globally". + # + # The matrix-exported APISERVER_KILL_TARGET_CONTEXT (default + # clustermesh-1) is compared against the cluster's `name` field + # from the discovered-clusters JSON (AKS resource name = kubectl + # context name set by `az aks get-credentials`). + cc_row="$cc"; cu_row="$cu"; cd_row="$cd_v" + kds_row="$kds"; kis_row="$kis"; kb_row="$kb" + if [ "$SCENARIO" = "isolation" ] && [ "$name" = "$CL2_APISERVER_KILL_TARGET_CONTEXT" ]; then + cc_row=0 + cu_row="" + cd_row="" + kds_row="$CL2_KILL_DURATION_SECONDS" + kis_row="$CL2_KILL_INTERVAL_SECONDS" + kb_row="$CL2_KILL_BATCH" + fi + if collect_one "$SCENARIO" "$role" "$report_dir" "$per_cluster_result" \ + "$cc_row" "$cu_row" "$cd_row" "$kds_row" "$kis_row" "$kb_row" "$st" \ + "$sqps" "$sres"; then + cat "$per_cluster_result" >> "$TEST_RESULTS_FILE" + fi + done + done + else + # Single-scenario mode (prod path — unchanged behavior). + set_churn_args_for_scenario "$TEST_TYPE" "$START_TIME" + for row in $(echo "$clusters" | jq -c '.[]'); do + role=$(echo "$row" | jq -r '.role') + report_dir="${CL2_REPORT_DIR}/${role}" + per_cluster_result="${TEST_RESULTS_FILE%.*}.${role}.${TEST_RESULTS_FILE##*.}" + if collect_one "$TEST_TYPE" "$role" "$report_dir" "$per_cluster_result" \ + "$cc" "$cu" "$cd_v" "$kds" "$kis" "$kb" "$st" \ + "$sqps" "$sres"; then + cat "$per_cluster_result" >> "$TEST_RESULTS_FILE" + fi + done + fi echo "Aggregated results from $cluster_count clusters into $TEST_RESULTS_FILE" wc -l "$TEST_RESULTS_FILE" || true diff --git a/steps/engine/clusterloader2/clustermesh-scale/execute.yml b/steps/engine/clusterloader2/clustermesh-scale/execute.yml index cd82bc2d70..fc99f552aa 100644 --- a/steps/engine/clusterloader2/clustermesh-scale/execute.yml +++ b/steps/engine/clusterloader2/clustermesh-scale/execute.yml @@ -40,6 +40,48 @@ steps: export CL2_HOLD_DURATION="$HOLD_DURATION" export CL2_WARMUP_DURATION="$WARMUP_DURATION" export CL2_RESTART_GENERATION="$RESTART_COUNT" + # Phase 4a — Scenario #2 (Pod Churn Stress) knobs. Shell defaults so + # matrix entries that don't set these (event-throughput, default-config) + # silently fall back to the documented Phase 4a defaults rather than + # passing empty strings to argparse type=int. Pod-churn matrix entries + # set these explicitly via auto-exported uppercase matrix vars. 
+ export CL2_CHURN_CYCLES="${CHURN_CYCLES:-5}" + export CL2_CHURN_UP_DURATION="${CHURN_UP_DURATION:-60s}" + export CL2_CHURN_DOWN_DURATION="${CHURN_DOWN_DURATION:-60s}" + export CL2_KILL_DURATION="${KILL_DURATION:-10m}" + export CL2_KILL_INTERVAL_SECONDS="${KILL_INTERVAL_SECONDS:-10}" + export CL2_KILL_BATCH="${KILL_BATCH:-5}" + export CL2_KILL_DURATION_SECONDS="${KILL_DURATION_SECONDS:-600}" + export CL2_KILL_JOB_DEADLINE_SECONDS="${KILL_JOB_DEADLINE_SECONDS:-660}" + # Phase 4b — Scenario #4 (ClusterMesh APIServer Failure) knobs. + export CL2_APISERVER_KILL_TARGET_CONTEXT="${APISERVER_KILL_TARGET_CONTEXT:-clustermesh-1}" + export CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS="${APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS:-240}" + export CL2_APISERVER_KILL_OBSERVATION_SECONDS="${APISERVER_KILL_OBSERVATION_SECONDS:-60}" + # Phase 4b — Scenario #7 (HA Configuration Validation) knob. + export CL2_HA_CONFIG_REPLICAS="${HA_CONFIG_REPLICAS:-3}" + # Phase 4b — Scenario #3 (Node Churn / IP Churn) knobs. + # node-churner.sh (driven from this script, NOT Method:Exec — see + # config/node-churner.sh header for the design rationale) consumes + # these directly. scale.py configure also writes them into overrides.yaml + # so CL2 templates that reference CL2_NODE_CHURN_* can use them. + export CL2_NODE_CHURN_TARGET_CONTEXT="${NODE_CHURN_TARGET_CONTEXT:-${APISERVER_KILL_TARGET_CONTEXT:-clustermesh-1}}" + export CL2_NODE_CHURN_CYCLES="${NODE_CHURN_CYCLES:-3}" + export CL2_NODE_CHURN_DELTA="${NODE_CHURN_DELTA:-5}" + export CL2_NODE_CHURN_SETTLE_SECONDS="${NODE_CHURN_SETTLE_SECONDS:-60}" + export CL2_NODE_CHURN_SCALE_DURATION_SECONDS="${NODE_CHURN_SCALE_DURATION_SECONDS:-1800}" + export CL2_NODE_CHURN_REPLACE_DURATION_SECONDS="${NODE_CHURN_REPLACE_DURATION_SECONDS:-1500}" + export CL2_NODE_CHURN_COMBINED_DURATION_SECONDS="${NODE_CHURN_COMBINED_DURATION_SECONDS:-3300}" + export CL2_NODE_REPLACE_BATCH_SIZE="${NODE_REPLACE_BATCH_SIZE:-10}" + export CL2_NODE_CHURN_READY_TIMEOUT_SECONDS="${NODE_CHURN_READY_TIMEOUT_SECONDS:-300}" + # Phase 4b — Scenario #6 (Upper Bound / Saturation) knobs. + # upper-bound.yaml consumes these via CL2's DefaultParam template + # func; non-saturation scenarios ignore them. Defaults mirror + # scale.py configure's defaults so a forgotten matrix var falls + # through to the documented 5-rung sweep at 100/500/1500/4000/10000 QPS. + export CL2_SATURATION_QPS_LIST="${SATURATION_QPS_LIST:-100,500,1500,4000,10000}" + export CL2_SATURATION_RESTARTS_LIST="${SATURATION_RESTARTS_LIST:-2,4,8,15,25}" + export CL2_SATURATION_RUNG_DURATION_SECONDS="${SATURATION_RUNG_DURATION_SECONDS:-240}" + export CL2_SATURATION_SETTLE_SECONDS="${SATURATION_SETTLE_SECONDS:-90}" # Same discovery pattern as topology/clustermesh-scale/validate-resources.yml. # We re-run it here rather than relying on a step variable so this engine @@ -58,7 +100,25 @@ steps: echo "Running CL2 across $cluster_count clusters" mkdir -p "$HOME/.kube" - echo "$clusters" > "$HOME/.kube/clustermesh-clusters.json" + # Pre-fetch all kubeconfigs sequentially. This is fast (<5s/cluster) and + # keeps the parallel CL2 fan-out below from racing on `az aks + # get-credentials` writes to ~/.azure (MSAL token cache shared across + # all subsequent CL2 docker containers). 
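+      # Shape of each discovered-cluster row consumed below (role/name pairing
+      # per the tfvars; the rg value comes from discovery and is elided here):
+      #   {"role": "mesh-1", "name": "clustermesh-1", "rg": "<resource-group>"}
+      # After the jq augmentation below, each row additionally carries
+      #   "kubeconfig": "$HOME/.kube/mesh-1.config"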
+ for row in $(echo "$clusters" | jq -c '.[]'); do + name=$(echo "$row" | jq -r '.name') + rg=$(echo "$row" | jq -r '.rg') + role=$(echo "$row" | jq -r '.role') + kubeconfig="$HOME/.kube/$role.config" + KUBECONFIG="$kubeconfig" az aks get-credentials \ + --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors + done + + # Augment clusters JSON with the per-cluster kubeconfig path, then write + # the file consumed by both this step (for parallel fan-out) and + # collect.yml (which only reads role/name/rg and ignores extra fields). + clusters_with_kubeconfig=$(echo "$clusters" | jq --arg home "$HOME" \ + '[.[] | . + {kubeconfig: ($home + "/.kube/" + .role + ".config")}]') + echo "$clusters_with_kubeconfig" > "$HOME/.kube/clustermesh-clusters.json" echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$cluster_count" # CL2 overrides are written once — params are identical for every cluster @@ -68,128 +128,609 @@ steps: --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \ --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \ --operation-timeout "${CL2_OPERATION_TIMEOUT:-15m}" \ + --churn-cycles "$CL2_CHURN_CYCLES" \ + --churn-up-duration "$CL2_CHURN_UP_DURATION" \ + --churn-down-duration "$CL2_CHURN_DOWN_DURATION" \ + --kill-duration "$CL2_KILL_DURATION" \ + --kill-interval-seconds "$CL2_KILL_INTERVAL_SECONDS" \ + --kill-batch "$CL2_KILL_BATCH" \ + --kill-duration-seconds "$CL2_KILL_DURATION_SECONDS" \ + --kill-job-deadline-seconds "$CL2_KILL_JOB_DEADLINE_SECONDS" \ + --apiserver-kill-target-context "$CL2_APISERVER_KILL_TARGET_CONTEXT" \ + --apiserver-kill-recovery-timeout-seconds "$CL2_APISERVER_KILL_RECOVERY_TIMEOUT_SECONDS" \ + --apiserver-kill-observation-seconds "$CL2_APISERVER_KILL_OBSERVATION_SECONDS" \ + --ha-config-replicas "$CL2_HA_CONFIG_REPLICAS" \ + --node-churn-target-context "$CL2_NODE_CHURN_TARGET_CONTEXT" \ + --node-churn-cycles "$CL2_NODE_CHURN_CYCLES" \ + --node-churn-delta "$CL2_NODE_CHURN_DELTA" \ + --node-churn-settle-seconds "$CL2_NODE_CHURN_SETTLE_SECONDS" \ + --node-churn-scale-duration-seconds "$CL2_NODE_CHURN_SCALE_DURATION_SECONDS" \ + --node-churn-replace-duration-seconds "$CL2_NODE_CHURN_REPLACE_DURATION_SECONDS" \ + --node-churn-combined-duration-seconds "$CL2_NODE_CHURN_COMBINED_DURATION_SECONDS" \ + --node-replace-batch-size "$CL2_NODE_REPLACE_BATCH_SIZE" \ + --node-churn-ready-timeout-seconds "$CL2_NODE_CHURN_READY_TIMEOUT_SECONDS" \ + --saturation-qps-list "$CL2_SATURATION_QPS_LIST" \ + --saturation-restarts-list "$CL2_SATURATION_RESTARTS_LIST" \ + --saturation-rung-duration-seconds "$CL2_SATURATION_RUNG_DURATION_SECONDS" \ + --saturation-settle-seconds "$CL2_SATURATION_SETTLE_SECONDS" \ --cl2_override_file "${CL2_CONFIG_DIR}/overrides.yaml" - # Per-cluster CL2 fan-out — sequential. Each invocation writes its own - # report dir at ${CL2_REPORT_DIR}//, so collect.yml can iterate the - # same way and tag results with --cluster-name. - failures=0 - for row in $(echo "$clusters" | jq -c '.[]'); do - name=$(echo "$row" | jq -r '.name') - rg=$(echo "$row" | jq -r '.rg') - role=$(echo "$row" | jq -r '.role') + # Phase 4a — pre-stage kubectl into the CL2 config dir so the + # pod-churn-killer.sh script (invoked via Method: Exec from inside + # the CL2 docker container) has a working kubectl binary regardless + # of whether the CL2 image bundles one. 
The cl2_config_dir is + # bind-mounted by run_cl2_command at /root/perf-tests/clusterloader2/config, + # so $CL2_CONFIG_DIR/kubectl on the host becomes accessible at + # /root/perf-tests/clusterloader2/config/kubectl inside the container. + # + # Why this lives in execute.yml rather than the Dockerfile: we don't + # control the CL2 image build (ghcr.io/azure/clusterloader2). Method: + # Exec is the only host-side hook CL2 exposes inside a test run. + # AzDO agents have curl + internet egress to dl.k8s.io (Kubernetes' + # canonical release host). + # + # Non-fatal: a curl failure here logs a warning but does NOT abort + # the step. pod-churn-killer.sh's preflight check exits 127 if the + # binary is missing, which CL2 records as a single measurement + # failure — scale-cycle data still lands cleanly. + if [ ! -x "${CL2_CONFIG_DIR}/kubectl" ]; then + KUBECTL_VERSION="${KUBECTL_VERSION:-v1.30.0}" + echo "Pre-staging kubectl ${KUBECTL_VERSION} for in-container use by Method: Exec scripts" + if curl -sfL -o "${CL2_CONFIG_DIR}/kubectl" \ + "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"; then + chmod 0755 "${CL2_CONFIG_DIR}/kubectl" + "${CL2_CONFIG_DIR}/kubectl" version --client=true --output=yaml | head -3 || true + else + echo "##vso[task.logissue type=warning;] kubectl pre-stage download failed; pod-churn kill phase will fail-soft (script's fallback path)" + rm -f "${CL2_CONFIG_DIR}/kubectl" + fi + fi - echo "====================================================================" - echo " Running CL2 on $role ($name)" - echo "====================================================================" + # Phase 4b — pre-pull CL2 docker image ONCE on the agent before + # parallel fan-out. Without this, scale.py execute-parallel spawns up + # to CL2_MAX_CONCURRENT (default 4, dev pipeline 8) `docker run` + # commands simultaneously, each of which independently pulls + # ghcr.io/azure/clusterloader2:. The parallel pull race against + # ghcr.io's anonymous-rate limit caused mesh-13's CL2 step to fail + # in build 67013 with `context deadline exceeded` on the token + # endpoint. Pre-pulling once means the parallel `docker run`s see + # the image cached locally and skip the pull entirely. + # + # Best-effort: `docker pull` failure here triggers a warning + lets + # the parallel-fanout retry on its own. Most runs will benefit from + # the cache hit; failures behave no worse than before. + echo "Pre-pulling CL2 image ${CL2_IMAGE} on the AzDO agent (sidesteps ghcr.io rate-limit race during parallel fanout)..." + if docker pull "${CL2_IMAGE}" 2>&1 | tail -5; then + echo "Pre-pull succeeded; subsequent docker runs will hit local cache" + else + echo "##vso[task.logissue type=warning;] CL2 image pre-pull failed; per-cluster CL2 invocations will each attempt their own pull (ghcr.io rate-limit risk persists)" + fi - kubeconfig="$HOME/.kube/$role.config" - KUBECONFIG="$kubeconfig" az aks get-credentials \ - --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors + # CL2 execution: single-scenario (default, prod path) or share-infra + # multi-scenario loop (dev pipeline iteration). See plan.md Phase 4b + # section for the design rationale. + # + # Gating env var SHARE_INFRA_SCENARIOS — comma-separated list of CL2 + # config basenames (e.g. "event-throughput,pod-churn-combined"). When + # set, each entry runs sequentially against the same provisioned + # clusters with a 60s settle between scenarios. test_type per row in + # the JSONL is each scenario's own basename. 
When unset, fall through + # to the single-scenario invocation that prod pipeline expects. + overall_rc=0 - report_dir="${CL2_REPORT_DIR}/${role}" - mkdir -p "$report_dir" - - cl2_passed=0 - # Run CL2; collect outcome WITHOUT failing the bash script (so we can - # also inspect junit.xml for internal test failures even when CL2 exits - # 0). Treat as "passed" only if BOTH: - # (a) junit.xml exists (CL2 actually completed and wrote a report) - # (b) junit.xml has zero / elements - # Without (b) we'd silently green-light runs where measurements failed - # — e.g. PodMonitor template substitution producing "", which - # k8s admission rejects but CL2 still writes junit with tags. - PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ - --cl2-image "${CL2_IMAGE}" \ - --cl2-config-dir "${CL2_CONFIG_DIR}" \ - --cl2-report-dir "$report_dir" \ - --cl2-config-file "${CL2_CONFIG_FILE}" \ - --kubeconfig "$kubeconfig" \ - --provider "${CLOUD}" \ - || true - if [ -f "$report_dir/junit.xml" ]; then - # Count failure/error attrs from . - junit_failures=$(grep -oE 'failures="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) - junit_errors=$(grep -oE 'errors="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) - junit_failures=${junit_failures:-0} - junit_errors=${junit_errors:-0} - if [ "$junit_failures" -eq 0 ] && [ "$junit_errors" -eq 0 ]; then - cl2_passed=1 - else - echo "##vso[task.logissue type=warning;] $role: junit.xml reports failures=$junit_failures errors=$junit_errors" - fi + # Scenarios that REQUIRE every cluster's CL2 (and its Prometheus + # scrape window) to overlap the target's stimulus window — bumping + # max_concurrent to mesh_size means all clusters start CL2 + # simultaneously. Used for: + # - isolation: target's pod-churn kill loop runs ON target; + # peer Prometheus must scrape concurrently to + # prove peers stay flat. + # - node-churn-*: stimulus is OUTSIDE CL2 (host-side az aks + # nodepool scale / vmss delete-instances). The + # readiness barrier in node-churner.sh requires + # all clusters' CL2 sentinels to land before + # node ops start — that's only possible if all + # CL2's are running concurrently. + # - upper-bound: saturation testing measures per-cluster + # failure point under aggregate mesh load. If + # peers don't load concurrently, each cluster's + # reading understates the real saturation curve + # (mesh-wide propagation is a function of N×load, + # not load×1). Plus: the in-run rung loop is + # not coordinated across clusters — we accept + # that rung-r on cluster A may overlap rung-(r±1) + # on cluster B in wall-time; the per-rung + # suffix in measurement filenames keeps the + # data attribution clean. + needs_mesh_wide_concurrency() { + local _scen="$1" + case "$_scen" in + isolation|node-churn-scale|node-churn-replace|node-churn-combined|upper-bound) + return 0 + ;; + esac + return 1 + } + + # Scenarios that drive their stimulus via node-churner.sh on the AzDO + # agent (NOT Method:Exec). The launcher returns the PID; the caller + # `wait`s after execute-parallel completes so the timing file is + # finalized before collect runs. + is_node_churn_scenario() { + case "$1" in + node-churn-scale|node-churn-replace|node-churn-combined) return 0 ;; + esac + return 1 + } + + # Scenario #6 (Upper Bound / Saturation) predicate. Used to gate the + # proactive failure-diag dump (runs unconditionally for upper-bound + # like for node-churn, NOT just on rc!=0). 
User direction 2026-05-14: + # be proactive about debug dumps until scenario is end-to-end green; + # remove the unconditional gate once the first n=2 + n=20 are clean. + is_upper_bound_scenario() { + case "$1" in + upper-bound) return 0 ;; + esac + return 1 + } + + # Sentinel dir bind-mounted into every CL2 container at + # /root/perf-tests/clusterloader2/config/sentinels (CL2_CONFIG_DIR is + # bind-mounted at /root/perf-tests/clusterloader2/config). Per-cluster + # CL2 writes ready- when it enters the measurement + # window; node-churner.sh polls for $cluster_count sentinel files + # before starting the first nodepool op. Cleared per scenario so + # stale sentinels from a previous scenario don't fool the barrier. + SENTINEL_DIR="${CL2_CONFIG_DIR}/sentinels" + mkdir -p "$SENTINEL_DIR" + + # Launch node-churner.sh for the named scenario; populates + # NODE_CHURNER_PID. Caller must: + # - mkdir -p the per-cluster target report dir BEFORE calling so + # the churner has a writable place for NodeChurnTimings_*.json + # - call `wait $NODE_CHURNER_PID` after execute-parallel returns + # - unset NODE_CHURNER_PID after wait + launch_node_churner() { + local _scen="$1" _report_dir_base="$2" + # Discover target cluster + kubeconfig from the augmented clusters + # JSON written to $HOME/.kube/clustermesh-clusters.json. The shell + # `$clusters` var in this script is the EARLY discovery output + # WITHOUT the kubeconfig field; using it here gave node-churner an + # empty TARGET_KUBECONFIG arg in build 67126. + local _all _target_role _target_row + _all=$(cat "$HOME/.kube/clustermesh-clusters.json" 2>/dev/null || echo "[]") + _target_role="${CL2_NODE_CHURN_TARGET_CONTEXT}" + # Map role → AKS name + RG. Our tfvars set aks_name == role-derived + # name (e.g., role=mesh-1 → name=clustermesh-1), and `az aks + # get-credentials` writes kubectl context = AKS name. So + # CL2_NODE_CHURN_TARGET_CONTEXT is the AKS cluster name. + _target_row=$(echo "$_all" | jq -c --arg n "$_target_role" '.[] | select(.name==$n)') + if [ -z "$_target_row" ]; then + # Fallback: maybe the user set NODE_CHURN_TARGET_CONTEXT to a role. + _target_row=$(echo "$_all" | jq -c --arg r "$_target_role" '.[] | select(.role==$r)') + fi + if [ -z "$_target_row" ]; then + echo "##vso[task.logissue type=warning;] node-churner: target cluster '${_target_role}' not found in discovered clusters; skipping scenario stimulus" + NODE_CHURNER_PID="" + return 0 fi + local _target_name _target_rg _target_role_field _target_kubeconfig + _target_name=$(echo "$_target_row" | jq -r '.name') + _target_rg=$(echo "$_target_row" | jq -r '.rg') + _target_role_field=$(echo "$_target_row" | jq -r '.role') + _target_kubeconfig=$(echo "$_target_row" | jq -r '.kubeconfig // ""') + + # Per-scenario expected duration (matches the CL2 sleep window). + local _expected_dur + case "$_scen" in + node-churn-scale) _expected_dur="$CL2_NODE_CHURN_SCALE_DURATION_SECONDS" ;; + node-churn-replace) _expected_dur="$CL2_NODE_CHURN_REPLACE_DURATION_SECONDS" ;; + node-churn-combined) _expected_dur="$CL2_NODE_CHURN_COMBINED_DURATION_SECONDS" ;; + *) _expected_dur=1500 ;; + esac + + # Clear sentinels for THIS scenario so the prior scenario's + # leftovers (if any) don't pre-trigger the barrier. + rm -f "$SENTINEL_DIR"/ready-* 2>/dev/null || true - if [ "$cl2_passed" -eq 1 ]; then - echo " $role: CL2 run succeeded" + # Target report dir for NodeChurnTimings_*.json. Pre-create so + # node-churner.sh can write even before CL2 finishes for that + # cluster (CL2 lazy-creates report dirs). 
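+    # Concrete example of the resolved path (follows from the tfvars role/name
+    # mapping): scenario=node-churn-scale with the default target context
+    # clustermesh-1 resolves to role mesh-1, so in share-infra mode the timing
+    # JSON lands at ${CL2_REPORT_DIR}/node-churn-scale/mesh-1/NodeChurnTimings_*.json.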
+ local _target_report_dir="${_report_dir_base}/${_target_role_field}" + mkdir -p "$_target_report_dir" + + local _churner_log="${_target_report_dir}/node-churner.log" + echo "===== node-churner launch: scenario=${_scen} target=${_target_name} rg=${_target_rg} =====" | tee -a "$_churner_log" + + # Background subshell. The churner's EXIT trap restores the pool to + # original count regardless of how the script exits; finalizer + # outcome (cleanup_failed) lands in the timing JSON. + ( + bash "$NODE_CHURNER_SCRIPT" \ + "$_scen" \ + "$_target_name" \ + "$_target_rg" \ + "default" \ + "$_target_report_dir" \ + "$SENTINEL_DIR" \ + "$cluster_count" \ + "$CL2_NODE_CHURN_CYCLES" \ + "$CL2_NODE_CHURN_DELTA" \ + "$CL2_NODE_CHURN_SETTLE_SECONDS" \ + "$CL2_NODE_REPLACE_BATCH_SIZE" \ + "$CL2_NODE_CHURN_READY_TIMEOUT_SECONDS" \ + "$_expected_dur" \ + "$_target_kubeconfig" 2>&1 | tee -a "$_churner_log" + ) & + NODE_CHURNER_PID=$! + echo "node-churner: launched PID=$NODE_CHURNER_PID for scenario=${_scen}; log=${_churner_log}" + } + + # Wait helper — caller invokes after execute-parallel returns. + wait_node_churner() { + local _scen="$1" + if [ -z "${NODE_CHURNER_PID:-}" ]; then + return 0 fi + echo "node-churner: waiting on PID=$NODE_CHURNER_PID for scenario=${_scen}" + local _rc=0 + wait "$NODE_CHURNER_PID" || _rc=$? + if [ "$_rc" -ne 0 ]; then + echo "##vso[task.logissue type=warning;] node-churner: scenario=${_scen} exited rc=${_rc}; check NodeChurnTimings_*.json for scenario_valid / cleanup_failed flags" + fi + NODE_CHURNER_PID="" + return 0 + } + + # Proactive failure-time debug dump — runs after every scenario + # (always for node-churn; on rc!=0 for others). Writes diagnostic + # state to /_debug/scenario-diag-.log so + # postmortem doesn't depend on AzDO retaining stdout. Captures: + # - per-cluster `kubectl get nodes` (Ready state, IPs) + # - per-cluster `kubectl -n kube-system get pods` (mesh + workload pods) + # - per-cluster `cilium clustermesh status` (mesh health) + # - clusters JSON snapshot + # - share-infra meta snapshot + # - node-churner.log + NodeChurnTimings_*.json contents (for node-churn) + # User direction 2026-05-14: assume failure; keep this dump baked + # in until end-to-end node-churn is green. + scenario_failure_diag() { + local _scen="$1" _rc="${2:-0}" + local _diag_dir="${CL2_REPORT_DIR}/_debug" + mkdir -p "$_diag_dir" + local _diag_log="${_diag_dir}/scenario-diag-${_scen}.log" + # Read augmented clusters JSON (has kubeconfig field) — the shell + # `$clusters` var earlier in this script is the EARLY discovery + # output WITHOUT kubeconfig. Build 67126 regression: using + # `$clusters` here caused _kc=null → kubectl context errors. + local _clusters_with_kc + _clusters_with_kc=$(cat "$HOME/.kube/clustermesh-clusters.json" 2>/dev/null || echo "[]") + { + echo "================================================================" + echo "=== scenario-failure-diag: scenario=${_scen} rc=${_rc}" + echo "=== timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" + echo "================================================================" + echo "" + echo "-- clusters JSON (kubeconfig-augmented) --" + echo "$_clusters_with_kc" | jq . 2>&1 || echo "$_clusters_with_kc" + echo "" + if [ -f "${SHARE_INFRA_META:-/nonexistent}" ]; then + echo "-- share-infra meta --" + jq . 
"$SHARE_INFRA_META" 2>&1 || cat "$SHARE_INFRA_META" + echo "" + fi + echo "-- per-cluster state --" + for _row in $(echo "$_clusters_with_kc" | jq -c '.[]'); do + local _role _name _kc + _role=$(echo "$_row" | jq -r '.role') + _name=$(echo "$_row" | jq -r '.name') + _kc=$(echo "$_row" | jq -r '.kubeconfig') + echo "--- cluster ${_role} (${_name}, kubeconfig=${_kc}) ---" + if [ ! -f "$_kc" ]; then + echo "(kubeconfig file missing: ${_kc})" + continue + fi + echo "-- nodes --" + KUBECONFIG="$_kc" kubectl --context "$_name" get nodes -o wide 2>&1 | head -40 || echo "(kubectl get nodes failed)" + echo "-- nodes providerID --" + KUBECONFIG="$_kc" kubectl --context "$_name" get nodes \ + -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.spec.providerID}{"\n"}{end}' 2>&1 | head -40 || true + echo "-- kube-system pods (clustermesh/cilium) --" + KUBECONFIG="$_kc" kubectl --context "$_name" -n kube-system get pods \ + -l 'k8s-app in (clustermesh-apiserver,cilium)' -o wide 2>&1 | head -20 || true + echo "-- recent kube-system events --" + KUBECONFIG="$_kc" kubectl --context "$_name" -n kube-system get events \ + --sort-by=.lastTimestamp 2>&1 | tail -20 || true + echo "" + done + echo "-- sentinel dir contents (${SENTINEL_DIR:-unset}) --" + ls -la "${SENTINEL_DIR:-/nonexistent}" 2>&1 || echo "(sentinel dir missing)" + echo "" + if is_node_churn_scenario "$_scen"; then + echo "-- node-churn timing files + logs --" + find "${CL2_REPORT_DIR}/${_scen}" -name 'NodeChurnTimings_*.json' \ + -o -name 'node-churner*.log' 2>/dev/null | while IFS= read -r _f; do + echo "--- ${_f} ---" + cat "$_f" 2>&1 || true + echo "" + done + fi + if is_upper_bound_scenario "$_scen"; then + echo "-- upper-bound scenario state --" + echo "-- CL2_SATURATION_* env (as passed into CL2) --" + env | grep -E '^CL2_SATURATION_' 2>&1 || echo "(no CL2_SATURATION_* env vars)" + echo "" + echo "-- rendered overrides.yaml (CL2 sees this — verifies scale.py configure landed the saturation knobs) --" + if [ -f "${CL2_CONFIG_DIR}/overrides.yaml" ]; then + grep -E '^CL2_(SATURATION|NAMESPACES|DEPLOYMENTS|REPLICAS)' "${CL2_CONFIG_DIR}/overrides.yaml" 2>&1 || true + else + echo "(${CL2_CONFIG_DIR}/overrides.yaml does not exist)" + fi + echo "" + # Per-cluster: which rung measurement files made it to disk? + # If a rung is missing entirely, the classifier flags rung_completed=false; + # this dump tells postmortem WHY (e.g. CL2 timed out mid-rung, + # Prometheus pod was Pending, restart-burst hung). + for _row in $(echo "$_clusters_with_kc" | jq -c '.[]'); do + local _role _name _kc + _role=$(echo "$_row" | jq -r '.role') + _name=$(echo "$_row" | jq -r '.name') + _kc=$(echo "$_row" | jq -r '.kubeconfig') + # Single-scenario mode: report dir is /. + # Share-infra mode: //. Try both. + local _report_dir="${CL2_REPORT_DIR}/${_scen}/${_role}" + if [ ! -d "$_report_dir" ]; then + _report_dir="${CL2_REPORT_DIR}/${_role}" + fi + echo "--- cluster ${_role} (${_name}) report dir: ${_report_dir} ---" + echo "-- per-rung measurement file counts --" + for _rung in 0 1 2 3 4 5 6 7; do + # CL2 emits filenames like "GenericPrometheusQuery Rung__.json" + # with a SPACE between method and metric name (build 67211 verified). + # Match both space and legacy underscore conventions via "GenericPrometheusQuery*". 
+ local _count + _count=$(find "${_report_dir}" -maxdepth 1 -name "GenericPrometheusQuery*Rung${_rung}_*.json" 2>/dev/null | wc -l) + if [ "$_count" -gt 0 ]; then + echo " Rung${_rung}: ${_count} measurement files" + fi + done + echo "-- junit.xml (CL2 phase pass/fail per rung) --" + if [ -f "${_report_dir}/junit.xml" ]; then + head -200 "${_report_dir}/junit.xml" 2>&1 || true + else + echo "(no junit.xml — CL2 likely failed before gathering measurements)" + fi + echo "-- monitoring/prometheus pod status (saturation can OOM Prom) --" + if [ -f "$_kc" ]; then + KUBECONFIG="$_kc" kubectl --context "$_name" -n monitoring get pods \ + -o wide 2>&1 | head -20 || echo "(kubectl get pods -n monitoring failed)" + echo "-- clustermesh-apiserver pod resource state (OOM/Restart signals) --" + KUBECONFIG="$_kc" kubectl --context "$_name" -n kube-system describe pod \ + -l 'k8s-app=clustermesh-apiserver' 2>&1 \ + | grep -E 'OOMKilled|Last State|Restart Count|Ready:' \ + | head -30 || true + else + echo "(kubeconfig missing: ${_kc})" + fi + echo "" + done + fi + echo "=== end scenario-failure-diag ===" + } 2>&1 | tee -a "$_diag_log" + echo "scenario-failure-diag: wrote ${_diag_log}" + } + + if [ -n "${SHARE_INFRA_SCENARIOS:-}" ]; then + # Trim whitespace from each entry, split on comma. + IFS=',' read -ra SCENARIO_LIST <<< "$SHARE_INFRA_SCENARIOS" + for i in "${!SCENARIO_LIST[@]}"; do + SCENARIO_LIST[$i]="$(echo "${SCENARIO_LIST[$i]}" | xargs)" + done - # Always-on log capture (spec line 35: "Logs: clustermesh-apiserver, - # agent watchers"). Files land in $report_dir/logs/ so they are - # uploaded alongside junit.xml + measurement results when the - # publish step runs. The same files double as immediate - # diagnostics for failed runs (see FAILURE DIAG block below). - log_dir="$report_dir/logs" - mkdir -p "$log_dir" - echo "------- $role: capturing pod logs to $log_dir -------" - # clustermesh-apiserver: all three containers (apiserver / etcd / - # kvstoremesh) — bounded tail, single pod expected. - for c in apiserver etcd kvstoremesh; do - KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ - -l k8s-app=clustermesh-apiserver -c "$c" --tail=4000 \ - > "$log_dir/clustermesh-apiserver-$c.log" 2>&1 || true + # Pre-validate: non-empty, all referenced config files exist, no + # blanks (catches trailing commas, whitespace-only entries). + if [ "${#SCENARIO_LIST[@]}" -eq 0 ]; then + echo "##vso[task.logissue type=error;] SHARE_INFRA_SCENARIOS is set but empty after split" + exit 1 + fi + for s in "${SCENARIO_LIST[@]}"; do + if [ -z "$s" ]; then + echo "##vso[task.logissue type=error;] SHARE_INFRA_SCENARIOS contains empty entry; got: '$SHARE_INFRA_SCENARIOS'" + exit 1 + fi + if [ ! -f "${CL2_CONFIG_DIR}/${s}.yaml" ]; then + echo "##vso[task.logissue type=error;] CL2 config file not found: ${CL2_CONFIG_DIR}/${s}.yaml (from SHARE_INFRA_SCENARIOS=$SHARE_INFRA_SCENARIOS)" + exit 1 + fi done - # cilium-agent: one pod per node — keep tail small to bound size. - KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ - -l k8s-app=cilium --tail=1000 --prefix=true \ - > "$log_dir/cilium-agent.log" 2>&1 || true - # cilium-operator: low-volume control plane. - KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ - -l io.cilium/app=operator --tail=2000 --prefix=true \ - > "$log_dir/cilium-operator.log" 2>&1 || true - - if [ "$cl2_passed" -ne 1 ]; then - # Dump enough state to distinguish prometheus-stack scheduling - # failures from CL2 logic failures. 
Prometheus is the most common - # culprit here — its pod requests 10Gi by default, doesn't fit on - # Standard_D4s_v4. If the pod is Pending with FailedScheduling, the - # describe events make that obvious. + + # Persist the validated scenario list + per-scenario start timestamps + # for downstream collect.yml. Written to the kubeconfig dir alongside + # clustermesh-clusters.json so it's deterministically discoverable. + SHARE_INFRA_META="$HOME/.kube/share-infra-meta.json" + echo "[]" > "$SHARE_INFRA_META" + + echo "=============================================" + echo "Share-infra mode: ${#SCENARIO_LIST[@]} scenarios in this lifecycle: ${SCENARIO_LIST[*]}" + echo "=============================================" + + for i in "${!SCENARIO_LIST[@]}"; do + SCENARIO="${SCENARIO_LIST[$i]}" + scenario_idx=$((i + 1)) + echo "=============================================" + echo "Scenario [${scenario_idx}/${#SCENARIO_LIST[@]}]: ${SCENARIO}" + echo "=============================================" + scenario_start=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + jq --arg name "$SCENARIO" --arg start "$scenario_start" \ + '. += [{"scenario": $name, "start_timestamp": $start}]' \ + "$SHARE_INFRA_META" > "${SHARE_INFRA_META}.tmp" && mv "${SHARE_INFRA_META}.tmp" "$SHARE_INFRA_META" + + # Per-scenario report dir so collect.yml can iterate per-scenario. + # tear_down_prometheus=True so each scenario gets a clean Prom deploy + # (rather than colliding with the previous scenario's leftover + # PodMonitor + scrape config). # - # Note: scale.py passes tear_down_prometheus=False so the stack - # survives this dump (otherwise CL2 would clean up before we look). - echo "------- $role: CL2 FAILURE DIAG -------" - echo "------- node allocatable / requested capacity -------" - KUBECONFIG="$kubeconfig" kubectl get nodes -o wide 2>&1 || true - KUBECONFIG="$kubeconfig" kubectl describe nodes 2>&1 | grep -A 4 "Allocatable\|Allocated resources" | head -40 || true + # Per-scenario max_concurrent override: + # The isolation scenario REQUIRES every peer cluster's Prometheus + # window to overlap the target's 10min churn window — otherwise + # peers in later batches start CL2 AFTER target's churn has + # ended and produce useless rows for the A/B. Bump concurrency to + # mesh_size (== cluster_count) for isolation. Safe at n=20 because + # peers SLEEP during the kill window — 1 heavy container + 19 + # idle ones easily fits the agent. Same override applies to + # node-churn-* scenarios: node-churner.sh's ready-sentinel + # barrier requires every cluster's CL2 to be running before the + # first nodepool op fires. + if needs_mesh_wide_concurrency "$SCENARIO"; then + EFFECTIVE_MAX_CONCURRENT="${cluster_count}" + echo "Scenario ${SCENARIO}: overriding max_concurrent ${CL2_MAX_CONCURRENT:-4} → ${EFFECTIVE_MAX_CONCURRENT} (mesh-wide observation required)" + else + EFFECTIVE_MAX_CONCURRENT="${CL2_MAX_CONCURRENT:-4}" + fi + # Launch the host-side stimulus driver for node-churn-* scenarios + # BEFORE execute-parallel so the churner is ready to consume CL2 + # sentinels as soon as the per-cluster CL2 containers start. 
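+          # node-churner.sh itself is outside this hunk; a minimal sketch of
+          # the ready-sentinel barrier it is assumed to implement before its
+          # first nodepool op (variable names illustrative — see
+          # config/node-churner.sh for the authoritative logic):
+          #
+          #   deadline=$(( $(date +%s) + READY_TIMEOUT_SECONDS ))
+          #   while [ "$(find "$SENTINEL_DIR" -name 'ready-*' 2>/dev/null | wc -l)" -lt "$EXPECTED_CLUSTERS" ]; do
+          #     if [ "$(date +%s)" -ge "$deadline" ]; then
+          #       exit 1   # barrier timeout → stimulus never fires, scenario_valid=false
+          #     fi
+          #     sleep 5
+          #   done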
+          NODE_CHURNER_PID=""
+          if is_node_churn_scenario "$SCENARIO"; then
+            launch_node_churner "$SCENARIO" "${CL2_REPORT_DIR}/${SCENARIO}"
+          fi
+          scenario_rc=0
+          PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute-parallel \
+            --clusters "$HOME/.kube/clustermesh-clusters.json" \
+            --max-concurrent "${EFFECTIVE_MAX_CONCURRENT}" \
+            --worker-script "$WORKER_SCRIPT" \
+            --cl2-image "${CL2_IMAGE}" \
+            --cl2-config-dir "${CL2_CONFIG_DIR}" \
+            --cl2-config-file "${SCENARIO}.yaml" \
+            --cl2-report-dir-base "${CL2_REPORT_DIR}/${SCENARIO}" \
+            --provider "${CLOUD}" \
+            --python-script-file "$PYTHON_SCRIPT_FILE" \
+            --python-workdir "$(pwd)" \
+            --tear-down-prometheus || scenario_rc=$?
-        echo "------- monitoring/* pods -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring get pods -o wide 2>&1 || true
+
+          # Join node-churner BEFORE finalizing scenario_rc — the churner's
+          # finalizer must complete (pool restored to original count) before
+          # the next scenario starts, otherwise the next CL2 invocation
+          # could run against an in-flux topology.
+          wait_node_churner "$SCENARIO"
-        echo "------- monitoring statefulsets -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring get statefulset -o wide 2>&1 || true
+
+          # Proactive failure debug dump (added 2026-05-14 after build 67114).
+          # User direction: assume failure, keep debug logs persistent across
+          # runs; remove only after green. Runs unconditionally for node-churn
+          # AND upper-bound scenarios (both have rich state worth dumping
+          # whether or not CL2 succeeded); runs only on rc!=0 for others.
+          if is_node_churn_scenario "$SCENARIO" || is_upper_bound_scenario "$SCENARIO" || [ "$scenario_rc" -ne 0 ]; then
+            scenario_failure_diag "$SCENARIO" "$scenario_rc"
+          fi
-        echo "------- Prometheus CR (operator input) -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring get prometheus -o yaml 2>&1 | head -80 || true
+
+          # Treat finalizer cleanup_failed as a hard fail of the share-infra
+          # loop — running additional scenarios against a half-scaled cluster
+          # would contaminate their data.
+          if is_node_churn_scenario "$SCENARIO"; then
+            # launch_node_churner writes NodeChurnTimings_*.json under the
+            # target's ROLE-named report dir (e.g. .../node-churn-scale/mesh-1/),
+            # while CL2_NODE_CHURN_TARGET_CONTEXT holds the AKS name
+            # (clustermesh-1) — so glob for the file instead of hard-coding a
+            # context-named path that never exists when role != AKS name.
+            _churner_timing_file=$(find "${CL2_REPORT_DIR}/${SCENARIO}" -name 'NodeChurnTimings_*.json' 2>/dev/null | head -1)
+            if [ -n "$_churner_timing_file" ] && [ -f "$_churner_timing_file" ]; then
+              _cleanup_failed=$(jq -r '.cleanup_failed // false' "$_churner_timing_file")
+              if [ "$_cleanup_failed" = "true" ]; then
+                echo "##vso[task.logissue type=error;] node-churner finalizer FAILED for ${SCENARIO}; aborting remaining share-infra scenarios to avoid contaminating their data on a half-scaled cluster"
+                overall_rc=1
+                break
+              fi
+            fi
+          fi
-        echo "------- prometheus-k8s pod describe -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring describe pod -l app.kubernetes.io/name=prometheus 2>&1 | tail -60 || true
+
+          if [ "$scenario_rc" -ne 0 ]; then
+            echo "##vso[task.logissue type=warning;] Scenario ${SCENARIO} exited rc=${scenario_rc}; subsequent scenarios will continue but the step's final exit reflects this failure"
+            overall_rc=$scenario_rc
+          fi
-        echo "------- prometheus-operator logs (tail 60) -------"
-        KUBECONFIG="$kubeconfig" kubectl -n monitoring logs -l app.kubernetes.io/name=prometheus-operator --tail=60 2>&1 || true
+
+          # Settle between scenarios — gives Cilium time to GC stale
+          # identities/endpoints/services from the previous scenario before
+          # the next scenario's measurement window begins. Last scenario
+          # skips the settle.
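+          # Illustrative spot-check during the settle (not executed by the
+          # pipeline; cilium-dbg identity list assumed available in-pod, like
+          # the cilium-dbg status probe used at validate time):
+          #   kubectl -n kube-system exec ds/cilium -- \
+          #     cilium-dbg identity list | wc -l   # should drain between scenarios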
+ if [ "$scenario_idx" -lt "${#SCENARIO_LIST[@]}" ]; then + echo "Settle 60s between scenarios (kvstore GC + identity slot cooldown)..." + sleep 60 + fi + done - echo "------- monitoring namespace events (recent) -------" - KUBECONFIG="$kubeconfig" kubectl -n monitoring get events --sort-by='.lastTimestamp' 2>&1 | tail -30 || true - echo "------- end CL2 FAILURE DIAG -------" + # Make the meta file available to collect.yml via a step variable — + # written as task.setvariable so the next step in the same job picks it up. + echo "##vso[task.setvariable variable=SHARE_INFRA_META]$SHARE_INFRA_META" - echo "##vso[task.logissue type=warning;] $role: CL2 run failed (junit missing or has failures/errors at $report_dir/junit.xml; continuing other clusters)" - failures=$((failures + 1)) + echo "=============================================" + echo "Share-infra summary: ${#SCENARIO_LIST[@]} scenarios processed, overall_rc=${overall_rc}" + echo "=============================================" + # Phase 4b: do NOT exit with non-zero on per-scenario failure. + # If we did, AzDO's default succeeded() gate on subsequent steps + # (collect + upload + destroy) would SKIP them and we'd lose ALL + # data even when most scenarios succeeded. Instead, emit + # `task.complete result=SucceededWithIssues` so the step shows + # orange in the AzDO UI (not green, not red) while still allowing + # downstream steps to run. Per-scenario failures remain visible + # via the ##vso[task.logissue type=warning] lines emitted in the + # loop above; per-row failures are also queryable in Kusto via + # the status column. + # + # Genuinely catastrophic failures (validation errors above this + # block) still exit 1 — those happen BEFORE any data is gathered + # so skipping downstream is the right call. + if [ "$overall_rc" -ne 0 ]; then + echo "##vso[task.complete result=SucceededWithIssues;]" fi - done + exit 0 + fi - if [ "$failures" -gt 0 ]; then - echo "##vso[task.logissue type=error;] CL2 failed on $failures cluster(s)" - exit 1 + # Single-scenario path (default, unchanged from Phase 4a — prod pipeline + # relies on this). + # + # Bounded-parallel CL2 fan-out across clusters. Each worker invokes + # run-cl2-on-cluster.sh — same per-cluster body the bash for-loop used + # to run sequentially (CL2 invoke + junit gate + log capture + failure + # diag), now with bounded concurrency. CL2_MAX_CONCURRENT defaults to 4 + # at the matrix level (event-throughput.yaml); smaller tiers can lower + # it to 1 to recover sequential behavior if needed. + # + # Same per-scenario override as the share-infra loop above: isolation + # and node-churn-* need mesh-wide concurrent observation. + SINGLE_SCENARIO_BASENAME="${CL2_CONFIG_FILE%.yaml}" + if needs_mesh_wide_concurrency "$SINGLE_SCENARIO_BASENAME"; then + EFFECTIVE_MAX_CONCURRENT="${cluster_count}" + echo "Single-scenario ${SINGLE_SCENARIO_BASENAME}: overriding max_concurrent ${CL2_MAX_CONCURRENT:-4} → ${EFFECTIVE_MAX_CONCURRENT} (mesh-wide observation required)" + else + EFFECTIVE_MAX_CONCURRENT="${CL2_MAX_CONCURRENT:-4}" + fi + # Launch host-side stimulus for node-churn-* in single-scenario mode. 
+ NODE_CHURNER_PID="" + if is_node_churn_scenario "$SINGLE_SCENARIO_BASENAME"; then + launch_node_churner "$SINGLE_SCENARIO_BASENAME" "${CL2_REPORT_DIR}" + fi + single_scenario_rc=0 + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute-parallel \ + --clusters "$HOME/.kube/clustermesh-clusters.json" \ + --max-concurrent "${EFFECTIVE_MAX_CONCURRENT}" \ + --worker-script "$WORKER_SCRIPT" \ + --cl2-image "${CL2_IMAGE}" \ + --cl2-config-dir "${CL2_CONFIG_DIR}" \ + --cl2-config-file "${CL2_CONFIG_FILE}" \ + --cl2-report-dir-base "${CL2_REPORT_DIR}" \ + --provider "${CLOUD}" \ + --python-script-file "$PYTHON_SCRIPT_FILE" \ + --python-workdir "$(pwd)" || single_scenario_rc=$? + wait_node_churner "$SINGLE_SCENARIO_BASENAME" + # Proactive failure debug dump for single-scenario mode too. Run + # unconditionally for node-churn AND upper-bound (rich state worth + # dumping regardless of success); rc!=0 for everything else. + if is_node_churn_scenario "$SINGLE_SCENARIO_BASENAME" || is_upper_bound_scenario "$SINGLE_SCENARIO_BASENAME" || [ "$single_scenario_rc" -ne 0 ]; then + scenario_failure_diag "$SINGLE_SCENARIO_BASENAME" "$single_scenario_rc" fi + # In single-scenario prod mode we DON'T have a share-infra loop to + # break out of, but we still want the AzDO step to surface non-zero + # rc on CL2 failure (prod's existing contract). The churner-finalizer + # cleanup_failed state is logged via the timing JSON (Kusto-visible); + # we don't promote it to step failure here because terraform destroy + # will tear down the cluster regardless. + exit $single_scenario_rc workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: @@ -198,9 +739,11 @@ steps: CLOUD: ${{ parameters.cloud }} REGION: ${{ parameters.region }} PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/scale.py + WORKER_SCRIPT: $(Pipeline.Workspace)/s/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh CL2_IMAGE: ${{ parameters.engine_input.image }} CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/config CL2_CONFIG_FILE: $(cl2_config_file) CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/results CL2_OPERATION_TIMEOUT: ${{ parameters.engine_input.operation_timeout }} + NODE_CHURNER_SCRIPT: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/config/node-churner.sh displayName: "Run CL2 across all clustermesh clusters" diff --git a/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh b/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh new file mode 100755 index 0000000000..c20a66f0f6 --- /dev/null +++ b/steps/engine/clusterloader2/clustermesh-scale/run-cl2-on-cluster.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +# Per-cluster CL2 worker for the clustermesh-scale scenario. +# +# Extracted from steps/engine/clusterloader2/clustermesh-scale/execute.yml +# so that scale.py execute-parallel can fan out N copies of this script with +# bounded concurrency. The body MUST stay equivalent to the original +# per-iteration bash for-loop body (CL2 invoke + junit check + log capture + +# failure diag) — see PR #1157 phase 3 for the parallelization rationale. +# +# Per-cluster log capture + failure diag happen IMMEDIATELY when this +# cluster's CL2 finishes — before peer clusters complete — so that +# `kubectl --tail` log windows and `kubectl get events` recency don't age out +# while peers are still running. 
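+#
+# Example invocation (arguments shown symbolically — execute.yml's
+# execute-parallel supplies the real values via scale.py):
+#   ./run-cl2-on-cluster.sh mesh-1 "$HOME/.kube/mesh-1.config" \
+#     "$CL2_REPORT_DIR/mesh-1" "$CL2_IMAGE" "$CL2_CONFIG_DIR" \
+#     event-throughput.yaml "$CLOUD" "$PYTHON_SCRIPT_FILE" "$(pwd)" 0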
+# +# Exit code: +# 0 — CL2 ran AND junit.xml reports failures=0 errors=0 +# 1 — anything else (CL2 didn't write junit, or junit has failures/errors) +# This is the authoritative per-cluster pass/fail signal that +# scale.py execute-parallel aggregates into the step's exit code. +# +# Usage: +# run-cl2-on-cluster.sh \ +# \ +# \ +# \ +# [tear_down_prometheus_flag] +# +# tear_down_prometheus_flag: "1" → pass --tear-down-prometheus to scale.py +# execute. Used by share-infra mode so each scenario's CL2 deploys a fresh +# Prom. "0" or unset → preserve Prom for failure-diagnostic dump (default +# single-scenario behavior). + +set -uo pipefail + +if [ "$#" -lt 9 ] || [ "$#" -gt 10 ]; then + echo "Usage: $0 [tear_down_prometheus_flag]" >&2 + exit 2 +fi + +role="$1" +kubeconfig="$2" +report_dir="$3" +cl2_image="$4" +cl2_config_dir="$5" +cl2_config_file="$6" +provider="$7" +python_script_file="$8" +python_workdir="$9" +tear_down_prometheus_flag="${10:-0}" + +mkdir -p "$report_dir" + +echo "====================================================================" +echo " Running CL2 on $role" +echo "====================================================================" + +cl2_passed=0 +# Run CL2; collect outcome WITHOUT failing on a non-zero exit (so we can +# also inspect junit.xml for internal test failures even when CL2 exits +# 0). Treat as "passed" only if BOTH: +# (a) junit.xml exists (CL2 actually completed and wrote a report) +# (b) junit.xml has zero / elements +# Without (b) we'd silently green-light runs where measurements failed +# — e.g. PodMonitor template substitution producing "", which +# k8s admission rejects but CL2 still writes junit with tags. +exec_extra_args=() +if [ "$tear_down_prometheus_flag" = "1" ]; then + exec_extra_args+=(--tear-down-prometheus) +fi +( + cd "$python_workdir" || exit 1 + PYTHONPATH="${PYTHONPATH:-}:$python_workdir" python3 -u "$python_script_file" execute \ + --cl2-image "$cl2_image" \ + --cl2-config-dir "$cl2_config_dir" \ + --cl2-report-dir "$report_dir" \ + --cl2-config-file "$cl2_config_file" \ + --kubeconfig "$kubeconfig" \ + --provider "$provider" \ + "${exec_extra_args[@]}" +) || true + +if [ -f "$report_dir/junit.xml" ]; then + # Count failure/error attrs from . + junit_failures=$(grep -oE 'failures="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) + junit_errors=$(grep -oE 'errors="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0) + junit_failures=${junit_failures:-0} + junit_errors=${junit_errors:-0} + if [ "$junit_failures" -eq 0 ] && [ "$junit_errors" -eq 0 ]; then + cl2_passed=1 + else + echo "##vso[task.logissue type=warning;] $role: junit.xml reports failures=$junit_failures errors=$junit_errors" + fi +fi + +if [ "$cl2_passed" -eq 1 ]; then + echo " $role: CL2 run succeeded" +fi + +# Always-on log capture (spec line 35: "Logs: clustermesh-apiserver, +# agent watchers"). Files land in $report_dir/logs/ so they are +# uploaded alongside junit.xml + measurement results when the +# publish step runs. Capturing PER CLUSTER as soon as that cluster's CL2 +# finishes is important under parallel fan-out: if we waited until all +# peers completed, --tail windows and recent-events queries would age out +# diagnostic data on the cluster that finished first. +log_dir="$report_dir/logs" +mkdir -p "$log_dir" +echo "------- $role: capturing pod logs to $log_dir -------" +# clustermesh-apiserver: all three containers (apiserver / etcd / +# kvstoremesh) — bounded tail, single pod expected. 
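+# Resulting layout under $report_dir/logs/ (filenames follow directly from
+# the commands below):
+#   clustermesh-apiserver-apiserver.log
+#   clustermesh-apiserver-etcd.log
+#   clustermesh-apiserver-kvstoremesh.log
+#   cilium-agent.log
+#   cilium-operator.log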
+for c in apiserver etcd kvstoremesh; do + KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ + -l k8s-app=clustermesh-apiserver -c "$c" --tail=4000 \ + > "$log_dir/clustermesh-apiserver-$c.log" 2>&1 || true +done +# cilium-agent: one pod per node — keep tail small to bound size. +KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ + -l k8s-app=cilium --tail=1000 --prefix=true \ + > "$log_dir/cilium-agent.log" 2>&1 || true +# cilium-operator: low-volume control plane. +KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \ + -l io.cilium/app=operator --tail=2000 --prefix=true \ + > "$log_dir/cilium-operator.log" 2>&1 || true + +if [ "$cl2_passed" -ne 1 ]; then + # Dump enough state to distinguish prometheus-stack scheduling + # failures from CL2 logic failures. Prometheus is the most common + # culprit here — its pod requests 10Gi by default, doesn't fit on + # Standard_D4s_v4. If the pod is Pending with FailedScheduling, the + # describe events make that obvious. + # + # Note: scale.py passes tear_down_prometheus=False so the stack + # survives this dump (otherwise CL2 would clean up before we look). + echo "------- $role: CL2 FAILURE DIAG -------" + echo "------- node allocatable / requested capacity -------" + KUBECONFIG="$kubeconfig" kubectl get nodes -o wide 2>&1 || true + KUBECONFIG="$kubeconfig" kubectl describe nodes 2>&1 | grep -A 4 "Allocatable\|Allocated resources" | head -40 || true + + echo "------- monitoring/* pods -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get pods -o wide 2>&1 || true + + echo "------- monitoring statefulsets -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get statefulset -o wide 2>&1 || true + + echo "------- Prometheus CR (operator input) -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get prometheus -o yaml 2>&1 | head -80 || true + + echo "------- prometheus-k8s pod describe -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring describe pod -l app.kubernetes.io/name=prometheus 2>&1 | tail -60 || true + + echo "------- prometheus-operator logs (tail 60) -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring logs -l app.kubernetes.io/name=prometheus-operator --tail=60 2>&1 || true + + echo "------- monitoring namespace events (recent) -------" + KUBECONFIG="$kubeconfig" kubectl -n monitoring get events --sort-by='.lastTimestamp' 2>&1 | tail -30 || true + echo "------- end CL2 FAILURE DIAG -------" + + echo "##vso[task.logissue type=warning;] $role: CL2 run failed (junit missing or has failures/errors at $report_dir/junit.xml)" + exit 1 +fi + +exit 0 diff --git a/steps/topology/clustermesh-scale/validate-resources.yml b/steps/topology/clustermesh-scale/validate-resources.yml index bfd47a11c6..6f51411cb9 100644 --- a/steps/topology/clustermesh-scale/validate-resources.yml +++ b/steps/topology/clustermesh-scale/validate-resources.yml @@ -44,6 +44,90 @@ steps: echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$count" displayName: "Enumerate clustermesh clusters" + # ---------------------------------------------------------------------------- + # Pre-gate: wait for every cluster's clustermesh-apiserver Deployment to be + # Available AND its Service to have an external LoadBalancer IP, in parallel. + # + # Why this step exists + # -------------------- + # Fleet's ClusterMeshProfile reconciler only pushes a peer's kubeconfig into + # other clusters' apiserver configs once that peer's LB has an external IP. 
+ # If we start the per-cluster peering loop below before every cluster's LB + # is up, the X/Y readout in `cilium-dbg status` stalls at "Y < N-1" — Fleet + # has only pushed the kubeconfigs for the subset of peers that ARE LB-ready, + # and bumping the retry budget in the loop doesn't help because the missing + # peer kubeconfigs will never arrive while their LBs are still pending. + # + # Empirically at N=20, ~25% of clustermesh-apiserver LBs are still pending + # IP assignment when terraform apply returns success, because Azure LB + # provisioning happens asynchronously after Service creation. Per-cluster + # budget is 30 min — longer than any LB tail we've observed. + # ---------------------------------------------------------------------------- + - script: | + set -euo pipefail + set -x + + clusters=$(cat "$HOME/.kube/clustermesh-clusters.json") + cluster_count=$(echo "$clusters" | jq 'length') + + # Sequential kubeconfig fetch — parallel `az aks get-credentials` + # writes race on the shared ~/.azure MSAL token cache (same reason + # execute.yml pre-fetches kubeconfigs sequentially). + for row in $(echo "$clusters" | jq -c '.[]'); do + name=$(echo "$row" | jq -r '.name') + rg=$(echo "$row" | jq -r '.rg') + role=$(echo "$row" | jq -r '.role') + kc="$HOME/.kube/$role.config" + KUBECONFIG="$kc" az aks get-credentials \ + --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors + done + + # Parallel poll for clustermesh-apiserver readiness on every cluster. + # Each subshell gets a 30-min budget; we collect failures rather than + # fail-fast on the first one so the operator sees the full set of + # slow LBs in one shot instead of one cluster at a time. + pids=() + roles=() + for row in $(echo "$clusters" | jq -c '.[]'); do + role=$(echo "$row" | jq -r '.role') + ( + kc="$HOME/.kube/$role.config" + deadline=$(( $(date +%s) + 1800 )) + last_state="" + while [ "$(date +%s)" -lt "$deadline" ]; do + avail=$(KUBECONFIG="$kc" kubectl -n kube-system get deployment clustermesh-apiserver \ + -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null || true) + ip=$(KUBECONFIG="$kc" kubectl -n kube-system get svc clustermesh-apiserver \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [ "$avail" = "True" ] && [ -n "$ip" ]; then + echo "[$role] OK (deployment=Available, LB IP=$ip)" + exit 0 + fi + last_state="deployment=${avail:-}, LB=${ip:-}" + sleep 15 + done + echo "[$role] FAIL: clustermesh-apiserver not ready within 30 min ($last_state)" >&2 + exit 1 + ) & + pids+=("$!") + roles+=("$role") + done + + failed=0 + for i in "${!pids[@]}"; do + if ! wait "${pids[$i]}"; then + echo "##vso[task.logissue type=error;] ${roles[$i]}: clustermesh-apiserver not ready within 30 min" + failed=$((failed + 1)) + fi + done + + if [ "$failed" -gt 0 ]; then + echo "##vso[task.logissue type=error;] $failed of $cluster_count clustermesh-apiserver(s) not ready; peering will not converge" + exit 1 + fi + echo "All $cluster_count clustermesh-apiserver Deployments+LBs ready; Fleet can now push peer configs" + displayName: "Wait for clustermesh-apiserver Deployments + LBs (parallel)" + - script: | set -euo pipefail set -x @@ -71,7 +155,39 @@ steps: echo "--- nodes ---" kubectl get nodes -o wide - kubectl wait --for=condition=Ready nodes --all --timeout=5m + # Wait until ALL nodes reach Ready. 
This was originally a single + # `kubectl wait --timeout=5m` call, but a 5-minute hard timeout is + # brittle when 1-2 of N nodes flap NotReady transiently at + # startup (kubelet image pull, CNI sandbox init). Smoke build + # 67014 hit this — 2 of 21 nodes briefly NotReady, kubectl + # wait timed out, validate step failed, CL2 skipped (~30min + # of provisioned infra wasted). + # + # New behavior: retry-with-resample loop, 15min budget, 30s + # rechecks. Exits as soon as all nodes are Ready; gives a + # final diag dump on timeout (which clusters/nodes are still + # NotReady). + node_ready_deadline=$(( $(date +%s) + 900 )) + while true; do + if kubectl wait --for=condition=Ready nodes --all --timeout=30s >/dev/null 2>&1; then + echo "All nodes Ready" + break + fi + if [ "$(date +%s)" -ge "$node_ready_deadline" ]; then + echo "##vso[task.logissue type=error;] $role: node readiness timeout after 15 min" + echo "--- final node state ---" + kubectl get nodes -o wide || true + echo "--- NotReady nodes describe ---" + for n in $(kubectl get nodes --no-headers 2>/dev/null | awk '$2 != "Ready" {print $1}'); do + echo "--- $n ---" + kubectl describe node "$n" 2>&1 | head -50 || true + done + exit 1 + fi + not_ready=$(kubectl get nodes --no-headers 2>/dev/null | awk '$2 != "Ready"' | wc -l) + echo "$(date -u +%H:%M:%S): ${not_ready} node(s) NotReady, waiting (deadline at $(date -u -d @${node_ready_deadline} +%H:%M:%S))" + sleep 30 + done echo "--- cilium agent pods ---" kubectl -n kube-system get pods -l k8s-app=cilium -o wide @@ -112,8 +228,14 @@ steps: # "configured/connected" first because it counts apiserver clients, # while the in-pod view requires the Secret to be reloaded. We gate on # the in-pod view because the data path needs the agent's local state. + # Mesh convergence retry budget. At N=20 we observed mesh-2 and + # mesh-6 take ~24 min to reach 19/19 connected (initial-sync + Fleet + # member-secret reload at scale). Budget of 120 * 15s = 30 min + # accommodates that slowest-cluster tail. Smaller N (2/5/10) finish + # in <5 min and exit the loop early via the break, so no cost on + # green runs at small N. connected=0 - for i in $(seq 1 60); do + for i in $(seq 1 120); do out=$(kubectl -n kube-system exec ds/cilium -- cilium-dbg status 2>&1 || true) echo "$out" # Parse the "X/Y remote clusters ready" count out of cilium-dbg status. ready=$(echo "$out" | grep -oE '[0-9]+/[0-9]+ remote clusters ready' | cut -d/ -f1) if [ -n "$ready" ] && [ "$ready" -eq "$expected_remote" ]; then connected=1 break fi - # ============== DEBUG-DUMP-BEGIN (REMOVE BEFORE MERGE) ============== - # Every 6 iterations dump richer state: in-pod cilium-cli view of the - # mesh, clustermesh-apiserver pod state, and Fleet-side member status. - # These help diagnose why convergence is stalling. Strip before final - # PR review. - if [ "$((i % 6))" -eq 0 ]; then - echo "------- [debug] retry $i: cilium clustermesh status (runner cli) -------" - cilium clustermesh status --context "$(kubectl config current-context)" --wait=false 2>&1 || true - - echo "------- [debug] retry $i: clustermesh-apiserver pods -------" - kubectl -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide 2>&1 || true - kubectl -n kube-system describe pods -l k8s-app=clustermesh-apiserver 2>&1 | tail -40 || true - - echo "------- [debug] retry $i: clustermesh-apiserver service -------" - # Service of type LoadBalancer for the clustermesh-apiserver. If - # EXTERNAL-IP stays "<pending>", the AKS control-plane identity is - # missing Network Contributor on the VNet (cloud-controller-manager - # cannot provision the internal LB). Look in describe events for - # AuthorizationFailed / forbidden messages.
- kubectl -n kube-system get svc clustermesh-apiserver -o wide 2>&1 || true - kubectl -n kube-system describe svc clustermesh-apiserver 2>&1 | tail -25 || true - - echo "------- [debug] retry $i: cilium agent restarts / readiness -------" - kubectl -n kube-system get pods -l k8s-app=cilium -o wide 2>&1 || true - - echo "------- [debug] retry $i: Fleet ClusterMeshProfile profile-level status -------" - # Profile-level mesh state (NotConnected/Connecting/Connected/Failed) - # plus the last operation error if any. This is the authoritative - # control-plane view of whether the mesh has converged. - az fleet clustermeshprofile show \ - --resource-group "$rg" \ - --fleet-name clustermesh-flt \ - --name clustermesh-cmp \ - --query "{state:properties.status.state, provisioningState:properties.provisioningState, lastError:properties.status.lastOperationError}" \ - -o jsonc 2>&1 || true - - echo "------- [debug] retry $i: Fleet ClusterMeshProfile members (connection state) -------" - # Per-member: provisioningState is just ARM-level (join accepted); - # meshProperties.status.state is the actual Cilium connection state. - az fleet clustermeshprofile list-members \ - --resource-group "$rg" \ - --fleet-name clustermesh-flt \ - --name clustermesh-cmp \ - --query "[].{name:name, provisioning:properties.provisioningState, mesh:properties.meshProperties.status.state, lastUpdated:properties.meshProperties.status.lastUpdatedAt, error:properties.meshProperties.status.error.message}" \ - -o table 2>&1 || true - fi - # =============== DEBUG-DUMP-END (REMOVE BEFORE MERGE) =============== - - echo " waiting for $expected_remote remote clusters to be ready (got $ready), retry $i/60..." - sleep 10 + echo " waiting for $expected_remote remote clusters to be ready (got $ready), retry $i/120..." + sleep 15 done if [ "$connected" -ne 1 ]; then @@ -337,65 +411,6 @@ steps: done if [ "$ok" -ne 1 ]; then - # ============== SMOKE-FAILURE-DEBUG-DUMP (REMOVE BEFORE MERGE) ============== - # On failure, dump enough state to distinguish Cilium global-service - # sync issues from cross-VNet pod-IP routing issues. Specifically: - # 1. cilium clustermesh status — should show "Global services: 1" if sync OK - # 2. cilium service list (in-pod) — should have an entry for cm-smoke/echo - # with remote-cluster backends in cluster 2 - # 3. kubectl describe svc / get endpoints echo — k8s view (cluster 2 should - # have NO local endpoints, that's expected) - # 4. From inside the curl pod: DNS resolve, then direct-IP curl to a - # cluster-1 echo pod IP — bypasses ClusterIP, tests raw L3 across VNets - echo - echo "================ SMOKE FAILURE DIAG (cluster $first_role -- backend) ================" - KUBECONFIG="$kc_first" cilium clustermesh status --context "$(KUBECONFIG="$kc_first" kubectl config current-context)" --wait=false 2>&1 || true - KUBECONFIG="$kc_first" kubectl -n "$ns" describe svc echo 2>&1 || true - KUBECONFIG="$kc_first" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true - KUBECONFIG="$kc_first" kubectl -n "$ns" get pods -l app=echo -o wide 2>&1 || true - echo "------- $first_role: cilium-config (clustermesh-relevant flags) -------" - # Authoritative source for whether the cilium agent is configured to - # process global services. Look for: enable-cluster-mesh, - # cluster-mesh-shared-services, clustermesh-config, identity-allocation-mode, - # enable-services. AKS/ACNS may gate global services with a feature flag. 
- KUBECONFIG="$kc_first" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \ - | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true - echo "------- $first_role: cilium service list (full, head 40) -------" - KUBECONFIG="$kc_first" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true - echo "------- $first_role: cilium-operator logs (tail 60) -------" - KUBECONFIG="$kc_first" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \ - | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true - - echo - echo "================ SMOKE FAILURE DIAG (cluster $second_role -- client) ================" - KUBECONFIG="$kc_second" cilium clustermesh status --context "$(KUBECONFIG="$kc_second" kubectl config current-context)" --wait=false 2>&1 || true - KUBECONFIG="$kc_second" kubectl -n "$ns" describe svc echo 2>&1 || true - KUBECONFIG="$kc_second" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true - echo "------- $second_role: cilium-config (clustermesh-relevant flags) -------" - KUBECONFIG="$kc_second" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \ - | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true - echo "------- $second_role: cilium service list (full, head 40) -------" - KUBECONFIG="$kc_second" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true - echo "------- $second_role: cilium-operator logs (tail 60) -------" - KUBECONFIG="$kc_second" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \ - | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true - - echo - echo "------- DNS + direct-pod-IP probe from curl pod (bypass ClusterIP) -------" - # ClusterIP plumbing is a Cilium-clustermesh concern; direct pod-IP - # connectivity is a VNet-peering concern. Hitting a backend pod IP - # directly disambiguates the two failure modes. - KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- nslookup echo.cm-smoke.svc.cluster.local 2>&1 || true - backend_ip=$(KUBECONFIG="$kc_first" kubectl -n "$ns" get pod -l app=echo -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true) - echo "first cluster's echo pod IP: ${backend_ip:-}" - if [ -n "${backend_ip:-}" ]; then - KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- \ - curl -fsS -m 5 "http://${backend_ip}:8080/hostname" 2>&1 || \ - echo " direct pod-IP curl ALSO failed → cross-VNet routing issue (peering / pod-CIDR routes)" - fi - echo "============================ END SMOKE DIAG ============================" - # =========================== END SMOKE-FAILURE-DEBUG-DUMP =========================== - echo "##vso[task.logissue type=error;] Cross-cluster data-path smoke failed: $second_role could not reach service in $first_role" exit 1 fi