From 1bdc29f2d7c8f4c63d822eaf9eec77701c25b1cf Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Thu, 4 Jun 2026 17:26:06 -0700
Subject: [PATCH] fix(validators): run performance phase last; fix stale
 conformance manifest hint paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reorder PhaseOrder from deployment→performance→conformance to
deployment→conformance→performance. The performance phase's inference-perf
benchmark saturates every GPU on the node and tears its DynamoGraphDeployment
(and DRA ResourceClaims) down asynchronously; running it before conformance
starved conformance's GPU-needing checks -- notably dra-support, whose 1-GPU
test pod failed 'Unschedulable: cannot allocate all claims' on single-node
clusters during 'aicr validate --phase all'. Running performance last frees
the GPUs for conformance and keeps a flaky perf phase from blocking
conformance under --fail-fast. DRA itself was verified healthy (standalone
GPU claim allocated); the failure was purely GPU contention.

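Also fix stale 'Equivalent: kubectl apply -f' hint paths in the conformance
check artifacts: docs/conformance/cncf/manifests/ does not exist; the test
manifests live under pkg/evidence/cncf/scripts/manifests/. Repoint the
dra-support and pod-autoscaling hints there. The cluster-autoscaling hint
referenced a non-existent hpa-gpu-scale-test.yaml and has no static-manifest
equivalent, so replace it with a note that its resources (HPA + GPU
deployment + Karpenter NodePool) are constructed via the Kubernetes API.
Also fix a second stale path, pkg/evidence/scripts/manifests/ (missing the
cncf/ segment), in collect-evidence.sh and in the 'Usage:' comments of the
dra-gpu-test, gang-scheduling-test, and hpa-gpu-test manifests. Cosmetic
only -- the checks build their resources in-code via the K8s API, not from
these files.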
---
 docs/contributor/validator.md                         | 10 ++++++++--
 docs/design/002-validatorv2-adr.md                    |  4 ++--
 docs/user/validation.md                               |  8 ++++----
 pkg/client/v1/aicr.go                                 |  4 ++--
 pkg/client/v1/aicr_test.go                            |  2 +-
 pkg/evidence/attestation/types.go                     |  6 +++++-
 pkg/evidence/cncf/scripts/collect-evidence.sh         |  6 +++---
 pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml |  2 +-
 .../cncf/scripts/manifests/gang-scheduling-test.yaml  |  2 +-
 pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml |  2 +-
 pkg/validator/phases.go                               | 11 ++++++++++-
 pkg/validator/v1/job_plan.go                          |  2 +-
 pkg/validator/validator_test.go                       |  5 ++++-
 validators/conformance/cluster_autoscaling_check.go   |  2 +-
 validators/conformance/dra_support_check.go           |  2 +-
 validators/conformance/pod_autoscaling_check.go       |  2 +-
 16 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/docs/contributor/validator.md b/docs/contributor/validator.md
index 83a2501f6..1d83eaf14 100644
--- a/docs/contributor/validator.md
+++ b/docs/contributor/validator.md
@@ -92,13 +92,19 @@ per run. Per-phase containers are built from
 `recipes/validators/catalog.yaml` is the authoritative list.
 
 **Three phases**, evaluated in this fixed order
-(`pkg/validator/phases.go`):
+(`pkg/validator/phases.go`): **deployment → conformance → performance**.
 
 | Phase | Purpose | Example |
 |-------|---------|---------|
 | `deployment` | Components installed and healthy | GPU operator pods running |
-| `performance` | Cluster meets perf thresholds | NCCL bandwidth, AIPerf TTFT p99 |
 | `conformance` | Workload-specific requirements | DRA, gang scheduling, autoscaling |
+| `performance` | Cluster meets perf thresholds | NCCL bandwidth, AIPerf TTFT p99 |
+
+Performance runs **last** on purpose: its inference-perf benchmark saturates
+every GPU on the node and tears the DynamoGraphDeployment (and its DRA
+ResourceClaims) down asynchronously. Running it before conformance starved
+conformance's GPU-needing checks (notably `dra-support`, whose 1-GPU test pod
+failed to schedule with "cannot allocate all claims" on single-node clusters).
 
 `PhaseAll` (the string `"all"`) is the CLI / recipe wildcard;
 `ParsePhaseSelection` collapses it to nil-meaning-everything. It is
diff --git a/docs/design/002-validatorv2-adr.md b/docs/design/002-validatorv2-adr.md
index 7b27f9834..851c540e0 100644
--- a/docs/design/002-validatorv2-adr.md
+++ b/docs/design/002-validatorv2-adr.md
@@ -10,7 +10,7 @@ containers in `validators/`. The v1 engine has been deleted.
 ## Context
 
 AICR validates GPU-accelerated Kubernetes clusters through a multi-phase pipeline
-(readiness, deployment, performance, conformance). The current implementation
+(readiness, deployment, conformance, performance). The current implementation
 (`pkg/validator`) uses Go's `testing.T` framework as a runtime execution engine
 inside Kubernetes Jobs:
 
@@ -178,7 +178,7 @@ ValidateAll(ctx, recipe, snapshot)
 ├── EnsureRBAC()                    # Once (SA + CRB)
 ├── ensureDataConfigMaps()          # Once (snapshot + recipe)
 │
-├── For phase in [deployment, performance, conformance]:
+├── For phase in [deployment, conformance, performance]:
 │   ├── Skip if previous phase failed
 │   ├── For each validator (sequentially):
 │   │   ├── Deploy Job
diff --git a/docs/user/validation.md b/docs/user/validation.md
index ec49b567a..416bfb116 100644
--- a/docs/user/validation.md
+++ b/docs/user/validation.md
@@ -2,8 +2,8 @@
 
 Task-oriented walkthrough for running `aicr validate` against a GPU cluster — from
 capturing a snapshot through interpreting results. Covers both training and
-inference workloads and all three validation phases (deployment, performance,
-conformance).
+inference workloads and all three validation phases (deployment, conformance,
+performance).
 
 For per-flag reference, see [CLI reference: aicr validate](cli-reference.md#aicr-validate).
 For the architectural view of how snapshot + recipe flow into the validator, see
@@ -14,8 +14,8 @@ For the architectural view of how snapshot + recipe flow into the validator, see
 | Phase | What it answers | Typical trigger |
 |-------|-----------------|-----------------|
 | `deployment` | Are the components the recipe asks for actually installed and healthy? | After `./deploy.sh` finishes, before running any workload |
-| `performance` | Does the cluster hit expected bandwidth / throughput thresholds? | After components are ready; before going to production |
 | `conformance` | Does the cluster support workload-specific capabilities (DRA, gang scheduling, autoscaling, ...)? | Before opening the cluster to real workloads |
+| `performance` | Does the cluster hit expected bandwidth / throughput thresholds? | After components are ready; before going to production |
 
 Readiness pre-flight constraints (K8s version, OS, kernel) run implicitly before
 any phase. If pre-flight fails, no validator Jobs are deployed.
@@ -265,7 +265,7 @@ Guards fire before any cluster mutation, so skips are cheap (typically < 10 s).
 
 ```bash
 aicr validate --recipe recipe.yaml --snapshot snapshot.yaml
-# equivalent to: --phase deployment --phase performance --phase conformance
+# equivalent to: --phase deployment --phase conformance --phase performance
 ```
 
 Phases run sequentially. By default all phases run and produce results
diff --git a/pkg/client/v1/aicr.go b/pkg/client/v1/aicr.go
index 9d6951d16..7aadc6b79 100644
--- a/pkg/client/v1/aicr.go
+++ b/pkg/client/v1/aicr.go
@@ -30,7 +30,7 @@
 //     each component in a *RecipeResult.
 //   - CollectSnapshot — deploy the snapshotter Job and retrieve a *Snapshot.
 //   - ValidateState — evaluate a resolved recipe against a snapshot,
-//     running deployment / performance / conformance phases.
+//     running deployment / conformance / performance phases.
 //
 // All facade types (Snapshot, AgentConfig, Criteria, RecipeRequest,
 // RecipeResult, ComponentBundle, ComponentRef, PhaseResult, AllowLists)
@@ -1100,7 +1100,7 @@ func (c *Client) CollectSnapshot(ctx context.Context, cfg *AgentConfig) (*Snapsh
 
 // ValidateState evaluates a resolved recipe against an observed cluster
 // snapshot, runs the selected validation phases (by default
-// PhaseDeployment, PhasePerformance, PhaseConformance) in order, and
+// PhaseDeployment, PhaseConformance, PhasePerformance) in order, and
 // returns one PhaseResult per phase run. Pass WithValidationPhases to
 // restrict the run to a subset.
 //
diff --git a/pkg/client/v1/aicr_test.go b/pkg/client/v1/aicr_test.go
index 3168cc202..0e6766553 100644
--- a/pkg/client/v1/aicr_test.go
+++ b/pkg/client/v1/aicr_test.go
@@ -1297,7 +1297,7 @@ func TestValidateState_PhaseSelection(t *testing.T) {
 		{
 			name:   "unset runs all phases",
 			phases: nil,
-			want:   []aicr.Phase{aicr.PhaseDeployment, aicr.PhasePerformance, aicr.PhaseConformance},
+			want:   []aicr.Phase{aicr.PhaseDeployment, aicr.PhaseConformance, aicr.PhasePerformance},
 		},
 	}
 
diff --git a/pkg/evidence/attestation/types.go b/pkg/evidence/attestation/types.go
index 4a70f5401..00a65a5a9 100644
--- a/pkg/evidence/attestation/types.go
+++ b/pkg/evidence/attestation/types.go
@@ -90,7 +90,11 @@ const (
 	PhaseConformance Phase = "conformance"
 )
 
-// AllPhases is the canonical iteration order for deterministic output.
+// AllPhases is the canonical iteration order for deterministic attestation
+// output. It is intentionally fixed and independent of the validator's
+// execution order (pkg/validator.PhaseOrder, which runs performance last) —
+// freezing it keeps attestation predicate bytes reproducible across releases
+// regardless of execution-order changes.
 var AllPhases = []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
 
 // Predicate is the body of the signed in-toto Statement. It serializes
diff --git a/pkg/evidence/cncf/scripts/collect-evidence.sh b/pkg/evidence/cncf/scripts/collect-evidence.sh
index 200ab03cf..8d59c8513 100755
--- a/pkg/evidence/cncf/scripts/collect-evidence.sh
+++ b/pkg/evidence/cncf/scripts/collect-evidence.sh
@@ -242,7 +242,7 @@ EOF
 
 Deploy a test pod that requests 1 GPU via ResourceClaim and verifies device access.
 
-**Test manifest:** `pkg/evidence/scripts/manifests/dra-gpu-test.yaml`
+**Test manifest:** `pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml`
 EOF
     echo '```yaml' >> "${EVIDENCE_FILE}"
     cat "${SCRIPT_DIR}/manifests/dra-gpu-test.yaml" >> "${EVIDENCE_FILE}"
@@ -311,7 +311,7 @@ EOF
 Deploy a PodGroup with minMember=2 and two GPU pods. KAI scheduler ensures both
 pods are scheduled atomically.
 
-**Test manifest:** `pkg/evidence/scripts/manifests/gang-scheduling-test.yaml`
+**Test manifest:** `pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml`
 EOF
     echo '```yaml' >> "${EVIDENCE_FILE}"
     cat "${SCRIPT_DIR}/manifests/gang-scheduling-test.yaml" >> "${EVIDENCE_FILE}"
@@ -1824,7 +1824,7 @@ EOF
 Deploy a GPU workload running CUDA N-Body Simulation to generate sustained GPU utilization,
 then create an HPA targeting `gpu_utilization` to demonstrate autoscaling.
 
-**Test manifest:** `pkg/evidence/scripts/manifests/hpa-gpu-test.yaml`
+**Test manifest:** `pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml`
 EOF
     echo '```yaml' >> "${EVIDENCE_FILE}"
     cat "${SCRIPT_DIR}/manifests/hpa-gpu-test.yaml" >> "${EVIDENCE_FILE}"
diff --git a/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml b/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml
index fc9152d63..bb507f351 100644
--- a/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml
+++ b/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # DRA GPU allocation test
-# Usage: kubectl apply -f pkg/evidence/scripts/manifests/dra-gpu-test.yaml
+# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml
 ---
 apiVersion: v1
 kind: Namespace
diff --git a/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml b/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml
index 75732a8a9..bbdb258fb 100644
--- a/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml
+++ b/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml
@@ -15,7 +15,7 @@
 # Gang scheduling test with PodGroup, DRA ResourceClaims, and KAI scheduler.
 # Demonstrates all-or-nothing scheduling: both pods must be scheduled together.
 # Requires: KAI scheduler with PodGroup CRD, DRA driver (gpu.nvidia.com)
-# Usage: kubectl apply -f pkg/evidence/scripts/manifests/gang-scheduling-test.yaml
+# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml
 ---
 apiVersion: v1
 kind: Namespace
diff --git a/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml b/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml
index afdc04ebe..c06edde64 100644
--- a/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml
+++ b/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml
@@ -14,7 +14,7 @@
 
 # HPA Pod Autoscaling test with custom GPU metrics
 # Demonstrates HPA scaling based on gpu_utilization from prometheus-adapter
-# Usage: kubectl apply -f pkg/evidence/scripts/manifests/hpa-gpu-test.yaml
+# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml
 ---
 apiVersion: v1
 kind: Namespace
diff --git a/pkg/validator/phases.go b/pkg/validator/phases.go
index 35c0cfe8a..554a04c9e 100644
--- a/pkg/validator/phases.go
+++ b/pkg/validator/phases.go
@@ -40,9 +40,18 @@ const (
 // All phases run by default; set Validator.FailFast to stop after the
 // first phase that reports StatusFailed.
 //
+// Order rationale: deployment (cheap install/health checks) → conformance
+// → performance. Performance runs LAST because its inference-perf benchmark
+// saturates every GPU on the node and the DynamoGraphDeployment teardown
+// releases those DRA ResourceClaims asynchronously; running it before
+// conformance starved conformance's GPU-needing checks (e.g. dra-support,
+// whose 1-GPU test pod failed "cannot allocate all claims" on single-node
+// clusters). Running performance last also keeps a flaky perf phase from
+// blocking conformance under FailFast.
+//
 // Note: Readiness phase is NOT included. It remains in pkg/validator
 // and uses inline constraint evaluation (no containers).
-var PhaseOrder = []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
+var PhaseOrder = []Phase{PhaseDeployment, PhaseConformance, PhasePerformance}
 
 // PhaseAll is the wildcard string accepted by both the `aicr validate
 // --phase` CLI flag and the spec.validate.execution.phases config field
diff --git a/pkg/validator/v1/job_plan.go b/pkg/validator/v1/job_plan.go
index 1f58f8366..d25fc5aba 100644
--- a/pkg/validator/v1/job_plan.go
+++ b/pkg/validator/v1/job_plan.go
@@ -169,7 +169,7 @@ func Plan(
 	}
 
 	// Iterate through all phases
-	phases := []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
+	phases := []Phase{PhaseDeployment, PhaseConformance, PhasePerformance}
 	for _, phase := range phases {
 		// Get all entries for this phase
 		allEntries := cat.ForPhase(phase)
diff --git a/pkg/validator/validator_test.go b/pkg/validator/validator_test.go
index 19b7530f8..ee2682dcb 100644
--- a/pkg/validator/validator_test.go
+++ b/pkg/validator/validator_test.go
@@ -263,7 +263,10 @@ func TestCheckReadinessUnparseableConstraintFailsClosed(t *testing.T) {
 }
 
 func TestPhaseOrder(t *testing.T) {
-	expected := []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
+	// performance runs last: its benchmark saturates all node GPUs and releases
+	// DRA claims asynchronously, which would otherwise starve conformance's
+	// GPU-needing checks (e.g. dra-support).
+	expected := []Phase{PhaseDeployment, PhaseConformance, PhasePerformance}
 	if len(PhaseOrder) != len(expected) {
 		t.Fatalf("PhaseOrder length = %d, want %d", len(PhaseOrder), len(expected))
 	}
diff --git a/validators/conformance/cluster_autoscaling_check.go b/validators/conformance/cluster_autoscaling_check.go
index e0dbf962f..1903419f5 100644
--- a/validators/conformance/cluster_autoscaling_check.go
+++ b/validators/conformance/cluster_autoscaling_check.go
@@ -178,7 +178,7 @@ func CheckClusterAutoscaling(ctx *validators.Context) error {
 		lastErr = validateErr
 		if lastErr == nil {
 			recordRawTextArtifact(ctx, "Apply test manifest",
-				"kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-scale-test.yaml",
+				"# cluster-autoscaling test resources (HPA + GPU deployment + Karpenter NodePool) constructed via the Kubernetes API; no static manifest",
 				fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s for nodePool=%s",
 					report.Namespace, report.DeploymentName, report.HPAName, report.NodePoolName))
 			recordRawTextArtifact(ctx, "Cluster Autoscaling Behavioral Test",
diff --git a/validators/conformance/dra_support_check.go b/validators/conformance/dra_support_check.go
index ffbbbcf45..9b1dfa8aa 100644
--- a/validators/conformance/dra_support_check.go
+++ b/validators/conformance/dra_support_check.go
@@ -146,7 +146,7 @@ func validateDRAAllocation(ctx *validators.Context, dynClient dynamic.Interface)
 		return err
 	}
 	recordRawTextArtifact(ctx, "Apply test manifest",
-		"kubectl apply -f docs/conformance/cncf/manifests/dra-gpu-test.yaml",
+		"kubectl apply -f pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml",
 		fmt.Sprintf("Created Namespace=%s ResourceClaim=%s Pod=%s via Kubernetes API",
 			draTestNamespace, run.claimName, run.podName))
 
diff --git a/validators/conformance/pod_autoscaling_check.go b/validators/conformance/pod_autoscaling_check.go
index bf94a0808..d301ada0c 100644
--- a/validators/conformance/pod_autoscaling_check.go
+++ b/validators/conformance/pod_autoscaling_check.go
@@ -185,7 +185,7 @@ func CheckPodAutoscaling(ctx *validators.Context) error {
 		return err
 	}
 	recordRawTextArtifact(ctx, "Apply test manifest",
-		"kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-test.yaml",
+		"kubectl apply -f pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml",
 		fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s via Kubernetes API",
 			hpaReport.Namespace, hpaReport.DeploymentName, hpaReport.HPAName))
 	recordRawTextArtifact(ctx, "HPA Behavioral Test",