NVIDIA · mchmarny · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
@@ -92,13 +92,19 @@ per run. Per-phase containers are built from
 `recipes/validators/catalog.yaml` is the authoritative list.
 
 **Three phases**, evaluated in this fixed order
-(`pkg/validator/phases.go`):
+(`pkg/validator/phases.go`): **deployment → conformance → performance**.
 
 | Phase | Purpose | Example |
 |-------|---------|---------|
 | `deployment` | Components installed and healthy | GPU operator pods running |
-| `performance` | Cluster meets perf thresholds | NCCL bandwidth, AIPerf TTFT p99 |
 | `conformance` | Workload-specific requirements | DRA, gang scheduling, autoscaling |
+| `performance` | Cluster meets perf thresholds | NCCL bandwidth, AIPerf TTFT p99 |
+
+Performance runs **last** on purpose: its inference-perf benchmark saturates
+every GPU on the node and tears the DynamoGraphDeployment (and its DRA
+ResourceClaims) down asynchronously. Running it before conformance starved
+conformance's GPU-needing checks (notably `dra-support`, whose 1-GPU test pod
+failed to schedule with "cannot allocate all claims" on single-node clusters).
 
 `PhaseAll` (the string `"all"`) is the CLI / recipe wildcard;
 `ParsePhaseSelection` collapses it to nil-meaning-everything. It is

@@ -10,7 +10,7 @@ containers in `validators/`. The v1 engine has been deleted.
 ## Context
 
 AICR validates GPU-accelerated Kubernetes clusters through a multi-phase pipeline
-(readiness, deployment, performance, conformance). The current implementation
+(readiness, deployment, conformance, performance). The current implementation
 (`pkg/validator`) uses Go's `testing.T` framework as a runtime execution engine
 inside Kubernetes Jobs:
 
@@ -178,7 +178,7 @@ ValidateAll(ctx, recipe, snapshot)
 ├── EnsureRBAC()                    # Once (SA + CRB)
 ├── ensureDataConfigMaps()          # Once (snapshot + recipe)
 │
-├── For phase in [deployment, performance, conformance]:
+├── For phase in [deployment, conformance, performance]:
 │   ├── Skip if previous phase failed
 │   ├── For each validator (sequentially):
 │   │   ├── Deploy Job

@@ -2,8 +2,8 @@
 
 Task-oriented walkthrough for running `aicr validate` against a GPU cluster — from
 capturing a snapshot through interpreting results. Covers both training and
-inference workloads and all three validation phases (deployment, performance,
-conformance).
+inference workloads and all three validation phases (deployment, conformance,
+performance).
 
 For per-flag reference, see [CLI reference: aicr validate](cli-reference.md#aicr-validate).
 For the architectural view of how snapshot + recipe flow into the validator, see
@@ -14,8 +14,8 @@ For the architectural view of how snapshot + recipe flow into the validator, see
 | Phase | What it answers | Typical trigger |
 |-------|-----------------|-----------------|
 | `deployment` | Are the components the recipe asks for actually installed and healthy? | After `./deploy.sh` finishes, before running any workload |
-| `performance` | Does the cluster hit expected bandwidth / throughput thresholds? | After components are ready; before going to production |
 | `conformance` | Does the cluster support workload-specific capabilities (DRA, gang scheduling, autoscaling, ...)? | Before opening the cluster to real workloads |
+| `performance` | Does the cluster hit expected bandwidth / throughput thresholds? | After components are ready; before going to production |
 
 Readiness pre-flight constraints (K8s version, OS, kernel) run implicitly before
 any phase. If pre-flight fails, no validator Jobs are deployed.
@@ -265,7 +265,7 @@ Guards fire before any cluster mutation, so skips are cheap (typically < 10 s).
 
 ```bash
 aicr validate --recipe recipe.yaml --snapshot snapshot.yaml
-# equivalent to: --phase deployment --phase performance --phase conformance
+# equivalent to: --phase deployment --phase conformance --phase performance
 ```
 
 Phases run sequentially. By default all phases run and produce results

@@ -30,7 +30,7 @@
 //     each component in a *RecipeResult.
 //   - CollectSnapshot — deploy the snapshotter Job and retrieve a *Snapshot.
 //   - ValidateState — evaluate a resolved recipe against a snapshot,
-//     running deployment / performance / conformance phases.
+//     running deployment / conformance / performance phases.
 //
 // All facade types (Snapshot, AgentConfig, Criteria, RecipeRequest,
 // RecipeResult, ComponentBundle, ComponentRef, PhaseResult, AllowLists)
@@ -1100,7 +1100,7 @@ func (c *Client) CollectSnapshot(ctx context.Context, cfg *AgentConfig) (*Snapsh
 
 // ValidateState evaluates a resolved recipe against an observed cluster
 // snapshot, runs the selected validation phases (by default
-// PhaseDeployment, PhasePerformance, PhaseConformance) in order, and
+// PhaseDeployment, PhaseConformance, PhasePerformance) in order, and
 // returns one PhaseResult per phase run. Pass WithValidationPhases to
 // restrict the run to a subset.
 //

@@ -1297,7 +1297,7 @@ func TestValidateState_PhaseSelection(t *testing.T) {
 		{
 			name:   "unset runs all phases",
 			phases: nil,
-			want:   []aicr.Phase{aicr.PhaseDeployment, aicr.PhasePerformance, aicr.PhaseConformance},
+			want:   []aicr.Phase{aicr.PhaseDeployment, aicr.PhaseConformance, aicr.PhasePerformance},
 		},
 	}
 

@@ -90,7 +90,11 @@ const (
 	PhaseConformance Phase = "conformance"
 )
 
-// AllPhases is the canonical iteration order for deterministic output.
+// AllPhases is the canonical iteration order for deterministic attestation
+// output. It is intentionally fixed and independent of the validator's
+// execution order (pkg/validator.PhaseOrder, which runs performance last) —
+// freezing it keeps attestation predicate bytes reproducible across releases
+// regardless of execution-order changes.
 var AllPhases = []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
 
 // Predicate is the body of the signed in-toto Statement. It serializes

@@ -242,7 +242,7 @@ EOF
 
 Deploy a test pod that requests 1 GPU via ResourceClaim and verifies device access.
 
-**Test manifest:** `pkg/evidence/scripts/manifests/dra-gpu-test.yaml`
+**Test manifest:** `pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml`
 EOF
     echo '```yaml' >> "${EVIDENCE_FILE}"
     cat "${SCRIPT_DIR}/manifests/dra-gpu-test.yaml" >> "${EVIDENCE_FILE}"
@@ -311,7 +311,7 @@ EOF
 Deploy a PodGroup with minMember=2 and two GPU pods. KAI scheduler ensures both
 pods are scheduled atomically.
 
-**Test manifest:** `pkg/evidence/scripts/manifests/gang-scheduling-test.yaml`
+**Test manifest:** `pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml`
 EOF
     echo '```yaml' >> "${EVIDENCE_FILE}"
     cat "${SCRIPT_DIR}/manifests/gang-scheduling-test.yaml" >> "${EVIDENCE_FILE}"
@@ -1824,7 +1824,7 @@ EOF
 Deploy a GPU workload running CUDA N-Body Simulation to generate sustained GPU utilization,
 then create an HPA targeting `gpu_utilization` to demonstrate autoscaling.
 
-**Test manifest:** `pkg/evidence/scripts/manifests/hpa-gpu-test.yaml`
+**Test manifest:** `pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml`
 EOF
     echo '```yaml' >> "${EVIDENCE_FILE}"
     cat "${SCRIPT_DIR}/manifests/hpa-gpu-test.yaml" >> "${EVIDENCE_FILE}"

@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # DRA GPU allocation test
-# Usage: kubectl apply -f pkg/evidence/scripts/manifests/dra-gpu-test.yaml
+# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml
 ---
 apiVersion: v1
 kind: Namespace

@@ -15,7 +15,7 @@
 # Gang scheduling test with PodGroup, DRA ResourceClaims, and KAI scheduler.
 # Demonstrates all-or-nothing scheduling: both pods must be scheduled together.
 # Requires: KAI scheduler with PodGroup CRD, DRA driver (gpu.nvidia.com)
-# Usage: kubectl apply -f pkg/evidence/scripts/manifests/gang-scheduling-test.yaml
+# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml
 ---
 apiVersion: v1
 kind: Namespace

@@ -14,7 +14,7 @@
 
 # HPA Pod Autoscaling test with custom GPU metrics
 # Demonstrates HPA scaling based on gpu_utilization from prometheus-adapter
-# Usage: kubectl apply -f pkg/evidence/scripts/manifests/hpa-gpu-test.yaml
+# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml
 ---
 apiVersion: v1
 kind: Namespace

@@ -40,9 +40,18 @@ const (
 // All phases run by default; set Validator.FailFast to stop after the
 // first phase that reports StatusFailed.
 //
+// Order rationale: deployment (cheap install/health checks) → conformance
+// → performance. Performance runs LAST because its inference-perf benchmark
+// saturates every GPU on the node and the DynamoGraphDeployment teardown
+// releases those DRA ResourceClaims asynchronously; running it before
+// conformance starved conformance's GPU-needing checks (e.g. dra-support,
+// whose 1-GPU test pod failed to schedule with "cannot allocate all claims"
+// on single-node clusters). Running performance last also keeps a flaky perf
+// phase from blocking conformance under FailFast.
+//
 // Note: Readiness phase is NOT included. It remains in pkg/validator
 // and uses inline constraint evaluation (no containers).
-var PhaseOrder = []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
+var PhaseOrder = []Phase{PhaseDeployment, PhaseConformance, PhasePerformance}
 
 // PhaseAll is the wildcard string accepted by both the `aicr validate
 // --phase` CLI flag and the spec.validate.execution.phases config field

@@ -169,7 +169,7 @@ func Plan(
 	}
 
 	// Iterate through all phases
-	phases := []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
+	phases := []Phase{PhaseDeployment, PhaseConformance, PhasePerformance}
 	for _, phase := range phases {
 		// Get all entries for this phase
 		allEntries := cat.ForPhase(phase)

@@ -263,7 +263,10 @@ func TestCheckReadinessUnparseableConstraintFailsClosed(t *testing.T) {
 }
 
 func TestPhaseOrder(t *testing.T) {
-	expected := []Phase{PhaseDeployment, PhasePerformance, PhaseConformance}
+	// performance runs last: its benchmark saturates all node GPUs and releases
+	// DRA claims asynchronously, which would otherwise starve conformance's
+	// GPU-needing checks (e.g. dra-support).
+	expected := []Phase{PhaseDeployment, PhaseConformance, PhasePerformance}
 	if len(PhaseOrder) != len(expected) {
 		t.Fatalf("PhaseOrder length = %d, want %d", len(PhaseOrder), len(expected))
 	}

@@ -178,7 +178,7 @@ func CheckClusterAutoscaling(ctx *validators.Context) error {
 		lastErr = validateErr
 		if lastErr == nil {
 			recordRawTextArtifact(ctx, "Apply test manifest",
-				"kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-scale-test.yaml",
+				"# cluster-autoscaling test resources (HPA + GPU deployment + Karpenter NodePool) constructed via the Kubernetes API; no static manifest",
 				fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s for nodePool=%s",
 					report.Namespace, report.DeploymentName, report.HPAName, report.NodePoolName))
 			recordRawTextArtifact(ctx, "Cluster Autoscaling Behavioral Test",

@@ -146,7 +146,7 @@ func validateDRAAllocation(ctx *validators.Context, dynClient dynamic.Interface)
 		return err
 	}
 	recordRawTextArtifact(ctx, "Apply test manifest",
-		"kubectl apply -f docs/conformance/cncf/manifests/dra-gpu-test.yaml",
+		"kubectl apply -f pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml",
 		fmt.Sprintf("Created Namespace=%s ResourceClaim=%s Pod=%s via Kubernetes API",
 			draTestNamespace, run.claimName, run.podName))
 

@@ -185,7 +185,7 @@ func CheckPodAutoscaling(ctx *validators.Context) error {
 		return err
 	}
 	recordRawTextArtifact(ctx, "Apply test manifest",
-		"kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-test.yaml",
+		"kubectl apply -f pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml",
 		fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s via Kubernetes API",
 			hpaReport.Namespace, hpaReport.DeploymentName, hpaReport.HPAName))
 	recordRawTextArtifact(ctx, "HPA Behavioral Test",