From 1bdc29f2d7c8f4c63d822eaf9eec77701c25b1cf Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Thu, 4 Jun 2026 17:26:06 -0700 Subject: [PATCH] fix(validators): run performance phase last; fix stale conformance manifest hint paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorder PhaseOrder from deployment→performance→conformance to deployment→conformance→performance. The performance phase's inference-perf benchmark saturates every GPU on the node and tears its DynamoGraphDeployment (and DRA ResourceClaims) down asynchronously; running it before conformance starved conformance's GPU-needing checks -- notably dra-support, whose 1-GPU test pod failed 'Unschedulable: cannot allocate all claims' on single-node clusters during 'aicr validate --phase all'. Running performance last frees the GPUs for conformance and keeps a flaky perf phase from blocking conformance under --fail-fast. DRA itself was verified healthy (standalone GPU claim allocated); the failure was purely GPU contention. Also fix stale 'Equivalent: kubectl apply -f' hint paths in the conformance check artifacts: docs/conformance/cncf/manifests/ does not exist; the test manifests live under pkg/evidence/cncf/scripts/manifests/. Repoint the dra-support and pod-autoscaling hints to that directory. The cluster-autoscaling hint referenced a non-existent hpa-gpu-scale-test.yaml and has no matching static manifest, so replace it with a note that its resources (HPA + GPU deployment + Karpenter NodePool) are constructed via the Kubernetes API. Also fix the similarly stale pkg/evidence/scripts/manifests/ paths in collect-evidence.sh and in the usage comments of the three test manifests. Cosmetic only -- the checks build their resources in-code via the K8s API, not from these files. 
--- docs/contributor/validator.md | 10 ++++++++-- docs/design/002-validatorv2-adr.md | 4 ++-- docs/user/validation.md | 8 ++++---- pkg/client/v1/aicr.go | 4 ++-- pkg/client/v1/aicr_test.go | 2 +- pkg/evidence/attestation/types.go | 6 +++++- pkg/evidence/cncf/scripts/collect-evidence.sh | 6 +++--- pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml | 2 +- .../cncf/scripts/manifests/gang-scheduling-test.yaml | 2 +- pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml | 2 +- pkg/validator/phases.go | 11 ++++++++++- pkg/validator/v1/job_plan.go | 2 +- pkg/validator/validator_test.go | 5 ++++- validators/conformance/cluster_autoscaling_check.go | 2 +- validators/conformance/dra_support_check.go | 2 +- validators/conformance/pod_autoscaling_check.go | 2 +- 16 files changed, 46 insertions(+), 24 deletions(-) diff --git a/docs/contributor/validator.md b/docs/contributor/validator.md index 83a2501f6..1d83eaf14 100644 --- a/docs/contributor/validator.md +++ b/docs/contributor/validator.md @@ -92,13 +92,19 @@ per run. Per-phase containers are built from `recipes/validators/catalog.yaml` is the authoritative list. **Three phases**, evaluated in this fixed order -(`pkg/validator/phases.go`): +(`pkg/validator/phases.go`): **deployment → conformance → performance**. | Phase | Purpose | Example | |-------|---------|---------| | `deployment` | Components installed and healthy | GPU operator pods running | -| `performance` | Cluster meets perf thresholds | NCCL bandwidth, AIPerf TTFT p99 | | `conformance` | Workload-specific requirements | DRA, gang scheduling, autoscaling | +| `performance` | Cluster meets perf thresholds | NCCL bandwidth, AIPerf TTFT p99 | + +Performance runs **last** on purpose: its inference-perf benchmark saturates +every GPU on the node and tears the DynamoGraphDeployment (and its DRA +ResourceClaims) down asynchronously. 
Running it before conformance starved +conformance's GPU-needing checks (notably `dra-support`, whose 1-GPU test pod +failed to schedule with "cannot allocate all claims" on single-node clusters). `PhaseAll` (the string `"all"`) is the CLI / recipe wildcard; `ParsePhaseSelection` collapses it to nil-meaning-everything. It is diff --git a/docs/design/002-validatorv2-adr.md b/docs/design/002-validatorv2-adr.md index 7b27f9834..851c540e0 100644 --- a/docs/design/002-validatorv2-adr.md +++ b/docs/design/002-validatorv2-adr.md @@ -10,7 +10,7 @@ containers in `validators/`. The v1 engine has been deleted. ## Context AICR validates GPU-accelerated Kubernetes clusters through a multi-phase pipeline -(readiness, deployment, performance, conformance). The current implementation +(readiness, deployment, conformance, performance). The current implementation (`pkg/validator`) uses Go's `testing.T` framework as a runtime execution engine inside Kubernetes Jobs: @@ -178,7 +178,7 @@ ValidateAll(ctx, recipe, snapshot) ├── EnsureRBAC() # Once (SA + CRB) ├── ensureDataConfigMaps() # Once (snapshot + recipe) │ -├── For phase in [deployment, performance, conformance]: +├── For phase in [deployment, conformance, performance]: │ ├── Skip if previous phase failed │ ├── For each validator (sequentially): │ │ ├── Deploy Job diff --git a/docs/user/validation.md b/docs/user/validation.md index ec49b567a..416bfb116 100644 --- a/docs/user/validation.md +++ b/docs/user/validation.md @@ -2,8 +2,8 @@ Task-oriented walkthrough for running `aicr validate` against a GPU cluster — from capturing a snapshot through interpreting results. Covers both training and -inference workloads and all three validation phases (deployment, performance, -conformance). +inference workloads and all three validation phases (deployment, conformance, +performance). For per-flag reference, see [CLI reference: aicr validate](cli-reference.md#aicr-validate). 
For the architectural view of how snapshot + recipe flow into the validator, see @@ -14,8 +14,8 @@ For the architectural view of how snapshot + recipe flow into the validator, see | Phase | What it answers | Typical trigger | |-------|-----------------|-----------------| | `deployment` | Are the components the recipe asks for actually installed and healthy? | After `./deploy.sh` finishes, before running any workload | -| `performance` | Does the cluster hit expected bandwidth / throughput thresholds? | After components are ready; before going to production | | `conformance` | Does the cluster support workload-specific capabilities (DRA, gang scheduling, autoscaling, ...)? | Before opening the cluster to real workloads | +| `performance` | Does the cluster hit expected bandwidth / throughput thresholds? | After components are ready; before going to production | Readiness pre-flight constraints (K8s version, OS, kernel) run implicitly before any phase. If pre-flight fails, no validator Jobs are deployed. @@ -265,7 +265,7 @@ Guards fire before any cluster mutation, so skips are cheap (typically < 10 s). ```bash aicr validate --recipe recipe.yaml --snapshot snapshot.yaml -# equivalent to: --phase deployment --phase performance --phase conformance +# equivalent to: --phase deployment --phase conformance --phase performance ``` Phases run sequentially. By default all phases run and produce results diff --git a/pkg/client/v1/aicr.go b/pkg/client/v1/aicr.go index 9d6951d16..7aadc6b79 100644 --- a/pkg/client/v1/aicr.go +++ b/pkg/client/v1/aicr.go @@ -30,7 +30,7 @@ // each component in a *RecipeResult. // - CollectSnapshot — deploy the snapshotter Job and retrieve a *Snapshot. // - ValidateState — evaluate a resolved recipe against a snapshot, -// running deployment / performance / conformance phases. +// running deployment / conformance / performance phases. 
// // All facade types (Snapshot, AgentConfig, Criteria, RecipeRequest, // RecipeResult, ComponentBundle, ComponentRef, PhaseResult, AllowLists) @@ -1100,7 +1100,7 @@ func (c *Client) CollectSnapshot(ctx context.Context, cfg *AgentConfig) (*Snapsh // ValidateState evaluates a resolved recipe against an observed cluster // snapshot, runs the selected validation phases (by default -// PhaseDeployment, PhasePerformance, PhaseConformance) in order, and +// PhaseDeployment, PhaseConformance, PhasePerformance) in order, and // returns one PhaseResult per phase run. Pass WithValidationPhases to // restrict the run to a subset. // diff --git a/pkg/client/v1/aicr_test.go b/pkg/client/v1/aicr_test.go index 3168cc202..0e6766553 100644 --- a/pkg/client/v1/aicr_test.go +++ b/pkg/client/v1/aicr_test.go @@ -1297,7 +1297,7 @@ func TestValidateState_PhaseSelection(t *testing.T) { { name: "unset runs all phases", phases: nil, - want: []aicr.Phase{aicr.PhaseDeployment, aicr.PhasePerformance, aicr.PhaseConformance}, + want: []aicr.Phase{aicr.PhaseDeployment, aicr.PhaseConformance, aicr.PhasePerformance}, }, } diff --git a/pkg/evidence/attestation/types.go b/pkg/evidence/attestation/types.go index 4a70f5401..00a65a5a9 100644 --- a/pkg/evidence/attestation/types.go +++ b/pkg/evidence/attestation/types.go @@ -90,7 +90,11 @@ const ( PhaseConformance Phase = "conformance" ) -// AllPhases is the canonical iteration order for deterministic output. +// AllPhases is the canonical iteration order for deterministic attestation +// output. It is intentionally fixed and independent of the validator's +// execution order (pkg/validator.PhaseOrder, which runs performance last) — +// freezing it keeps attestation predicate bytes reproducible across releases +// regardless of execution-order changes. var AllPhases = []Phase{PhaseDeployment, PhasePerformance, PhaseConformance} // Predicate is the body of the signed in-toto Statement. 
It serializes diff --git a/pkg/evidence/cncf/scripts/collect-evidence.sh b/pkg/evidence/cncf/scripts/collect-evidence.sh index 200ab03cf..8d59c8513 100755 --- a/pkg/evidence/cncf/scripts/collect-evidence.sh +++ b/pkg/evidence/cncf/scripts/collect-evidence.sh @@ -242,7 +242,7 @@ EOF Deploy a test pod that requests 1 GPU via ResourceClaim and verifies device access. -**Test manifest:** `pkg/evidence/scripts/manifests/dra-gpu-test.yaml` +**Test manifest:** `pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml` EOF echo '```yaml' >> "${EVIDENCE_FILE}" cat "${SCRIPT_DIR}/manifests/dra-gpu-test.yaml" >> "${EVIDENCE_FILE}" @@ -311,7 +311,7 @@ EOF Deploy a PodGroup with minMember=2 and two GPU pods. KAI scheduler ensures both pods are scheduled atomically. -**Test manifest:** `pkg/evidence/scripts/manifests/gang-scheduling-test.yaml` +**Test manifest:** `pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml` EOF echo '```yaml' >> "${EVIDENCE_FILE}" cat "${SCRIPT_DIR}/manifests/gang-scheduling-test.yaml" >> "${EVIDENCE_FILE}" @@ -1824,7 +1824,7 @@ EOF Deploy a GPU workload running CUDA N-Body Simulation to generate sustained GPU utilization, then create an HPA targeting `gpu_utilization` to demonstrate autoscaling. -**Test manifest:** `pkg/evidence/scripts/manifests/hpa-gpu-test.yaml` +**Test manifest:** `pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml` EOF echo '```yaml' >> "${EVIDENCE_FILE}" cat "${SCRIPT_DIR}/manifests/hpa-gpu-test.yaml" >> "${EVIDENCE_FILE}" diff --git a/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml b/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml index fc9152d63..bb507f351 100644 --- a/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml +++ b/pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml @@ -13,7 +13,7 @@ # limitations under the License. 
# DRA GPU allocation test -# Usage: kubectl apply -f pkg/evidence/scripts/manifests/dra-gpu-test.yaml +# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml --- apiVersion: v1 kind: Namespace diff --git a/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml b/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml index 75732a8a9..bbdb258fb 100644 --- a/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml +++ b/pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml @@ -15,7 +15,7 @@ # Gang scheduling test with PodGroup, DRA ResourceClaims, and KAI scheduler. # Demonstrates all-or-nothing scheduling: both pods must be scheduled together. # Requires: KAI scheduler with PodGroup CRD, DRA driver (gpu.nvidia.com) -# Usage: kubectl apply -f pkg/evidence/scripts/manifests/gang-scheduling-test.yaml +# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/gang-scheduling-test.yaml --- apiVersion: v1 kind: Namespace diff --git a/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml b/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml index afdc04ebe..c06edde64 100644 --- a/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml +++ b/pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml @@ -14,7 +14,7 @@ # HPA Pod Autoscaling test with custom GPU metrics # Demonstrates HPA scaling based on gpu_utilization from prometheus-adapter -# Usage: kubectl apply -f pkg/evidence/scripts/manifests/hpa-gpu-test.yaml +# Usage: kubectl apply -f pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml --- apiVersion: v1 kind: Namespace diff --git a/pkg/validator/phases.go b/pkg/validator/phases.go index 35c0cfe8a..554a04c9e 100644 --- a/pkg/validator/phases.go +++ b/pkg/validator/phases.go @@ -40,9 +40,18 @@ const ( // All phases run by default; set Validator.FailFast to stop after the // first phase that reports StatusFailed. // +// Order rationale: deployment (cheap install/health checks) → conformance +// → performance. 
Performance runs LAST because its inference-perf benchmark +// saturates every GPU on the node and the DynamoGraphDeployment teardown +// releases those DRA ResourceClaims asynchronously; running it before +// conformance starved conformance's GPU-needing checks (e.g. dra-support, +// whose 1-GPU test pod failed "cannot allocate all claims" on single-node +// clusters). Running performance last also keeps a flaky perf phase from +// blocking conformance under FailFast. +// // Note: Readiness phase is NOT included. It remains in pkg/validator // and uses inline constraint evaluation (no containers). -var PhaseOrder = []Phase{PhaseDeployment, PhasePerformance, PhaseConformance} +var PhaseOrder = []Phase{PhaseDeployment, PhaseConformance, PhasePerformance} // PhaseAll is the wildcard string accepted by both the `aicr validate // --phase` CLI flag and the spec.validate.execution.phases config field diff --git a/pkg/validator/v1/job_plan.go b/pkg/validator/v1/job_plan.go index 1f58f8366..d25fc5aba 100644 --- a/pkg/validator/v1/job_plan.go +++ b/pkg/validator/v1/job_plan.go @@ -169,7 +169,7 @@ func Plan( } // Iterate through all phases - phases := []Phase{PhaseDeployment, PhasePerformance, PhaseConformance} + phases := []Phase{PhaseDeployment, PhaseConformance, PhasePerformance} for _, phase := range phases { // Get all entries for this phase allEntries := cat.ForPhase(phase) diff --git a/pkg/validator/validator_test.go b/pkg/validator/validator_test.go index 19b7530f8..ee2682dcb 100644 --- a/pkg/validator/validator_test.go +++ b/pkg/validator/validator_test.go @@ -263,7 +263,10 @@ func TestCheckReadinessUnparseableConstraintFailsClosed(t *testing.T) { } func TestPhaseOrder(t *testing.T) { - expected := []Phase{PhaseDeployment, PhasePerformance, PhaseConformance} + // performance runs last: its benchmark saturates all node GPUs and releases + // DRA claims asynchronously, which would otherwise starve conformance's + // GPU-needing checks (e.g. dra-support). 
+ expected := []Phase{PhaseDeployment, PhaseConformance, PhasePerformance} if len(PhaseOrder) != len(expected) { t.Fatalf("PhaseOrder length = %d, want %d", len(PhaseOrder), len(expected)) } diff --git a/validators/conformance/cluster_autoscaling_check.go b/validators/conformance/cluster_autoscaling_check.go index e0dbf962f..1903419f5 100644 --- a/validators/conformance/cluster_autoscaling_check.go +++ b/validators/conformance/cluster_autoscaling_check.go @@ -178,7 +178,7 @@ func CheckClusterAutoscaling(ctx *validators.Context) error { lastErr = validateErr if lastErr == nil { recordRawTextArtifact(ctx, "Apply test manifest", - "kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-scale-test.yaml", + "# cluster-autoscaling test resources (HPA + GPU deployment + Karpenter NodePool) constructed via the Kubernetes API; no static manifest", fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s for nodePool=%s", report.Namespace, report.DeploymentName, report.HPAName, report.NodePoolName)) recordRawTextArtifact(ctx, "Cluster Autoscaling Behavioral Test", diff --git a/validators/conformance/dra_support_check.go b/validators/conformance/dra_support_check.go index ffbbbcf45..9b1dfa8aa 100644 --- a/validators/conformance/dra_support_check.go +++ b/validators/conformance/dra_support_check.go @@ -146,7 +146,7 @@ func validateDRAAllocation(ctx *validators.Context, dynClient dynamic.Interface) return err } recordRawTextArtifact(ctx, "Apply test manifest", - "kubectl apply -f docs/conformance/cncf/manifests/dra-gpu-test.yaml", + "kubectl apply -f pkg/evidence/cncf/scripts/manifests/dra-gpu-test.yaml", fmt.Sprintf("Created Namespace=%s ResourceClaim=%s Pod=%s via Kubernetes API", draTestNamespace, run.claimName, run.podName)) diff --git a/validators/conformance/pod_autoscaling_check.go b/validators/conformance/pod_autoscaling_check.go index bf94a0808..d301ada0c 100644 --- a/validators/conformance/pod_autoscaling_check.go +++ b/validators/conformance/pod_autoscaling_check.go 
@@ -185,7 +185,7 @@ func CheckPodAutoscaling(ctx *validators.Context) error { return err } recordRawTextArtifact(ctx, "Apply test manifest", - "kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-test.yaml", + "kubectl apply -f pkg/evidence/cncf/scripts/manifests/hpa-gpu-test.yaml", fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s via Kubernetes API", hpaReport.Namespace, hpaReport.DeploymentName, hpaReport.HPAName)) recordRawTextArtifact(ctx, "HPA Behavioral Test",