From b394e66fe9916d3574840279ea6ced641b22e5fe Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 8 Dec 2025 04:22:08 +0000 Subject: [PATCH 01/30] Add image pull scenario prototype --- .gitignore | 2 + .../image_pull/analyze_results.py | 103 +++++ .../clusterloader2/image_pull/run_test.py | 125 ++++++ modules/python/clusterloader2/utils.py | 5 +- scenarios/perf-eval/image-pull-test/README.md | 102 +++++ .../image-pull-test/analyze_results.sh | 18 + .../containerd-measurements.yaml | 78 ++++ .../perf-eval/image-pull-test/deployment.yaml | 31 ++ .../perf-eval/image-pull-test/image-pull.yaml | 71 +++ .../perf-eval/image-pull-test/run_cl2.sh | 29 ++ .../image-pull-test/run_locally.ipynb | 411 ++++++++++++++++++ .../terraform-inputs/azure.tfvars | 27 ++ .../terraform-test-inputs/azure.json | 12 + 13 files changed, 1013 insertions(+), 1 deletion(-) create mode 100644 modules/python/clusterloader2/image_pull/analyze_results.py create mode 100644 modules/python/clusterloader2/image_pull/run_test.py create mode 100644 scenarios/perf-eval/image-pull-test/README.md create mode 100755 scenarios/perf-eval/image-pull-test/analyze_results.sh create mode 100644 scenarios/perf-eval/image-pull-test/containerd-measurements.yaml create mode 100644 scenarios/perf-eval/image-pull-test/deployment.yaml create mode 100644 scenarios/perf-eval/image-pull-test/image-pull.yaml create mode 100755 scenarios/perf-eval/image-pull-test/run_cl2.sh create mode 100644 scenarios/perf-eval/image-pull-test/run_locally.ipynb create mode 100644 scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json diff --git a/.gitignore b/.gitignore index 4ad23c88c4..bd3407232e 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,8 @@ env/ venv/ modules/python/clusterloader2/*/results modules/python/clusterloader2/*/config/overrides.yaml +scenarios/**/results +scenarios/**/cl2-config #Jupyter *-checkpoint.ipynb diff --git a/modules/python/clusterloader2/image_pull/analyze_results.py b/modules/python/clusterloader2/image_pull/analyze_results.py new file mode 100644 index 0000000000..3085581a53 --- /dev/null +++ b/modules/python/clusterloader2/image_pull/analyze_results.py @@ -0,0 +1,103 @@ +"""Analyze ClusterLoader2 image-pull test results.""" + +import json +import sys +from pathlib import Path +import xml.etree.ElementTree as ET + +try: + import pandas as pd + HAS_PANDAS = True +except ImportError: + HAS_PANDAS = False + + +def _load_json(results_dir: str, pattern: str) -> dict: + """Load most recent JSON matching pattern.""" + files = sorted(Path(results_dir).glob(pattern), key=lambda x: x.stat().st_mtime, reverse=True) + return json.load(open(files[0])) if files else {} + + +def _print(data: list, cols: list = None): + """Print data as table.""" + if not data: + return + if HAS_PANDAS: + df = pd.DataFrame(data) + print(df.to_string(index=False) if not cols else df[cols].to_string(index=False)) + else: + for row in data: + print(" " + ", ".join(f"{k}={v}" for k, v in row.items())) + + +def analyze_results(results_dir: str) -> dict: + """Analyze test results and return metrics.""" + results = Path(results_dir) + if not results.exists(): + raise FileNotFoundError(f"Not found: {results_dir}") + + print(f"\n{'='*60}") + print(f"Results: {results_dir}") + print('='*60) + + # Pod startup latency + data = _load_json(results_dir, "PodStartupLatency_*.json") + if items := data.get('dataItems'): + print("\nPod Startup Latency:") + _print([{ + 'Metric': 
i['labels']['Metric'],
+            'P50': f"{i['data']['Perc50']:.0f}ms",
+            'P90': f"{i['data']['Perc90']:.0f}ms",
+            'P99': f"{i['data']['Perc99']:.0f}ms"
+        } for i in items])
+
+    # Image pull throughput
+    data = _load_json(results_dir, "*ContainerdCriImagePullingThroughput_*.json")
+    if items := data.get('dataItems'):
+        print("\nImage Pulling Throughput:")
+        for i in items:
+            d = i.get('data', {})
+            if s := d.get('Sum'):
+                print(f"  {s:.2f} {i.get('unit', '')} total ({d.get('Count', 0)} pulls)")
+
+    # Kubelet image pull duration
+    data = _load_json(results_dir, "*KubeletRuntimeOperationDurationWithPullImage_*.json")
+    if items := data.get('dataItems'):
+        print("\nKubelet Image Pull Duration (per node):")
+        nodes = [{
+            'Node': i['labels']['node'][-8:],  # Last 8 chars of node name
+            'P50': f"{i['data']['Perc50']:.1f}s",
+            'P90': f"{i['data']['Perc90']:.1f}s",
+            'P99': f"{i['data']['Perc99']:.1f}s"
+        } for i in items if i.get('labels', {}).get('node') and 'Perc50' in i.get('data', {})]
+        _print(nodes)
+
+    # Test status
+    junit = results / 'junit.xml'
+    if junit.exists():
+        tree = ET.parse(junit)
+        failures = int(tree.getroot().get('failures', 0))
+        errors = int(tree.getroot().get('errors', 0))
+        status = 'PASS' if failures == 0 and errors == 0 else 'FAIL'
+        print(f"\nTest Status: {status}")
+
+    print('='*60)
+    return {'status': status if junit.exists() else 'unknown'}
+
+
+def main():
+    """CLI entry point."""
+    if len(sys.argv) < 2:
+        print("Usage: analyze_results.py <results_dir>")
+        sys.exit(1)
+
+    try:
+        result = analyze_results(sys.argv[1])
+        sys.exit(0 if result.get('status') == 'PASS' else 1)
+    except Exception as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/python/clusterloader2/image_pull/run_test.py b/modules/python/clusterloader2/image_pull/run_test.py
new file mode 100644
index 0000000000..139ac57ba1
--- /dev/null
+++ b/modules/python/clusterloader2/image_pull/run_test.py
@@ -0,0 +1,125 @@
+"""Run ClusterLoader2 image-pull test."""
+
+import os
+import sys
+import shutil
+import logging
+from pathlib import Path
+
+
+def _copy_files(src_files: list, src_dir: Path, dst_dir: Path) -> None:
+    """Copy multiple files from src to dst directory."""
+    dst_dir.mkdir(parents=True, exist_ok=True)
+    for f in src_files:
+        src = src_dir / f if isinstance(f, str) else f
+        if src.exists():
+            shutil.copy(src, dst_dir / src.name)
+            print(f" - {src.name}")
+
+
+def setup_config_files(scenario_dir: Path, cl2_config_dir: Path, root_dir: Path) -> None:
+    """Copy configuration files for the test."""
+    print(f"Setting up config in {cl2_config_dir}...")
+
+    # Copy scenario files
+    _copy_files(['image-pull.yaml', 'deployment.yaml', 'containerd-measurements.yaml'],
+                scenario_dir, cl2_config_dir)
+
+    # Copy kubelet measurements from modules
+    kubelet_src = root_dir / 'modules/python/clusterloader2/cri/config/kubelet-measurement.yaml'
+    _copy_files([kubelet_src], root_dir, cl2_config_dir)
+
+
+def run_cl2_test(
+    kubeconfig: str,
+    root_dir: str,
+    scenario_name: str = 'image-pull-test',
+    cl2_image: str = 'ghcr.io/azure/clusterloader2:v20250311',
+    prometheus_memory: str = '2Gi',
+    storage_provisioner: str = 'kubernetes.io/azure-disk',
+    storage_volume_type: str = 'StandardSSD_LRS'
+) -> bool:
+    """Run ClusterLoader2 image-pull test."""
+    try:
+        from clusterloader2.utils import run_cl2_command
+    except ImportError:
+        print("Error: Could not import clusterloader2.utils")
+        return False
+
+    # Setup paths
+    root_path = Path(root_dir)
+    scenario_dir = root_path / 
'scenarios/perf-eval' / scenario_name + cl2_config_dir = scenario_dir / 'cl2-config' + results_dir = scenario_dir / 'results' + results_dir.mkdir(parents=True, exist_ok=True) + + # Configure logging + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[logging.FileHandler(results_dir / 'cl2.log', mode='w'), logging.StreamHandler()] + ) + + try: + setup_config_files(scenario_dir, cl2_config_dir, root_path) + + print(f"\n{'='*60}") + print(f"Starting ClusterLoader2 Test") + print(f"Results: {results_dir}") + print(f"{'='*60}\n") + + run_cl2_command( + kubeconfig=kubeconfig, + cl2_image=cl2_image, + cl2_config_dir=str(cl2_config_dir), + cl2_report_dir=str(results_dir), + provider='aks', + cl2_config_file='image-pull.yaml', + enable_prometheus=True, + scrape_kubelets=True, + scrape_containerd=True, + tear_down_prometheus=False, + extra_flags=f"--prometheus-memory-request={prometheus_memory} " + f"--prometheus-storage-class-provisioner={storage_provisioner} " + f"--prometheus-storage-class-volume-type={storage_volume_type}" + ) + + print(f"\nTest completed - Results in: {results_dir}") + return True + + except Exception as e: + print(f"Error: {e}") + return False + + +def main(): + """CLI entry point.""" + import argparse + + parser = argparse.ArgumentParser(description='Run ClusterLoader2 image-pull test') + parser.add_argument('--kubeconfig', default=os.path.expanduser('~/.kube/config')) + parser.add_argument('--root-dir', default=os.environ.get('ROOT_DIR', os.getcwd())) + parser.add_argument('--scenario', default='image-pull-test') + parser.add_argument('--cl2-image', default='ghcr.io/azure/clusterloader2:v20250311') + parser.add_argument('--prometheus-memory', default='2Gi') + parser.add_argument('--storage-provisioner', default='kubernetes.io/azure-disk') + parser.add_argument('--storage-volume-type', default='StandardSSD_LRS') + + args = parser.parse_args() + sys.path.insert(0, os.path.join(args.root_dir, 'modules/python')) + + success = run_cl2_test( + kubeconfig=args.kubeconfig, + root_dir=args.root_dir, + scenario_name=args.scenario, + cl2_image=args.cl2_image, + prometheus_memory=args.prometheus_memory, + storage_provisioner=args.storage_provisioner, + storage_volume_type=args.storage_volume_type + ) + + sys.exit(0 if success else 1) + + +if __name__ == '__main__': + main() diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index 8212b5ae7f..df5f582e54 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -25,7 +25,7 @@ def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file="config.yaml", overrides=False, enable_prometheus=False, tear_down_prometheus=True, enable_exec_service=False, scrape_kubelets=False, - scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False): + scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False, extra_flags=""): docker_client = DockerClient() command = f"""--provider={provider} --v=2 @@ -45,6 +45,9 @@ def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provi if overrides: command += " --testoverrides=/root/perf-tests/clusterloader2/config/overrides.yaml" + if extra_flags: + command += f" {extra_flags}" + volumes = { kubeconfig: {'bind': '/root/.kube/config', 'mode': 'rw'}, cl2_config_dir: {'bind': '/root/perf-tests/clusterloader2/config', 'mode': 'rw'}, diff --git a/scenarios/perf-eval/image-pull-test/README.md 
b/scenarios/perf-eval/image-pull-test/README.md
new file mode 100644
index 0000000000..1edae1a049
--- /dev/null
+++ b/scenarios/perf-eval/image-pull-test/README.md
@@ -0,0 +1,102 @@
+# Image Pull Performance Test
+
+## Overview
+
+Measures container image pull performance on AKS clusters using ClusterLoader2.
+
+## Test Scenario
+
+Creates 10 Deployments with 1 replica each (10 pods total), pulling a large container image to measure:
+- How fast images are pulled across cluster nodes
+- Pod startup latency when pulling large images
+- Containerd throughput during parallel image pulls
+
+### Default Configuration
+
+| Parameter | Value |
+|-----------|-------|
+| Deployments | 10 |
+| Replicas per deployment | 1 |
+| Total pods | 10 |
+| QPS (deployment creation rate) | 10 |
+| Pod startup timeout | 3 minutes |
+| Metrics collection wait | 10 minutes |
+| Test image | pytorch-large:2.0.0 (~15GB) |
+
+To modify, edit `image-pull.yaml`:
+- `replicasPerNamespace`: Number of deployments
+- `Replicas`: Pods per deployment
+- `qps`: Deployment creation rate
+
+## Metrics Collected
+
+| Metric | Source | Description |
+|--------|--------|-------------|
+| Kubelet Image Pull Duration | kubelet:10250 | P50/P90/P99 latency per node |
+| Containerd Throughput | containerd:10257 | MB/s, total data, pull count |
+| Network Plugin Operations | containerd:10257 | Pod network setup/teardown time |
+| Pod Startup Latency | API server | End-to-end pod scheduling time |
+
+## Prerequisites
+
+- AKS cluster with containerd runtime
+- Azure Container Registry with test image
+- kubectl, terraform, az CLI, docker
+
+## Configuration
+
+### 1. Set your container image
+
+Edit `image-pull.yaml` line 37:
+```yaml
+Image: <acr-name>.azurecr.io/<repository>:<tag>
+```
+
+### 2. Set your ACR (in notebook)
+
+Edit `run_locally.ipynb` cell 9:
+```bash
+export ACR_NAME=<your-acr-name>
+export ACR_SUBSCRIPTION_ID=<acr-subscription-id>  # if different from AKS subscription
+```
+
+### 3. Attach ACR to AKS
+
+The notebook handles this automatically, or run manually:
+```bash
+az aks update -g <resource-group> -n <cluster-name> --attach-acr <acr-name>
+```
+
+## Usage
+
+### Run via Notebook
+```bash
+# Open and run cells sequentially
+jupyter notebook run_locally.ipynb
+```
+
+### Run via CLI
+```bash
+export ROOT_DIR=$(git rev-parse --show-toplevel)
+./run_cl2.sh          # Run test
+./analyze_results.sh  # Analyze results
+```
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `image-pull.yaml` | CL2 test config - defines workload and measurements |
+| `deployment.yaml` | Pod template for image pull test |
+| `containerd-measurements.yaml` | Prometheus queries for containerd metrics |
+| `run_cl2.sh` | Shell wrapper to run test |
+| `analyze_results.sh` | Shell wrapper to analyze results |
+| `run_locally.ipynb` | Interactive notebook for local testing |
+| `terraform-inputs/azure.tfvars` | AKS cluster configuration |
+
+## Output
+
+Results are written to the `results/` directory:
+- `junit.xml` - Test pass/fail status
+- `PodStartupLatency_*.json` - Pod startup metrics
+- `GenericPrometheusQuery_*.json` - Prometheus metric snapshots
diff --git a/scenarios/perf-eval/image-pull-test/analyze_results.sh b/scenarios/perf-eval/image-pull-test/analyze_results.sh
new file mode 100755
index 0000000000..85bc7f5904
--- /dev/null
+++ b/scenarios/perf-eval/image-pull-test/analyze_results.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Analyze ClusterLoader2 results
+#
+
+set -e
+
+if [ -z "$ROOT_DIR" ]; then
+    echo "Error: ROOT_DIR is not set. Please run the setup cell first."
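+    # ROOT_DIR is normally exported by the notebook's setup cell; for standalone
+    # runs, set it to the repo root first: export ROOT_DIR=$(git rev-parse --show-toplevel)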
+ exit 1 +fi + +export PYTHONPATH="${ROOT_DIR}/modules/python:${PYTHONPATH}" +RESULTS_DIR="${ROOT_DIR}/scenarios/perf-eval/image-pull-test/results" + +python3 -m clusterloader2.image_pull.analyze_results "$RESULTS_DIR" "$@" + +exit $? diff --git a/scenarios/perf-eval/image-pull-test/containerd-measurements.yaml b/scenarios/perf-eval/image-pull-test/containerd-measurements.yaml new file mode 100644 index 0000000000..0d559a7bfb --- /dev/null +++ b/scenarios/perf-eval/image-pull-test/containerd-measurements.yaml @@ -0,0 +1,78 @@ +{{$action := .action}} # start, gather + +steps: + - name: {{$action}} Containerd Measurements + measurements: + # ContainerdCriImagePullingThroughput - WORKS (has histogram buckets) + - identifier: ContainerdCriImagePullingThroughput + method: GenericPrometheusQuery + params: + action: {{$action}} + metricName: ContainerdCriImagePullingThroughput + metricVersion: v1 + unit: MB/s + queries: + - name: Perc100 + query: histogram_quantile(1, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Sum + query: sum(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}) + - name: Count + query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) + - name: Average + query: sum(rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])) + + # ContainerdCriNetworkPluginOperations - Sum/Count only (histograms may not work) + - identifier: ContainerdCriNetworkPluginOperations + method: GenericPrometheusQuery + params: + action: {{$action}} + metricName: ContainerdCriNetworkPluginOperations + metricVersion: v1 + unit: s + dimensions: + - operation_type + queries: + - name: Sum + query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}) by (operation_type) + - name: Count + query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}) by (operation_type) + - name: Average + query: sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}[%v])) by (operation_type) / sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}[%v])) by (operation_type) + + # ContainerdCriSandboxCreateNetwork - Sum/Count only + - identifier: ContainerdCriSandboxCreateNetwork + method: GenericPrometheusQuery + params: + action: {{$action}} + metricName: ContainerdCriSandboxCreateNetwork + metricVersion: v1 + unit: s + queries: + - name: Sum + query: sum(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}) + - name: Count + query: sum(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}) + - name: Average + query: sum(rate(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}[%v])) + + # ContainerdCriSandboxDeleteNetwork - Sum/Count only + - 
identifier: ContainerdCriSandboxDeleteNetwork
+        method: GenericPrometheusQuery
+        params:
+          action: {{$action}}
+          metricName: ContainerdCriSandboxDeleteNetwork
+          metricVersion: v1
+          unit: s
+          queries:
+            - name: Sum
+              query: sum(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"})
+            - name: Count
+              query: sum(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"})
+            - name: Average
+              query: sum(rate(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}[%v]))
diff --git a/scenarios/perf-eval/image-pull-test/deployment.yaml b/scenarios/perf-eval/image-pull-test/deployment.yaml
new file mode 100644
index 0000000000..a355709f01
--- /dev/null
+++ b/scenarios/perf-eval/image-pull-test/deployment.yaml
@@ -0,0 +1,31 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{.Name}}
+  labels:
+    group: {{.Group}}
+spec:
+  replicas: {{.Replicas}}
+  selector:
+    matchLabels:
+      group: {{.Group}}
+  template:
+    metadata:
+      labels:
+        group: {{.Group}}
+    spec:
+      containers:
+      - name: test-container
+        image: {{.Image}}
+        imagePullPolicy: Always
+        command: ["sleep"]
+        args: ["3600"]
+        resources:
+          requests:
+            cpu: 100m
+            memory: 100Mi
+        readinessProbe:
+          exec:
+            command: ["true"]
+          initialDelaySeconds: 5
+          periodSeconds: 10
diff --git a/scenarios/perf-eval/image-pull-test/image-pull.yaml b/scenarios/perf-eval/image-pull-test/image-pull.yaml
new file mode 100644
index 0000000000..0e7c3e5ca5
--- /dev/null
+++ b/scenarios/perf-eval/image-pull-test/image-pull.yaml
@@ -0,0 +1,71 @@
+name: image-pull-test
+namespace:
+  number: 1
+tuningSets:
+- name: UniformQPS
+  qpsLoad:
+    qps: 10
+steps:
+- measurements:
+  - identifier: PodStartupLatency
+    method: PodStartupLatency
+    params:
+      action: start
+      labelSelector: group = image-pull
+      threshold: 3m
+- module:
+    path: containerd-measurements.yaml
+    params:
+      action: start
+- module:
+    path: kubelet-measurement.yaml
+    params:
+      action: start
+- name: Start deployment
+  phases:
+  - namespaceRange:
+      min: 1
+      max: 1
+    replicasPerNamespace: 10
+    tuningSet: UniformQPS
+    objectBundle:
+    - basename: image-pull-deployment
+      objectTemplatePath: deployment.yaml
+      templateFillMap:
+        Replicas: 1
+        Group: image-pull
+        Image: <acr-name>.azurecr.io/<repository>:<tag>  # TODO: Replace with your image
+- name: Wait for pods to start and metrics to be collected
+  measurements:
+  - identifier: WaitForRunningDeployments
+    method: WaitForControlledPodsRunning
+    params:
+      action: start
+      apiVersion: apps/v1
+      kind: Deployment
+      labelSelector: group = image-pull
+      operationTimeout: 5m
+- measurements:
+  - identifier: WaitForRunningDeployments
+    method: WaitForControlledPodsRunning
+    params:
+      action: gather
+- name: Wait for containerd metrics to accumulate (histogram buckets need multiple scrapes)
+  measurements:
+  - identifier: Sleep
+    method: Sleep
+    params:
+      duration: 10m
+- module:
+    path: containerd-measurements.yaml
+    params:
+      action: gather
+- module:
+    path: kubelet-measurement.yaml
+    params:
+      action: gather
+- measurements:
+  - identifier: PodStartupLatency
+    method: PodStartupLatency
+    params:
+      action: gather
diff --git a/scenarios/perf-eval/image-pull-test/run_cl2.sh b/scenarios/perf-eval/image-pull-test/run_cl2.sh
new file mode 100755
index 0000000000..8d0296282b
--- /dev/null
+++ b/scenarios/perf-eval/image-pull-test/run_cl2.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Run ClusterLoader2 image-pull test
+#
+
+set -e
+
+if [ -z "$ROOT_DIR" ]; then
+    echo 
"Error: ROOT_DIR is not set. Please run the setup cell first." + exit 1 +fi + +# Install docker for root (required for sudo access to docker socket) +echo "Ensuring 'docker' library is installed for root..." +sudo python3 -m pip install docker >/dev/null 2>&1 + +export KUBECONFIG_PATH=${KUBECONFIG:-$HOME/.kube/config} +export PYTHONPATH="$ROOT_DIR/modules/python:$PYTHONPATH" + +sudo -E PYTHONPATH="$PYTHONPATH" python3 -m clusterloader2.image_pull.run_test \ + --kubeconfig "$KUBECONFIG_PATH" \ + --root-dir "$ROOT_DIR" \ + --scenario "image-pull-test" \ + --cl2-image "ghcr.io/azure/clusterloader2:v20250311" \ + --prometheus-memory "2Gi" \ + --storage-provisioner "kubernetes.io/azure-disk" \ + --storage-volume-type "StandardSSD_LRS" + +exit $? diff --git a/scenarios/perf-eval/image-pull-test/run_locally.ipynb b/scenarios/perf-eval/image-pull-test/run_locally.ipynb new file mode 100644 index 0000000000..3777cd34bd --- /dev/null +++ b/scenarios/perf-eval/image-pull-test/run_locally.ipynb @@ -0,0 +1,411 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c5dfa25a", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Check prerequisites\n", + "echo \"Checking prerequisites...\"\n", + "echo \"===========================================\"\n", + "\n", + "command -v terraform >/dev/null && terraform version || echo 'terraform missing'\n", + "command -v az >/dev/null && az version --output table || echo 'azure cli missing'\n", + "command -v jq >/dev/null && jq --version || echo 'jq missing'\n", + "command -v kubectl >/dev/null && kubectl version --client || echo 'kubectl missing'\n", + "\n", + "# Set ROOT_DIR to the repository root\n", + "if git rev-parse --show-toplevel >/dev/null 2>&1; then\n", + " export ROOT_DIR=$(git rev-parse --show-toplevel)\n", + "else\n", + " # Fallback: go up 3 levels from scenarios/perf-eval/image-pull-test\n", + " export ROOT_DIR=$(cd ../../.. && pwd)\n", + "fi\n", + "\n", + "echo \"Repository Root: $ROOT_DIR\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c6fa2de", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Install Python dependencies\n", + "if [ -z \"$ROOT_DIR\" ]; then\n", + " echo \"Error: ROOT_DIR is not set. 
Please run the first cell to initialize variables.\"\n", + " exit 1\n", + "fi\n", + "\n", + "echo \"Installing python requirements...\"\n", + "python3 -m pip install --user -r $ROOT_DIR/modules/python/requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50c0917f", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Define test scenario variables\n", + "export SCENARIO_TYPE=perf-eval\n", + "export SCENARIO_NAME=image-pull-test\n", + "export OWNER=$(whoami)\n", + "export RUN_ID=${RUN_ID:-$(date +%s)}\n", + "export CLOUD=azure\n", + "export REGION=eastus2\n", + "export AZURE_SUBSCRIPTION_ID=\"c0d4b923-b5ea-4f8f-9b56-5390a9bf2248\"\n", + "export SKU_TIER=Standard\n", + "export KUBERNETES_VERSION=1.31\n", + "export NETWORK_POLICY=\"\"\n", + "export NETWORK_DATAPLANE=azure\n", + "export TERRAFORM_MODULES_DIR=$ROOT_DIR/modules/terraform/$CLOUD\n", + "export TERRAFORM_INPUT_FILE=$ROOT_DIR/scenarios/$SCENARIO_TYPE/$SCENARIO_NAME/terraform-inputs/${CLOUD}.tfvars\n", + "export SYSTEM_NODE_POOL=null\n", + "export USER_NODE_POOL=null\n", + "\n", + "echo \"Scenario: $SCENARIO_TYPE/$SCENARIO_NAME\"\n", + "echo \"Run ID: $RUN_ID\"\n", + "echo \"Terraform Input: $TERRAFORM_INPUT_FILE\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae75239f", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Azure login\n", + "echo \"Azure Authentication\"\n", + "if az account show >/dev/null 2>&1; then\n", + " echo \"Already logged in\"\n", + " az account set -s $AZURE_SUBSCRIPTION_ID\n", + "else\n", + " echo \"Logging into Azure...\"\n", + " az login --use-device-code\n", + " az account set -s $AZURE_SUBSCRIPTION_ID\n", + "fi\n", + "export ARM_SUBSCRIPTION_ID=$(az account show --query id -o tsv)\n", + "export ARM_TENANT_ID=$(az account show --query tenantId -o tsv)\n", + "az account show --query '{Name:name, Id:id}' --output table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f69d63b", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Create resource group\n", + "echo \"Creating resource group $RUN_ID in $REGION\"\n", + "az group create --name $RUN_ID --location $REGION --tags run_id=$RUN_ID scenario=${SCENARIO_TYPE}-${SCENARIO_NAME} owner=$OWNER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ce7afe4", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Prepare Terraform input JSON\n", + "echo \"Preparing Terraform input JSON\"\n", + "\n", + "# Ensure variables are set to defaults if empty to avoid jq errors\n", + ": ${NETWORK_POLICY:=\"\"}\n", + ": ${NETWORK_DATAPLANE:=\"\"}\n", + ": ${SYSTEM_NODE_POOL:=\"null\"}\n", + ": ${USER_NODE_POOL:=\"null\"}\n", + "\n", + "export INPUT_JSON=$(jq -n \\\n", + " --arg run_id \"$RUN_ID\" \\\n", + " --arg region \"$REGION\" \\\n", + " --arg aks_sku_tier \"$SKU_TIER\" \\\n", + " --arg aks_kubernetes_version \"$KUBERNETES_VERSION\" \\\n", + " --arg aks_network_policy \"$NETWORK_POLICY\" \\\n", + " --arg aks_network_dataplane \"$NETWORK_DATAPLANE\" \\\n", + " --arg k8s_machine_type \"Standard_D4s_v3\" \\\n", + " --arg k8s_os_disk_type \"Managed\" \\\n", + " --argjson aks_cli_system_node_pool \"$SYSTEM_NODE_POOL\" \\\n", + " --argjson 
aks_cli_user_node_pool \"$USER_NODE_POOL\" \\\n", + " '{run_id:$run_id,region:$region,aks_sku_tier:$aks_sku_tier,aks_kubernetes_version:$aks_kubernetes_version,aks_network_policy:$aks_network_policy,aks_network_dataplane:$aks_network_dataplane,k8s_machine_type:$k8s_machine_type,k8s_os_disk_type:$k8s_os_disk_type,aks_cli_system_node_pool:$aks_cli_system_node_pool,aks_cli_user_node_pool:$aks_cli_user_node_pool}' | jq 'with_entries(select(.value != null and .value != \"\"))')\n", + "echo $INPUT_JSON | jq ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a78b6c4b", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Terraform init & plan\n", + "pushd $TERRAFORM_MODULES_DIR\n", + "terraform init\n", + "terraform plan -var json_input=\"$(echo $INPUT_JSON | jq -c .)\" -var-file $TERRAFORM_INPUT_FILE\n", + "popd\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75097d61", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Terraform apply\n", + "pushd $TERRAFORM_MODULES_DIR\n", + "terraform apply -var json_input=\"$(echo $INPUT_JSON | jq -c .)\" -var-file $TERRAFORM_INPUT_FILE --auto-approve\n", + "popd\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79bf5007", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Attach ACR permissions so AKS can pull private images\n", + "# TODO: Set your ACR name and subscription ID\n", + "export ACR_NAME=${ACR_NAME:-}\n", + "export ACR_SUBSCRIPTION_ID=${ACR_SUBSCRIPTION_ID:-}\n", + "\n", + "# Automatically find the cluster name in the resource group\n", + "export CLUSTER_NAME=$(az aks list --resource-group $RUN_ID --query \"[0].name\" -o tsv)\n", + "\n", + "if [ -z \"$CLUSTER_NAME\" ]; then\n", + " echo \"Error: No AKS cluster found in resource group $RUN_ID\"\n", + " exit 1\n", + "fi\n", + "\n", + "# Resolve ACR Resource ID (Required for cross-subscription attach)\n", + "if [ -n \"$ACR_SUBSCRIPTION_ID\" ]; then\n", + " echo \"Looking up ACR '$ACR_NAME' in subscription '$ACR_SUBSCRIPTION_ID'...\"\n", + " ACR_ID=$(az acr show --name $ACR_NAME --subscription $ACR_SUBSCRIPTION_ID --query id -o tsv)\n", + "else\n", + " echo \"Looking up ACR '$ACR_NAME' in current subscription...\"\n", + " ACR_ID=$(az acr show --name $ACR_NAME --query id -o tsv 2>/dev/null)\n", + "fi\n", + "\n", + "if [ -z \"$ACR_ID\" ]; then\n", + " echo \"Warning: Could not find ACR ID. 
Attempting to attach by name...\"\n", + " ACR_ID=$ACR_NAME\n", + "else\n", + " echo \"Found ACR ID: $ACR_ID\"\n", + "fi\n", + "\n", + "echo \"Attaching registry to cluster $CLUSTER_NAME...\"\n", + "az aks update --resource-group $RUN_ID --name $CLUSTER_NAME --attach-acr $ACR_ID" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2735b27", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Get kubeconfig\n", + "if [ -z \"$CLUSTER_NAME\" ]; then\n", + " export CLUSTER_NAME=$(az aks list --resource-group $RUN_ID --query \"[0].name\" -o tsv)\n", + "fi\n", + "\n", + "echo \"Getting credentials for $CLUSTER_NAME...\"\n", + "az aks get-credentials --resource-group $RUN_ID --name $CLUSTER_NAME --overwrite-existing\n", + "kubectl get nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96222b71", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Clean up previous Prometheus resources\n", + "echo \"Removing stale monitoring.coreos.com resources\"\n", + "for crd in alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com prometheuses.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com probes.monitoring.coreos.com; do\n", + " kubectl delete crd $crd --ignore-not-found\n", + "done\n", + "for cr in prometheus-operator prometheus-operator-psp prometheus-operator-cm; do\n", + " kubectl delete clusterrole $cr --ignore-not-found\n", + " kubectl delete clusterrolebinding $cr --ignore-not-found\n", + "done" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82504470", + "metadata": { + "language": "python", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Run ClusterLoader2 image-pull scenario\n", + "$ROOT_DIR/scenarios/perf-eval/image-pull-test/run_cl2.sh\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d36979ac", + "metadata": { + "language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Show result files\n", + "ls -lah $ROOT_DIR/scenarios/perf-eval/image-pull-test/results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8b67a1f", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Debug: Check if Prometheus has the raw containerd metrics\n", + "echo \"Checking Prometheus for containerd histogram metrics...\"\n", + "\n", + "# Port-forward to Prometheus\n", + "kubectl port-forward -n monitoring svc/prometheus-operated 9090:9090 &\n", + "PF_PID=$!\n", + "sleep 3\n", + "\n", + "echo \"\"\n", + "echo \"=== containerd_cri_image_pull_duration_seconds_bucket ===\"\n", + "curl -s 'http://localhost:9090/api/v1/query?query=containerd_cri_image_pull_duration_seconds_bucket' | jq -r '.data.result | length' | xargs -I {} echo \"Found {} time series\"\n", + "\n", + "echo \"\"\n", + "echo \"=== kubelet_runtime_operations_duration_seconds_bucket ===\"\n", + "curl -s 'http://localhost:9090/api/v1/query?query=kubelet_runtime_operations_duration_seconds_bucket{operation_type=\"pull_image\"}' | jq -r '.data.result | length' | xargs -I {} echo \"Found {} time series\"\n", + "\n", + "# Kill port-forward\n", + "kill $PF_PID 2>/dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d986df0a", + "metadata": { + 
"language": "shellscript", + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Cleanup resources\n", + "pushd $TERRAFORM_MODULES_DIR\n", + "terraform destroy -var json_input=\"$(echo $INPUT_JSON | jq -c .)\" -var-file $TERRAFORM_INPUT_FILE --auto-approve\n", + "popd\n", + "az group delete --name $RUN_ID -y\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b31c9c8", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# Analyze Results\n", + "$ROOT_DIR/scenarios/perf-eval/image-pull-test/analyze_results.sh\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Bash", + "language": "bash", + "name": "bash" + }, + "language_info": { + "codemirror_mode": "shell", + "file_extension": ".sh", + "mimetype": "text/x-sh", + "name": "bash" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..704d049c4c --- /dev/null +++ b/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars @@ -0,0 +1,27 @@ +scenario_type = "perf-eval" +scenario_name = "image-pull-test" +deletion_delay = "2h" +owner = "telescope" + +aks_config_list = [ + { + role = "client" + aks_name = "img-pull-10" + dns_prefix = "kperf" + subnet_name = "aks-network" + sku_tier = "Standard" + network_profile = { + network_plugin = "azure" + network_plugin_mode = "overlay" + } + default_node_pool = { + name = "userpool0" + node_count = 10 + vm_size = "Standard_D4s_v3" + os_disk_type = "Managed" + only_critical_addons_enabled = false + temporary_name_for_rotation = "temp" + } + extra_node_pool = [] + } +] diff --git a/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json b/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json new file mode 100644 index 0000000000..9d4a536039 --- /dev/null +++ b/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json @@ -0,0 +1,12 @@ +{ + "run_id": "test-run", + "region": "eastus2", + "aks_sku_tier": "Standard", + "aks_kubernetes_version": "1.29", + "aks_network_policy": "none", + "aks_network_dataplane": "azure", + "k8s_machine_type": "Standard_D4s_v3", + "k8s_os_disk_type": "Managed", + "aks_cli_system_node_pool": null, + "aks_cli_user_node_pool": null +} From 695a5d682b3382d1b53af18cdd03e75b6b8d9e95 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Wed, 10 Dec 2025 03:11:45 +0000 Subject: [PATCH 02/30] Restructure image-pull scenario for ADO pipeline execution --- .../image_pull/analyze_results.py | 103 ----- .../config/containerd-measurements.yaml | 78 ++++ .../image_pull/config}/deployment.yaml | 0 .../image_pull/config}/image-pull.yaml | 2 +- .../config/kubelet-measurement.yaml | 108 +++++ .../clusterloader2/image_pull/image_pull.py | 172 ++++++++ .../clusterloader2/image_pull/run_test.py | 125 ------ modules/python/clusterloader2/utils.py | 5 +- modules/python/tests/test_image_pull.py | 198 +++++++++ pipelines/system/new-pipeline-test.yml | 35 +- scenarios/perf-eval/image-pull-test/README.md | 88 +--- .../image-pull-test/analyze_results.sh | 18 - .../containerd-measurements.yaml | 78 ---- .../perf-eval/image-pull-test/run_cl2.sh | 29 -- .../image-pull-test/run_locally.ipynb | 411 ------------------ .../clusterloader2/image_pull/collect.yml | 32 ++ .../clusterloader2/image_pull/execute.yml | 34 ++ .../image-pull/collect-clusterloader2.yml | 17 + 
.../image-pull/execute-clusterloader2.yml | 17 + .../image-pull/validate-resources.yml | 19 + 20 files changed, 714 insertions(+), 855 deletions(-) delete mode 100644 modules/python/clusterloader2/image_pull/analyze_results.py create mode 100644 modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml rename {scenarios/perf-eval/image-pull-test => modules/python/clusterloader2/image_pull/config}/deployment.yaml (100%) rename {scenarios/perf-eval/image-pull-test => modules/python/clusterloader2/image_pull/config}/image-pull.yaml (94%) create mode 100644 modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml create mode 100644 modules/python/clusterloader2/image_pull/image_pull.py delete mode 100644 modules/python/clusterloader2/image_pull/run_test.py create mode 100644 modules/python/tests/test_image_pull.py delete mode 100755 scenarios/perf-eval/image-pull-test/analyze_results.sh delete mode 100644 scenarios/perf-eval/image-pull-test/containerd-measurements.yaml delete mode 100755 scenarios/perf-eval/image-pull-test/run_cl2.sh delete mode 100644 scenarios/perf-eval/image-pull-test/run_locally.ipynb create mode 100644 steps/engine/clusterloader2/image_pull/collect.yml create mode 100644 steps/engine/clusterloader2/image_pull/execute.yml create mode 100644 steps/topology/image-pull/collect-clusterloader2.yml create mode 100644 steps/topology/image-pull/execute-clusterloader2.yml create mode 100644 steps/topology/image-pull/validate-resources.yml diff --git a/modules/python/clusterloader2/image_pull/analyze_results.py b/modules/python/clusterloader2/image_pull/analyze_results.py deleted file mode 100644 index 3085581a53..0000000000 --- a/modules/python/clusterloader2/image_pull/analyze_results.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Analyze ClusterLoader2 image-pull test results.""" - -import json -import sys -from pathlib import Path -import xml.etree.ElementTree as ET - -try: - import pandas as pd - HAS_PANDAS = True -except ImportError: - HAS_PANDAS = False - - -def _load_json(results_dir: str, pattern: str) -> dict: - """Load most recent JSON matching pattern.""" - files = sorted(Path(results_dir).glob(pattern), key=lambda x: x.stat().st_mtime, reverse=True) - return json.load(open(files[0])) if files else {} - - -def _print(data: list, cols: list = None): - """Print data as table.""" - if not data: - return - if HAS_PANDAS: - df = pd.DataFrame(data) - print(df.to_string(index=False) if not cols else df[cols].to_string(index=False)) - else: - for row in data: - print(" " + ", ".join(f"{k}={v}" for k, v in row.items())) - - -def analyze_results(results_dir: str) -> dict: - """Analyze test results and return metrics.""" - results = Path(results_dir) - if not results.exists(): - raise FileNotFoundError(f"Not found: {results_dir}") - - print(f"\n{'='*60}") - print(f"Results: {results_dir}") - print('='*60) - - # Pod startup latency - data = _load_json(results_dir, "PodStartupLatency_*.json") - if items := data.get('dataItems'): - print("\nPod Startup Latency:") - _print([{ - 'Metric': i['labels']['Metric'], - 'P50': f"{i['data']['Perc50']:.0f}ms", - 'P90': f"{i['data']['Perc90']:.0f}ms", - 'P99': f"{i['data']['Perc99']:.0f}ms" - } for i in items]) - - # Image pull throughput - data = _load_json(results_dir, "*ContainerdCriImagePullingThroughput_*.json") - if items := data.get('dataItems'): - print("\nImage Pulling Throughput:") - for i in items: - d = i.get('data', {}) - if s := d.get('Sum'): - print(f" {s:.2f} {i.get('unit', '')} total ({d.get('Count', 
0)} pulls)") - - # Kubelet image pull duration - data = _load_json(results_dir, "*KubeletRuntimeOperationDurationWithPullImage_*.json") - if items := data.get('dataItems'): - print("\nKubelet Image Pull Duration (per node):") - nodes = [{ - 'Node': i['labels']['node'][-8:], # Last 8 chars of node name - 'P50': f"{i['data']['Perc50']:.1f}s", - 'P90': f"{i['data']['Perc90']:.1f}s", - 'P99': f"{i['data']['Perc99']:.1f}s" - } for i in items if i.get('labels', {}).get('node') and 'Perc50' in i.get('data', {})] - _print(nodes) - - # Test status - junit = results / 'junit.xml' - if junit.exists(): - tree = ET.parse(junit) - failures = int(tree.getroot().get('failures', 0)) - errors = int(tree.getroot().get('errors', 0)) - status = 'PASS' if failures == 0 and errors == 0 else 'FAIL' - print(f"\nTest Status: {status}") - - print('='*60) - return {'status': status if junit.exists() else 'unknown'} - - -def main(): - """CLI entry point.""" - if len(sys.argv) < 2: - print("Usage: analyze_results.py ") - sys.exit(1) - - try: - result = analyze_results(sys.argv[1]) - sys.exit(0 if result.get('status') == 'PASS' else 1) - except Exception as e: - print(f"Error: {e}") - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml b/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml new file mode 100644 index 0000000000..deb7e8e837 --- /dev/null +++ b/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml @@ -0,0 +1,78 @@ +--- +{{$action := .action}} +steps: + - name: {{$action}} Containerd Measurements + measurements: + # ContainerdCriImagePullingThroughput - WORKS (has histogram buckets) + - identifier: ContainerdCriImagePullingThroughput + method: GenericPrometheusQuery + params: + action: {{$action}} + metricName: ContainerdCriImagePullingThroughput + metricVersion: v1 + unit: MB/s + queries: + - name: Perc100 + query: histogram_quantile(1, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Sum + query: sum(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}) + - name: Count + query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) + - name: Average + query: sum(rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])) + + # ContainerdCriNetworkPluginOperations - Sum/Count only (histograms may not work) + - identifier: ContainerdCriNetworkPluginOperations + method: GenericPrometheusQuery + params: + action: {{$action}} + metricName: ContainerdCriNetworkPluginOperations + metricVersion: v1 + unit: s + dimensions: + - operation_type + queries: + - name: Sum + query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}) by (operation_type) + - name: Count + query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}) by (operation_type) + - name: Average + 
query: sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}[%v])) by (operation_type) / sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}[%v])) by (operation_type) + + # ContainerdCriSandboxCreateNetwork - Sum/Count only + - identifier: ContainerdCriSandboxCreateNetwork + method: GenericPrometheusQuery + params: + action: {{$action}} + metricName: ContainerdCriSandboxCreateNetwork + metricVersion: v1 + unit: s + queries: + - name: Sum + query: sum(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}) + - name: Count + query: sum(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}) + - name: Average + query: sum(rate(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}[%v])) + + # ContainerdCriSandboxDeleteNetwork - Sum/Count only + - identifier: ContainerdCriSandboxDeleteNetwork + method: GenericPrometheusQuery + params: + action: {{$action}} + metricName: ContainerdCriSandboxDeleteNetwork + metricVersion: v1 + unit: s + queries: + - name: Sum + query: sum(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}) + - name: Count + query: sum(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}) + - name: Average + query: sum(rate(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}[%v])) diff --git a/scenarios/perf-eval/image-pull-test/deployment.yaml b/modules/python/clusterloader2/image_pull/config/deployment.yaml similarity index 100% rename from scenarios/perf-eval/image-pull-test/deployment.yaml rename to modules/python/clusterloader2/image_pull/config/deployment.yaml diff --git a/scenarios/perf-eval/image-pull-test/image-pull.yaml b/modules/python/clusterloader2/image_pull/config/image-pull.yaml similarity index 94% rename from scenarios/perf-eval/image-pull-test/image-pull.yaml rename to modules/python/clusterloader2/image_pull/config/image-pull.yaml index 0e7c3e5ca5..895787b47e 100644 --- a/scenarios/perf-eval/image-pull-test/image-pull.yaml +++ b/modules/python/clusterloader2/image_pull/config/image-pull.yaml @@ -34,7 +34,7 @@ steps: templateFillMap: Replicas: 1 Group: image-pull - Image: .azurecr.io/: # TODO: Replace with your image + Image: akscritelescope.azurecr.io/e2e-test-images/resource-consumer:1.13 - name: Wait for pods to start and metrics to be collected measurements: - identifier: WaitForRunningDeployments diff --git a/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml b/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml new file mode 100644 index 0000000000..a3c8702fac --- /dev/null +++ b/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml @@ -0,0 +1,108 @@ +{{$action := .action}} # start, gather + +steps: + - name: {{$action}} Kubelet Measurements + measurements: + - Identifier: KubeletPodStartupSLIDuration + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: KubeletPodStartupSLIDuration + metricVersion: v1 + unit: s + dimensions: + - node + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(kubelet_pod_start_sli_duration_seconds_bucket[10m])) by (node, le)) + threshold: 5 + - name: Perc90 + query: histogram_quantile(0.90, 
sum(rate(kubelet_pod_start_sli_duration_seconds_bucket[10m])) by (node, le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(kubelet_pod_start_sli_duration_seconds_bucket[10m])) by (node, le)) + - Identifier: KubeletPodStartupDuration + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: KubeletPodStartupDuration + metricVersion: v1 + unit: s + dimensions: + - node + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_bucket[10m])) by (node, le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(kubelet_pod_start_duration_seconds_bucket[10m])) by (node, le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(kubelet_pod_start_duration_seconds_bucket[10m])) by (node, le)) + - Identifier: KubeletPodStartupTotalDuration + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: KubeletPodStartupTotalDuration + metricVersion: v1 + unit: s + dimensions: + - node + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(kubelet_pod_start_total_duration_seconds_bucket[10m])) by (node, le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(kubelet_pod_start_total_duration_seconds_bucket[10m])) by (node, le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(kubelet_pod_start_total_duration_seconds_bucket[10m])) by (node, le)) + - Identifier: KubeletRuntimeOperationDurationWithoutPullImage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: KubeletRuntimeOperationDurationWithoutPullImage + metricVersion: v1 + unit: s + dimensions: + - node + - operation_type + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) + - name: Sum + query: kubelet_runtime_operations_duration_seconds_sum{operation_type!="pull_image"} + - Identifier: KubeletRuntimeOperationDurationWithPullImage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: KubeletRuntimeOperationDurationWithPullImage + metricVersion: v1 + unit: s + dimensions: + - node + - operation_type + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) + - name: Sum + query: kubelet_runtime_operations_duration_seconds_sum{operation_type="pull_image"} + - Identifier: KubeletRunSandboxDuration + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: KubeletRunSandboxDuration + metricVersion: v1 + unit: s + dimensions: + - node + queries: + - name: Perc99 + query: histogram_quantile(0.99, 
sum(rate(kubelet_run_podsandbox_duration_seconds_bucket[10m])) by (node, le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(kubelet_run_podsandbox_duration_seconds_bucket[10m])) by (node, le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(kubelet_run_podsandbox_duration_seconds_bucket[10m])) by (node, le)) diff --git a/modules/python/clusterloader2/image_pull/image_pull.py b/modules/python/clusterloader2/image_pull/image_pull.py new file mode 100644 index 0000000000..76e51dc944 --- /dev/null +++ b/modules/python/clusterloader2/image_pull/image_pull.py @@ -0,0 +1,172 @@ +"""Image Pull performance test using ClusterLoader2.""" + +import argparse +import json +import os +from datetime import datetime, timezone + +from clusterloader2.utils import parse_xml_to_json, run_cl2_command, get_measurement +from utils.logger_config import get_logger, setup_logging + +setup_logging() +logger = get_logger(__name__) + + +def execute_clusterloader2( + cl2_image: str, + cl2_config_dir: str, + cl2_report_dir: str, + kubeconfig: str, + provider: str +): + """Execute ClusterLoader2 image-pull test.""" + logger.info(f"Starting image-pull test with CL2 image: {cl2_image}") + logger.info(f"Config dir: {cl2_config_dir}, Report dir: {cl2_report_dir}") + + run_cl2_command( + kubeconfig=kubeconfig, + cl2_image=cl2_image, + cl2_config_dir=cl2_config_dir, + cl2_report_dir=cl2_report_dir, + provider=provider, + cl2_config_file="image-pull.yaml", + enable_prometheus=True, + scrape_kubelets=True, + scrape_containerd=True, + tear_down_prometheus=False + ) + + logger.info(f"Test completed. Results in: {cl2_report_dir}") + + +def collect_clusterloader2( + cl2_report_dir: str, + cloud_info: str, + run_id: str, + run_url: str, + result_file: str, + deployment_count: int = 10, + replicas: int = 1 +): + """Collect and format image-pull test results for Kusto ingestion.""" + logger.info(f"Collecting results from: {cl2_report_dir}") + + details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2) + json_data = json.loads(details) + testsuites = json_data["testsuites"] + + if testsuites: + status = "success" if testsuites[0]["failures"] == 0 else "failure" + else: + raise ValueError(f"No testsuites found in the report! 
Raw data: {details}") + + template = { + "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), + "deployment_count": deployment_count, + "replicas": replicas, + "total_pods": deployment_count * replicas, + "status": status, + "group": None, + "measurement": None, + "percentile": None, + "data": None, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url + } + + content = "" + for f in os.listdir(cl2_report_dir): + file_path = os.path.join(cl2_report_dir, f) + if not file_path.endswith('.json'): + continue + + with open(file_path, 'r', encoding='utf-8') as file: + measurement, group_name = get_measurement(file_path) + if not measurement: + continue + + logger.info(f"Processing measurement: {measurement}, group: {group_name}") + data = json.loads(file.read()) + + if "dataItems" in data: + items = data["dataItems"] + if not items: + logger.info(f"No data items found in {file_path}") + continue + for item in items: + template["measurement"] = measurement + template["group"] = group_name + template["percentile"] = "dataItems" + template["data"] = item + content += json.dumps(template) + "\n" + + os.makedirs(os.path.dirname(result_file), exist_ok=True) + with open(result_file, 'w', encoding='utf-8') as file: + file.write(content) + + logger.info(f"Results written to: {result_file}") + + +def main(): + """CLI entry point with subcommands.""" + parser = argparse.ArgumentParser(description="Image Pull performance test") + subparsers = parser.add_subparsers(dest="command") + + # Execute subcommand + parser_execute = subparsers.add_parser("execute", help="Execute image-pull test") + parser_execute.add_argument("--cl2_image", type=str, required=True, + help="CL2 Docker image") + parser_execute.add_argument("--cl2_config_dir", type=str, required=True, + help="Path to CL2 config directory") + parser_execute.add_argument("--cl2_report_dir", type=str, required=True, + help="Path to CL2 report directory") + parser_execute.add_argument("--kubeconfig", type=str, + default=os.path.expanduser("~/.kube/config"), + help="Path to kubeconfig file") + parser_execute.add_argument("--provider", type=str, required=True, + help="Cloud provider (aks, eks, gke)") + + # Collect subcommand + parser_collect = subparsers.add_parser("collect", help="Collect test results") + parser_collect.add_argument("--cl2_report_dir", type=str, required=True, + help="Path to CL2 report directory") + parser_collect.add_argument("--cloud_info", type=str, required=True, + help="Cloud information JSON") + parser_collect.add_argument("--run_id", type=str, required=True, + help="Pipeline run ID") + parser_collect.add_argument("--run_url", type=str, required=True, + help="Pipeline run URL") + parser_collect.add_argument("--result_file", type=str, required=True, + help="Path to output result file") + parser_collect.add_argument("--deployment_count", type=int, default=10, + help="Number of deployments") + parser_collect.add_argument("--replicas", type=int, default=1, + help="Replicas per deployment") + + args = parser.parse_args() + + if args.command == "execute": + execute_clusterloader2( + cl2_image=args.cl2_image, + cl2_config_dir=args.cl2_config_dir, + cl2_report_dir=args.cl2_report_dir, + kubeconfig=args.kubeconfig, + provider=args.provider + ) + elif args.command == "collect": + collect_clusterloader2( + cl2_report_dir=args.cl2_report_dir, + cloud_info=args.cloud_info, + run_id=args.run_id, + run_url=args.run_url, + result_file=args.result_file, + deployment_count=args.deployment_count, + 
replicas=args.replicas + ) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/modules/python/clusterloader2/image_pull/run_test.py b/modules/python/clusterloader2/image_pull/run_test.py deleted file mode 100644 index 139ac57ba1..0000000000 --- a/modules/python/clusterloader2/image_pull/run_test.py +++ /dev/null @@ -1,125 +0,0 @@ -"""Run ClusterLoader2 image-pull test.""" - -import os -import sys -import shutil -import logging -from pathlib import Path - - -def _copy_files(src_files: list, src_dir: Path, dst_dir: Path) -> None: - """Copy multiple files from src to dst directory.""" - dst_dir.mkdir(parents=True, exist_ok=True) - for f in src_files: - src = src_dir / f if isinstance(f, str) else f - if src.exists(): - shutil.copy(src, dst_dir / src.name) - print(f" - {src.name}") - - -def setup_config_files(scenario_dir: Path, cl2_config_dir: Path, root_dir: Path) -> None: - """Copy configuration files for the test.""" - print(f"Setting up config in {cl2_config_dir}...") - - # Copy scenario files - _copy_files(['image-pull.yaml', 'deployment.yaml', 'containerd-measurements.yaml'], - scenario_dir, cl2_config_dir) - - # Copy kubelet measurements from modules - kubelet_src = root_dir / 'modules/python/clusterloader2/cri/config/kubelet-measurement.yaml' - _copy_files([kubelet_src], root_dir, cl2_config_dir) - - -def run_cl2_test( - kubeconfig: str, - root_dir: str, - scenario_name: str = 'image-pull-test', - cl2_image: str = 'ghcr.io/azure/clusterloader2:v20250311', - prometheus_memory: str = '2Gi', - storage_provisioner: str = 'kubernetes.io/azure-disk', - storage_volume_type: str = 'StandardSSD_LRS' -) -> bool: - """Run ClusterLoader2 image-pull test.""" - try: - from clusterloader2.utils import run_cl2_command - except ImportError: - print("Error: Could not import clusterloader2.utils") - return False - - # Setup paths - root_path = Path(root_dir) - scenario_dir = root_path / 'scenarios/perf-eval' / scenario_name - cl2_config_dir = scenario_dir / 'cl2-config' - results_dir = scenario_dir / 'results' - results_dir.mkdir(parents=True, exist_ok=True) - - # Configure logging - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[logging.FileHandler(results_dir / 'cl2.log', mode='w'), logging.StreamHandler()] - ) - - try: - setup_config_files(scenario_dir, cl2_config_dir, root_path) - - print(f"\n{'='*60}") - print(f"Starting ClusterLoader2 Test") - print(f"Results: {results_dir}") - print(f"{'='*60}\n") - - run_cl2_command( - kubeconfig=kubeconfig, - cl2_image=cl2_image, - cl2_config_dir=str(cl2_config_dir), - cl2_report_dir=str(results_dir), - provider='aks', - cl2_config_file='image-pull.yaml', - enable_prometheus=True, - scrape_kubelets=True, - scrape_containerd=True, - tear_down_prometheus=False, - extra_flags=f"--prometheus-memory-request={prometheus_memory} " - f"--prometheus-storage-class-provisioner={storage_provisioner} " - f"--prometheus-storage-class-volume-type={storage_volume_type}" - ) - - print(f"\nTest completed - Results in: {results_dir}") - return True - - except Exception as e: - print(f"Error: {e}") - return False - - -def main(): - """CLI entry point.""" - import argparse - - parser = argparse.ArgumentParser(description='Run ClusterLoader2 image-pull test') - parser.add_argument('--kubeconfig', default=os.path.expanduser('~/.kube/config')) - parser.add_argument('--root-dir', default=os.environ.get('ROOT_DIR', os.getcwd())) - parser.add_argument('--scenario', default='image-pull-test') - 
parser.add_argument('--cl2-image', default='ghcr.io/azure/clusterloader2:v20250311') - parser.add_argument('--prometheus-memory', default='2Gi') - parser.add_argument('--storage-provisioner', default='kubernetes.io/azure-disk') - parser.add_argument('--storage-volume-type', default='StandardSSD_LRS') - - args = parser.parse_args() - sys.path.insert(0, os.path.join(args.root_dir, 'modules/python')) - - success = run_cl2_test( - kubeconfig=args.kubeconfig, - root_dir=args.root_dir, - scenario_name=args.scenario, - cl2_image=args.cl2_image, - prometheus_memory=args.prometheus_memory, - storage_provisioner=args.storage_provisioner, - storage_volume_type=args.storage_volume_type - ) - - sys.exit(0 if success else 1) - - -if __name__ == '__main__': - main() diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index df5f582e54..8212b5ae7f 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -25,7 +25,7 @@ def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file="config.yaml", overrides=False, enable_prometheus=False, tear_down_prometheus=True, enable_exec_service=False, scrape_kubelets=False, - scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False, extra_flags=""): + scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False): docker_client = DockerClient() command = f"""--provider={provider} --v=2 @@ -45,9 +45,6 @@ def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provi if overrides: command += " --testoverrides=/root/perf-tests/clusterloader2/config/overrides.yaml" - if extra_flags: - command += f" {extra_flags}" - volumes = { kubeconfig: {'bind': '/root/.kube/config', 'mode': 'rw'}, cl2_config_dir: {'bind': '/root/perf-tests/clusterloader2/config', 'mode': 'rw'}, diff --git a/modules/python/tests/test_image_pull.py b/modules/python/tests/test_image_pull.py new file mode 100644 index 0000000000..a3b0dc2620 --- /dev/null +++ b/modules/python/tests/test_image_pull.py @@ -0,0 +1,198 @@ +"""Unit tests for image_pull module.""" + +import json +import os +import tempfile +import unittest +from unittest.mock import patch + +from clusterloader2.image_pull.image_pull import ( + execute_clusterloader2, + collect_clusterloader2, + main +) + + +class TestImagePullFunctions(unittest.TestCase): + """Test cases for image_pull execute and collect functions.""" + + @patch('clusterloader2.image_pull.image_pull.run_cl2_command') + def test_execute_clusterloader2(self, mock_run_cl2): + """Test execute_clusterloader2 calls run_cl2_command with correct params.""" + execute_clusterloader2( + cl2_image="ghcr.io/azure/clusterloader2:v20250311", + cl2_config_dir="/tmp/config", + cl2_report_dir="/tmp/report", + kubeconfig="/tmp/kubeconfig", + provider="aks" + ) + + mock_run_cl2.assert_called_once_with( + kubeconfig="/tmp/kubeconfig", + cl2_image="ghcr.io/azure/clusterloader2:v20250311", + cl2_config_dir="/tmp/config", + cl2_report_dir="/tmp/report", + provider="aks", + cl2_config_file="image-pull.yaml", + enable_prometheus=True, + scrape_kubelets=True, + scrape_containerd=True, + tear_down_prometheus=False + ) + + @patch('clusterloader2.image_pull.image_pull.get_measurement') + @patch('clusterloader2.image_pull.image_pull.parse_xml_to_json') + def test_collect_clusterloader2_success(self, mock_parse_xml, mock_get_measurement): + """Test collect_clusterloader2 with successful test results.""" + # Mock junit.xml parsing - success case + 
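+        # parse_xml_to_json is stubbed with the shape the collector expects:
+        # a JSON string holding a "testsuites" list whose first entry carries
+        # the failures/tests counts used to derive the run status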
mock_parse_xml.return_value = json.dumps({ + "testsuites": [{"failures": 0, "tests": 1}] + }) + + with tempfile.TemporaryDirectory() as tmpdir: + report_dir = os.path.join(tmpdir, "report") + os.makedirs(report_dir) + + # Create a mock measurement file + measurement_file = os.path.join(report_dir, "ImagePullLatency_test.json") + with open(measurement_file, 'w', encoding='utf-8') as f: + json.dump({ + "dataItems": [ + {"labels": {"node": "node1"}, "data": {"P50": 1.5, "P99": 3.0}} + ] + }, f) + + # Create junit.xml (required by parse_xml_to_json) + junit_file = os.path.join(report_dir, "junit.xml") + with open(junit_file, 'w', encoding='utf-8') as f: + f.write("") + + mock_get_measurement.return_value = ("ImagePullLatency", "test") + + result_file = os.path.join(tmpdir, "results", "output.json") + + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info='{"cloud": "azure", "region": "eastus2"}', + run_id="12345", + run_url="https://dev.azure.com/run/12345", + result_file=result_file, + deployment_count=10, + replicas=1 + ) + + # Verify result file was created + self.assertTrue(os.path.exists(result_file)) + + # Verify content + with open(result_file, 'r', encoding='utf-8') as f: + content = f.read() + self.assertIn("ImagePullLatency", content) + self.assertIn("success", content) + self.assertIn("12345", content) + + @patch('clusterloader2.image_pull.image_pull.parse_xml_to_json') + def test_collect_clusterloader2_failure(self, mock_parse_xml): + """Test collect_clusterloader2 with failed test results.""" + # Mock junit.xml parsing - failure case + mock_parse_xml.return_value = json.dumps({ + "testsuites": [{"failures": 1, "tests": 1}] + }) + + with tempfile.TemporaryDirectory() as tmpdir: + report_dir = os.path.join(tmpdir, "report") + os.makedirs(report_dir) + + # Create junit.xml + junit_file = os.path.join(report_dir, "junit.xml") + with open(junit_file, 'w', encoding='utf-8') as f: + f.write("") + + result_file = os.path.join(tmpdir, "results", "output.json") + + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info='{"cloud": "azure"}', + run_id="12345", + run_url="https://dev.azure.com/run/12345", + result_file=result_file + ) + + # Result file should exist even for failures + self.assertTrue(os.path.exists(result_file)) + + @patch('clusterloader2.image_pull.image_pull.parse_xml_to_json') + def test_collect_clusterloader2_no_testsuites(self, mock_parse_xml): + """Test collect_clusterloader2 raises error when no testsuites found.""" + # Mock junit.xml with empty testsuites + mock_parse_xml.return_value = json.dumps({"testsuites": []}) + + with tempfile.TemporaryDirectory() as tmpdir: + report_dir = os.path.join(tmpdir, "report") + os.makedirs(report_dir) + + junit_file = os.path.join(report_dir, "junit.xml") + with open(junit_file, 'w', encoding='utf-8') as f: + f.write("") + + result_file = os.path.join(tmpdir, "results", "output.json") + + with self.assertRaises(ValueError) as context: + collect_clusterloader2( + cl2_report_dir=report_dir, + cloud_info='{"cloud": "azure"}', + run_id="12345", + run_url="https://dev.azure.com/run/12345", + result_file=result_file + ) + + self.assertIn("No testsuites found", str(context.exception)) + + +class TestImagePullMain(unittest.TestCase): + """Test cases for CLI main function.""" + + @patch('clusterloader2.image_pull.image_pull.execute_clusterloader2') + def test_main_execute_command(self, mock_execute): + """Test main function with execute subcommand.""" + test_args = [ + 'image_pull.py', 'execute', + '--cl2_image', 
'ghcr.io/azure/clusterloader2:v20250311', + '--cl2_config_dir', '/tmp/config', + '--cl2_report_dir', '/tmp/report', + '--provider', 'aks' + ] + + with patch('sys.argv', test_args): + main() + + mock_execute.assert_called_once() + call_kwargs = mock_execute.call_args[1] + self.assertEqual(call_kwargs['cl2_image'], 'ghcr.io/azure/clusterloader2:v20250311') + self.assertEqual(call_kwargs['provider'], 'aks') + + @patch('clusterloader2.image_pull.image_pull.collect_clusterloader2') + def test_main_collect_command(self, mock_collect): + """Test main function with collect subcommand.""" + test_args = [ + 'image_pull.py', 'collect', + '--cl2_report_dir', '/tmp/report', + '--cloud_info', '{"cloud": "azure"}', + '--run_id', '12345', + '--run_url', 'https://dev.azure.com/run/12345', + '--result_file', '/tmp/result.json', + '--deployment_count', '10', + '--replicas', '1' + ] + + with patch('sys.argv', test_args): + main() + + mock_collect.assert_called_once() + call_kwargs = mock_collect.call_args[1] + self.assertEqual(call_kwargs['deployment_count'], 10) + self.assertEqual(call_kwargs['replicas'], 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..4eeec098f2 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,28 @@ trigger: none variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: image-pull-test stages: - - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) + - stage: azure_eastus2_image_pull dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250311" + topology: image-pull + matrix: + image-pull-10pods: + deployment_count: 10 + replicas: 1 + kubernetes_version: "1.31" + max_parallel: 1 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + timeout_in_minutes: 60 diff --git a/scenarios/perf-eval/image-pull-test/README.md b/scenarios/perf-eval/image-pull-test/README.md index 1edae1a049..a9bb52d167 100644 --- a/scenarios/perf-eval/image-pull-test/README.md +++ b/scenarios/perf-eval/image-pull-test/README.md @@ -6,28 +6,11 @@ Measures container image pull performance on AKS clusters using ClusterLoader2. 
## Test Scenario -Creates 10 Deployments with 1 replica each (10 pods total), pulling a large container image to measure: +Creates 10 Deployments with 1 replica each (10 pods total), pulling a container image to measure: - How fast images are pulled across cluster nodes -- Pod startup latency when pulling large images +- Pod startup latency when pulling images - Containerd throughput during parallel image pulls -### Default Configuration - -| Parameter | Value | -|-----------|-------| -| Deployments | 10 | -| Replicas per deployment | 1 | -| Total pods | 10 | -| QPS (deployment creation rate) | 10 | -| Pod startup timeout | 3 minutes | -| Metrics collection wait | 10 minutes | -| Test image | pytorch-large:2.0.0 (~15GB) | - -To modify, edit `image-pull.yaml`: -- `replicasPerNamespace`: Number of deployments -- `Replicas`: Pods per deployment -- `qps`: Deployment creation rate - ## Metrics Collected | Metric | Source | Description | @@ -37,66 +20,31 @@ To modify, edit `image-pull.yaml`: | Network Plugin Operations | containerd:10257 | Pod network setup/teardown time | | Pod Startup Latency | API server | End-to-end pod scheduling time | -## Prerequisites - -- AKS cluster with containerd runtime -- Azure Container Registry with test image -- kubectl, terraform, az CLI, docker - ## Configuration -### 1. Set your container image - -Edit `image-pull.yaml` line 37: -```yaml -Image: .azurecr.io/: -``` - -### 2. Set your ACR (in notebook) +### Test Image -Edit `run_locally.ipynb` cell 9: -```bash -export ACR_NAME= -export ACR_SUBSCRIPTION_ID= # if different from AKS subscription -``` +The test uses `akscritelescope.azurecr.io/e2e-test-images/resource-consumer:1.13` by default. -### 3. Attach ACR to AKS +To change the image, edit `modules/python/clusterloader2/image_pull/config/image-pull.yaml`. -The notebook handles this automatically, or run manually: -```bash -az aks update -g -n --attach-acr -``` +### Cluster Settings -## Usage +Edit `scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars` for cluster configuration. 
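+
+For one-off runs, the image can also be supplied through a CL2 overrides file instead of editing the config, since the template reads it via `DefaultParam`. A minimal sketch, assuming the `CL2_IMAGE` parameter name declared in this module's `image-pull.yaml` and a hypothetical registry:
+
+```yaml
+# overrides.yaml — hypothetical local override; CL2_IMAGE feeds the
+# DefaultParam consumed by image-pull.yaml
+CL2_IMAGE: myregistry.azurecr.io/e2e-test-images/resource-consumer:1.13
+```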
-### Run via Notebook -```bash -# Open and run cells sequentially -jupyter notebook run_locally.ipynb -``` +## Pipeline -### Run via CLI -```bash -export ROOT_DIR=$(git rev-parse --show-toplevel) -./run_cl2.sh # Run test -./analyze_results.sh # Analyze results -``` +The test runs via Azure DevOps pipeline: +- **Pipeline**: `pipelines/perf-eval/CRI Benchmark/image-pull.yml` +- **Engine**: `steps/engine/clusterloader2/image_pull/` +- **Topology**: `steps/topology/image-pull/` ## Files -| File | Purpose | +| Path | Purpose | |------|---------| -| `image-pull.yaml` | CL2 test config - defines workload and measurements | -| `deployment.yaml` | Pod template for image pull test | -| `containerd-measurements.yaml` | Prometheus queries for containerd metrics | -| `run_cl2.sh` | Shell wrapper to run test | -| `analyze_results.sh` | Shell wrapper to analyze results | -| `run_locally.ipynb` | Interactive notebook for local testing | -| `terraform-inputs/azure.tfvars` | AKS cluster configuration | - -## Output - -Results are written to `results/` directory: -- `junit.xml` - Test pass/fail status -- `PodStartupLatency_*.json` - Pod startup metrics -- `GenericPrometheusQuery_*.json` - Prometheus metric snapshots +| `modules/python/clusterloader2/image_pull/` | Python module and CL2 config | +| `steps/engine/clusterloader2/image_pull/` | Pipeline engine steps | +| `steps/topology/image-pull/` | Pipeline topology steps | +| `pipelines/perf-eval/CRI Benchmark/image-pull.yml` | Pipeline definition | +| `scenarios/perf-eval/image-pull-test/terraform-inputs/` | Terraform configuration | diff --git a/scenarios/perf-eval/image-pull-test/analyze_results.sh b/scenarios/perf-eval/image-pull-test/analyze_results.sh deleted file mode 100755 index 85bc7f5904..0000000000 --- a/scenarios/perf-eval/image-pull-test/analyze_results.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# -# Analyze ClusterLoader2 results -# - -set -e - -if [ -z "$ROOT_DIR" ]; then - echo "Error: ROOT_DIR is not set. Please run the setup cell first." - exit 1 -fi - -export PYTHONPATH="${ROOT_DIR}/modules/python:${PYTHONPATH}" -RESULTS_DIR="${ROOT_DIR}/scenarios/perf-eval/image-pull-test/results" - -python3 -m clusterloader2.image_pull.analyze_results "$RESULTS_DIR" "$@" - -exit $? 
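For reference, each line the `collect` subcommand writes is a self-contained JSON record built from the template in `image_pull.py`. A minimal sketch of consuming such a file (the path is a placeholder; the pipeline passes the real one via `--result_file`):

```python
import json

# Read the newline-delimited JSON emitted by collect_clusterloader2.
# "output.json" is a hypothetical path for illustration.
with open("output.json", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # Fields set by the collector: measurement, group,
        # percentile (always "dataItems"), data (one CL2 dataItem),
        # plus run metadata (run_id, run_url, cloud_info, status).
        print(record["measurement"], record["group"], record["data"])
```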
diff --git a/scenarios/perf-eval/image-pull-test/containerd-measurements.yaml b/scenarios/perf-eval/image-pull-test/containerd-measurements.yaml deleted file mode 100644 index 0d559a7bfb..0000000000 --- a/scenarios/perf-eval/image-pull-test/containerd-measurements.yaml +++ /dev/null @@ -1,78 +0,0 @@ -{{$action := .action}} # start, gather - -steps: - - name: {{$action}} Containerd Measurements - measurements: - # ContainerdCriImagePullingThroughput - WORKS (has histogram buckets) - - identifier: ContainerdCriImagePullingThroughput - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriImagePullingThroughput - metricVersion: v1 - unit: MB/s - queries: - - name: Perc100 - query: histogram_quantile(1, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Sum - query: sum(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) - - name: Average - query: sum(rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])) - - # ContainerdCriNetworkPluginOperations - Sum/Count only (histograms may not work) - - identifier: ContainerdCriNetworkPluginOperations - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriNetworkPluginOperations - metricVersion: v1 - unit: s - dimensions: - - operation_type - queries: - - name: Sum - query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}) by (operation_type) - - name: Count - query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}) by (operation_type) - - name: Average - query: sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}[%v])) by (operation_type) / sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}[%v])) by (operation_type) - - # ContainerdCriSandboxCreateNetwork - Sum/Count only - - identifier: ContainerdCriSandboxCreateNetwork - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriSandboxCreateNetwork - metricVersion: v1 - unit: s - queries: - - name: Sum - query: sum(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}) - - name: Average - query: sum(rate(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}[%v])) - - # ContainerdCriSandboxDeleteNetwork - Sum/Count only - - identifier: ContainerdCriSandboxDeleteNetwork - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriSandboxDeleteNetwork - metricVersion: v1 - unit: s - queries: - - name: Sum - query: 
sum(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}) - - name: Average - query: sum(rate(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}[%v])) diff --git a/scenarios/perf-eval/image-pull-test/run_cl2.sh b/scenarios/perf-eval/image-pull-test/run_cl2.sh deleted file mode 100755 index 8d0296282b..0000000000 --- a/scenarios/perf-eval/image-pull-test/run_cl2.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# -# Run ClusterLoader2 image-pull test -# - -set -e - -if [ -z "$ROOT_DIR" ]; then - echo "Error: ROOT_DIR is not set. Please run the setup cell first." - exit 1 -fi - -# Install docker for root (required for sudo access to docker socket) -echo "Ensuring 'docker' library is installed for root..." -sudo python3 -m pip install docker >/dev/null 2>&1 - -export KUBECONFIG_PATH=${KUBECONFIG:-$HOME/.kube/config} -export PYTHONPATH="$ROOT_DIR/modules/python:$PYTHONPATH" - -sudo -E PYTHONPATH="$PYTHONPATH" python3 -m clusterloader2.image_pull.run_test \ - --kubeconfig "$KUBECONFIG_PATH" \ - --root-dir "$ROOT_DIR" \ - --scenario "image-pull-test" \ - --cl2-image "ghcr.io/azure/clusterloader2:v20250311" \ - --prometheus-memory "2Gi" \ - --storage-provisioner "kubernetes.io/azure-disk" \ - --storage-volume-type "StandardSSD_LRS" - -exit $? diff --git a/scenarios/perf-eval/image-pull-test/run_locally.ipynb b/scenarios/perf-eval/image-pull-test/run_locally.ipynb deleted file mode 100644 index 3777cd34bd..0000000000 --- a/scenarios/perf-eval/image-pull-test/run_locally.ipynb +++ /dev/null @@ -1,411 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "c5dfa25a", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Check prerequisites\n", - "echo \"Checking prerequisites...\"\n", - "echo \"===========================================\"\n", - "\n", - "command -v terraform >/dev/null && terraform version || echo 'terraform missing'\n", - "command -v az >/dev/null && az version --output table || echo 'azure cli missing'\n", - "command -v jq >/dev/null && jq --version || echo 'jq missing'\n", - "command -v kubectl >/dev/null && kubectl version --client || echo 'kubectl missing'\n", - "\n", - "# Set ROOT_DIR to the repository root\n", - "if git rev-parse --show-toplevel >/dev/null 2>&1; then\n", - " export ROOT_DIR=$(git rev-parse --show-toplevel)\n", - "else\n", - " # Fallback: go up 3 levels from scenarios/perf-eval/image-pull-test\n", - " export ROOT_DIR=$(cd ../../.. && pwd)\n", - "fi\n", - "\n", - "echo \"Repository Root: $ROOT_DIR\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c6fa2de", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Install Python dependencies\n", - "if [ -z \"$ROOT_DIR\" ]; then\n", - " echo \"Error: ROOT_DIR is not set. 
Please run the first cell to initialize variables.\"\n", - " exit 1\n", - "fi\n", - "\n", - "echo \"Installing python requirements...\"\n", - "python3 -m pip install --user -r $ROOT_DIR/modules/python/requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50c0917f", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Define test scenario variables\n", - "export SCENARIO_TYPE=perf-eval\n", - "export SCENARIO_NAME=image-pull-test\n", - "export OWNER=$(whoami)\n", - "export RUN_ID=${RUN_ID:-$(date +%s)}\n", - "export CLOUD=azure\n", - "export REGION=eastus2\n", - "export AZURE_SUBSCRIPTION_ID=\"c0d4b923-b5ea-4f8f-9b56-5390a9bf2248\"\n", - "export SKU_TIER=Standard\n", - "export KUBERNETES_VERSION=1.31\n", - "export NETWORK_POLICY=\"\"\n", - "export NETWORK_DATAPLANE=azure\n", - "export TERRAFORM_MODULES_DIR=$ROOT_DIR/modules/terraform/$CLOUD\n", - "export TERRAFORM_INPUT_FILE=$ROOT_DIR/scenarios/$SCENARIO_TYPE/$SCENARIO_NAME/terraform-inputs/${CLOUD}.tfvars\n", - "export SYSTEM_NODE_POOL=null\n", - "export USER_NODE_POOL=null\n", - "\n", - "echo \"Scenario: $SCENARIO_TYPE/$SCENARIO_NAME\"\n", - "echo \"Run ID: $RUN_ID\"\n", - "echo \"Terraform Input: $TERRAFORM_INPUT_FILE\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae75239f", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Azure login\n", - "echo \"Azure Authentication\"\n", - "if az account show >/dev/null 2>&1; then\n", - " echo \"Already logged in\"\n", - " az account set -s $AZURE_SUBSCRIPTION_ID\n", - "else\n", - " echo \"Logging into Azure...\"\n", - " az login --use-device-code\n", - " az account set -s $AZURE_SUBSCRIPTION_ID\n", - "fi\n", - "export ARM_SUBSCRIPTION_ID=$(az account show --query id -o tsv)\n", - "export ARM_TENANT_ID=$(az account show --query tenantId -o tsv)\n", - "az account show --query '{Name:name, Id:id}' --output table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f69d63b", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Create resource group\n", - "echo \"Creating resource group $RUN_ID in $REGION\"\n", - "az group create --name $RUN_ID --location $REGION --tags run_id=$RUN_ID scenario=${SCENARIO_TYPE}-${SCENARIO_NAME} owner=$OWNER" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ce7afe4", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Prepare Terraform input JSON\n", - "echo \"Preparing Terraform input JSON\"\n", - "\n", - "# Ensure variables are set to defaults if empty to avoid jq errors\n", - ": ${NETWORK_POLICY:=\"\"}\n", - ": ${NETWORK_DATAPLANE:=\"\"}\n", - ": ${SYSTEM_NODE_POOL:=\"null\"}\n", - ": ${USER_NODE_POOL:=\"null\"}\n", - "\n", - "export INPUT_JSON=$(jq -n \\\n", - " --arg run_id \"$RUN_ID\" \\\n", - " --arg region \"$REGION\" \\\n", - " --arg aks_sku_tier \"$SKU_TIER\" \\\n", - " --arg aks_kubernetes_version \"$KUBERNETES_VERSION\" \\\n", - " --arg aks_network_policy \"$NETWORK_POLICY\" \\\n", - " --arg aks_network_dataplane \"$NETWORK_DATAPLANE\" \\\n", - " --arg k8s_machine_type \"Standard_D4s_v3\" \\\n", - " --arg k8s_os_disk_type \"Managed\" \\\n", - " --argjson aks_cli_system_node_pool \"$SYSTEM_NODE_POOL\" \\\n", - " --argjson 
aks_cli_user_node_pool \"$USER_NODE_POOL\" \\\n", - " '{run_id:$run_id,region:$region,aks_sku_tier:$aks_sku_tier,aks_kubernetes_version:$aks_kubernetes_version,aks_network_policy:$aks_network_policy,aks_network_dataplane:$aks_network_dataplane,k8s_machine_type:$k8s_machine_type,k8s_os_disk_type:$k8s_os_disk_type,aks_cli_system_node_pool:$aks_cli_system_node_pool,aks_cli_user_node_pool:$aks_cli_user_node_pool}' | jq 'with_entries(select(.value != null and .value != \"\"))')\n", - "echo $INPUT_JSON | jq ." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a78b6c4b", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Terraform init & plan\n", - "pushd $TERRAFORM_MODULES_DIR\n", - "terraform init\n", - "terraform plan -var json_input=\"$(echo $INPUT_JSON | jq -c .)\" -var-file $TERRAFORM_INPUT_FILE\n", - "popd\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75097d61", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Terraform apply\n", - "pushd $TERRAFORM_MODULES_DIR\n", - "terraform apply -var json_input=\"$(echo $INPUT_JSON | jq -c .)\" -var-file $TERRAFORM_INPUT_FILE --auto-approve\n", - "popd\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79bf5007", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Attach ACR permissions so AKS can pull private images\n", - "# TODO: Set your ACR name and subscription ID\n", - "export ACR_NAME=${ACR_NAME:-}\n", - "export ACR_SUBSCRIPTION_ID=${ACR_SUBSCRIPTION_ID:-}\n", - "\n", - "# Automatically find the cluster name in the resource group\n", - "export CLUSTER_NAME=$(az aks list --resource-group $RUN_ID --query \"[0].name\" -o tsv)\n", - "\n", - "if [ -z \"$CLUSTER_NAME\" ]; then\n", - " echo \"Error: No AKS cluster found in resource group $RUN_ID\"\n", - " exit 1\n", - "fi\n", - "\n", - "# Resolve ACR Resource ID (Required for cross-subscription attach)\n", - "if [ -n \"$ACR_SUBSCRIPTION_ID\" ]; then\n", - " echo \"Looking up ACR '$ACR_NAME' in subscription '$ACR_SUBSCRIPTION_ID'...\"\n", - " ACR_ID=$(az acr show --name $ACR_NAME --subscription $ACR_SUBSCRIPTION_ID --query id -o tsv)\n", - "else\n", - " echo \"Looking up ACR '$ACR_NAME' in current subscription...\"\n", - " ACR_ID=$(az acr show --name $ACR_NAME --query id -o tsv 2>/dev/null)\n", - "fi\n", - "\n", - "if [ -z \"$ACR_ID\" ]; then\n", - " echo \"Warning: Could not find ACR ID. 
Attempting to attach by name...\"\n", - " ACR_ID=$ACR_NAME\n", - "else\n", - " echo \"Found ACR ID: $ACR_ID\"\n", - "fi\n", - "\n", - "echo \"Attaching registry to cluster $CLUSTER_NAME...\"\n", - "az aks update --resource-group $RUN_ID --name $CLUSTER_NAME --attach-acr $ACR_ID" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2735b27", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Get kubeconfig\n", - "if [ -z \"$CLUSTER_NAME\" ]; then\n", - " export CLUSTER_NAME=$(az aks list --resource-group $RUN_ID --query \"[0].name\" -o tsv)\n", - "fi\n", - "\n", - "echo \"Getting credentials for $CLUSTER_NAME...\"\n", - "az aks get-credentials --resource-group $RUN_ID --name $CLUSTER_NAME --overwrite-existing\n", - "kubectl get nodes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96222b71", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Clean up previous Prometheus resources\n", - "echo \"Removing stale monitoring.coreos.com resources\"\n", - "for crd in alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com prometheuses.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com probes.monitoring.coreos.com; do\n", - " kubectl delete crd $crd --ignore-not-found\n", - "done\n", - "for cr in prometheus-operator prometheus-operator-psp prometheus-operator-cm; do\n", - " kubectl delete clusterrole $cr --ignore-not-found\n", - " kubectl delete clusterrolebinding $cr --ignore-not-found\n", - "done" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82504470", - "metadata": { - "language": "python", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Run ClusterLoader2 image-pull scenario\n", - "$ROOT_DIR/scenarios/perf-eval/image-pull-test/run_cl2.sh\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d36979ac", - "metadata": { - "language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Show result files\n", - "ls -lah $ROOT_DIR/scenarios/perf-eval/image-pull-test/results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8b67a1f", - "metadata": { - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Debug: Check if Prometheus has the raw containerd metrics\n", - "echo \"Checking Prometheus for containerd histogram metrics...\"\n", - "\n", - "# Port-forward to Prometheus\n", - "kubectl port-forward -n monitoring svc/prometheus-operated 9090:9090 &\n", - "PF_PID=$!\n", - "sleep 3\n", - "\n", - "echo \"\"\n", - "echo \"=== containerd_cri_image_pull_duration_seconds_bucket ===\"\n", - "curl -s 'http://localhost:9090/api/v1/query?query=containerd_cri_image_pull_duration_seconds_bucket' | jq -r '.data.result | length' | xargs -I {} echo \"Found {} time series\"\n", - "\n", - "echo \"\"\n", - "echo \"=== kubelet_runtime_operations_duration_seconds_bucket ===\"\n", - "curl -s 'http://localhost:9090/api/v1/query?query=kubelet_runtime_operations_duration_seconds_bucket{operation_type=\"pull_image\"}' | jq -r '.data.result | length' | xargs -I {} echo \"Found {} time series\"\n", - "\n", - "# Kill port-forward\n", - "kill $PF_PID 2>/dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d986df0a", - "metadata": { - 
"language": "shellscript", - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Cleanup resources\n", - "pushd $TERRAFORM_MODULES_DIR\n", - "terraform destroy -var json_input=\"$(echo $INPUT_JSON | jq -c .)\" -var-file $TERRAFORM_INPUT_FILE --auto-approve\n", - "popd\n", - "az group delete --name $RUN_ID -y\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b31c9c8", - "metadata": { - "vscode": { - "languageId": "shellscript" - } - }, - "outputs": [], - "source": [ - "# Analyze Results\n", - "$ROOT_DIR/scenarios/perf-eval/image-pull-test/analyze_results.sh\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/steps/engine/clusterloader2/image_pull/collect.yml b/steps/engine/clusterloader2/image_pull/collect.yml new file mode 100644 index 0000000000..980ca997c3 --- /dev/null +++ b/steps/engine/clusterloader2/image_pull/collect.yml @@ -0,0 +1,32 @@ +parameters: + - name: cloud + type: string + default: "" + - name: engine_input + type: object + default: {} + - name: region + type: string + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml + parameters: + region: ${{ parameters.region }} + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ + --cl2_report_dir $CL2_REPORT_DIR \ + --cloud_info "$CLOUD_INFO" \ + --run_id $RUN_ID \ + --run_url $RUN_URL \ + --result_file $TEST_RESULTS_FILE \ + --deployment_count ${DEPLOYMENT_COUNT:-10} \ + --replicas ${REPLICAS:-1} + workingDirectory: modules/python + env: + CLOUD: ${{ parameters.cloud }} + RUN_URL: $(RUN_URL) + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/image_pull.py + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/results + displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/image_pull/execute.yml b/steps/engine/clusterloader2/image_pull/execute.yml new file mode 100644 index 0000000000..ded0fe39e8 --- /dev/null +++ b/steps/engine/clusterloader2/image_pull/execute.yml @@ -0,0 +1,34 @@ +parameters: + - name: cloud + type: string + default: "" + - name: engine_input + type: object + default: {} + - name: region + type: string + +steps: + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ + --cl2_image ${CL2_IMAGE} \ + --cl2_config_dir ${CL2_CONFIG_DIR} \ + --cl2_report_dir $CL2_REPORT_DIR \ + --kubeconfig ${HOME}/.kube/config \ + --provider $CLOUD + workingDirectory: modules/python + env: + ${{ if eq(parameters.cloud, 'azure') }}: + CLOUD: aks + ${{ elseif eq(parameters.cloud, 'aws') }}: + CLOUD: eks + ${{ else }}: + CLOUD: ${{ parameters.cloud }} + REGION: ${{ parameters.region }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/image_pull.py + CL2_IMAGE: ${{ parameters.engine_input.image }} + CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/config + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/results + displayName: "Run Image Pull Benchmark" diff --git a/steps/topology/image-pull/collect-clusterloader2.yml b/steps/topology/image-pull/collect-clusterloader2.yml new file mode 100644 index 
0000000000..4e1fb387c6 --- /dev/null +++ b/steps/topology/image-pull/collect-clusterloader2.yml @@ -0,0 +1,17 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/image_pull/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/image-pull/execute-clusterloader2.yml b/steps/topology/image-pull/execute-clusterloader2.yml new file mode 100644 index 0000000000..dfa809aff1 --- /dev/null +++ b/steps/topology/image-pull/execute-clusterloader2.yml @@ -0,0 +1,17 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/image_pull/execute.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/image-pull/validate-resources.yml b/steps/topology/image-pull/validate-resources.yml new file mode 100644 index 0000000000..35b420318a --- /dev/null +++ b/steps/topology/image-pull/validate-resources.yml @@ -0,0 +1,19 @@ +parameters: +- name: cloud + type: string +- name: engine + type: string +- name: regions + type: object + +steps: +- template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: client + region: ${{ parameters.regions[0] }} +- script: | + set -eo pipefail + echo "Validating cluster resources..." + kubectl get nodes -o wide + echo "Cluster is ready for image-pull test" + displayName: "Validate Resources" From 1ae1992137636851075a4e3f5decdcd077de692a Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Wed, 10 Dec 2025 03:23:50 +0000 Subject: [PATCH 03/30] Fix terraform test input - use minimal JSON --- .../image-pull-test/terraform-test-inputs/azure.json | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json b/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json index 9d4a536039..e5f2ac736c 100644 --- a/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json +++ b/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json @@ -1,12 +1,4 @@ { "run_id": "test-run", - "region": "eastus2", - "aks_sku_tier": "Standard", - "aks_kubernetes_version": "1.29", - "aks_network_policy": "none", - "aks_network_dataplane": "azure", - "k8s_machine_type": "Standard_D4s_v3", - "k8s_os_disk_type": "Managed", - "aks_cli_system_node_pool": null, - "aks_cli_user_node_pool": null + "region": "eastus2" } From f6c56086a112a8c3e576439cd3c8f6d81dbbe8ca Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Wed, 10 Dec 2025 04:05:46 +0000 Subject: [PATCH 04/30] Fix K8s version to 1.33 and revert gitignore --- .gitignore | 2 -- pipelines/system/new-pipeline-test.yml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index bd3407232e..4ad23c88c4 100644 --- a/.gitignore +++ b/.gitignore @@ -76,8 +76,6 @@ env/ venv/ modules/python/clusterloader2/*/results modules/python/clusterloader2/*/config/overrides.yaml -scenarios/**/results -scenarios/**/cl2-config #Jupyter *-checkpoint.ipynb diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 4eeec098f2..403e2cbc69 100644 --- 
a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -21,7 +21,7 @@ stages: image-pull-10pods: deployment_count: 10 replicas: 1 - kubernetes_version: "1.31" + kubernetes_version: "1.33" max_parallel: 1 credential_type: service_connection ssh_key_enabled: false From f393815e6406ba88adc4fcfecc08f43e119cf3a7 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Wed, 10 Dec 2025 04:50:34 +0000 Subject: [PATCH 05/30] Fix Prometheus setup for image-pull test --- .../clusterloader2/image_pull/image_pull.py | 17 +++++++ modules/python/tests/test_image_pull.py | 21 +++++++- .../terraform-inputs/azure.tfvars | 51 ++++++++++++++++--- 3 files changed, 81 insertions(+), 8 deletions(-) diff --git a/modules/python/clusterloader2/image_pull/image_pull.py b/modules/python/clusterloader2/image_pull/image_pull.py index 76e51dc944..65e1ed7aa9 100644 --- a/modules/python/clusterloader2/image_pull/image_pull.py +++ b/modules/python/clusterloader2/image_pull/image_pull.py @@ -12,6 +12,19 @@ logger = get_logger(__name__) +def write_overrides(cl2_config_dir: str, provider: str): + """Write CL2 override file with Prometheus configuration.""" + override_file = os.path.join(cl2_config_dir, "overrides.yaml") + with open(override_file, "w", encoding="utf-8") as file: + file.write(f"CL2_PROVIDER: {provider}\n") + file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") + file.write("CL2_PROMETHEUS_CPU_SCALE_FACTOR: 30.0\n") + file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 30.0\n") + file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n") + file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n") + logger.info(f"Wrote overrides file: {override_file}") + + def execute_clusterloader2( cl2_image: str, cl2_config_dir: str, @@ -23,6 +36,9 @@ def execute_clusterloader2( logger.info(f"Starting image-pull test with CL2 image: {cl2_image}") logger.info(f"Config dir: {cl2_config_dir}, Report dir: {cl2_report_dir}") + # Write overrides file with Prometheus configuration + write_overrides(cl2_config_dir, provider) + run_cl2_command( kubeconfig=kubeconfig, cl2_image=cl2_image, @@ -30,6 +46,7 @@ def execute_clusterloader2( cl2_report_dir=cl2_report_dir, provider=provider, cl2_config_file="image-pull.yaml", + overrides=True, enable_prometheus=True, scrape_kubelets=True, scrape_containerd=True, diff --git a/modules/python/tests/test_image_pull.py b/modules/python/tests/test_image_pull.py index a3b0dc2620..28e0225ee0 100644 --- a/modules/python/tests/test_image_pull.py +++ b/modules/python/tests/test_image_pull.py @@ -9,6 +9,7 @@ from clusterloader2.image_pull.image_pull import ( execute_clusterloader2, collect_clusterloader2, + write_overrides, main ) @@ -16,8 +17,24 @@ class TestImagePullFunctions(unittest.TestCase): """Test cases for image_pull execute and collect functions.""" + def test_write_overrides(self): + """Test write_overrides creates correct override file.""" + with tempfile.TemporaryDirectory() as tmpdir: + write_overrides(tmpdir, "aks") + + override_file = os.path.join(tmpdir, "overrides.yaml") + self.assertTrue(os.path.exists(override_file)) + + with open(override_file, 'r', encoding='utf-8') as f: + content = f.read() + + self.assertIn("CL2_PROVIDER: aks", content) + self.assertIn("CL2_PROMETHEUS_TOLERATE_MASTER: true", content) + self.assertIn("CL2_PROMETHEUS_NODE_SELECTOR", content) + @patch('clusterloader2.image_pull.image_pull.run_cl2_command') - def test_execute_clusterloader2(self, mock_run_cl2): + @patch('clusterloader2.image_pull.image_pull.write_overrides') + def 
test_execute_clusterloader2(self, mock_write_overrides, mock_run_cl2): """Test execute_clusterloader2 calls run_cl2_command with correct params.""" execute_clusterloader2( cl2_image="ghcr.io/azure/clusterloader2:v20250311", @@ -27,6 +44,7 @@ def test_execute_clusterloader2(self, mock_run_cl2): provider="aks" ) + mock_write_overrides.assert_called_once_with("/tmp/config", "aks") mock_run_cl2.assert_called_once_with( kubeconfig="/tmp/kubeconfig", cl2_image="ghcr.io/azure/clusterloader2:v20250311", @@ -34,6 +52,7 @@ def test_execute_clusterloader2(self, mock_run_cl2): cl2_report_dir="/tmp/report", provider="aks", cl2_config_file="image-pull.yaml", + overrides=True, enable_prometheus=True, scrape_kubelets=True, scrape_containerd=True, diff --git a/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars index 704d049c4c..4f20a2976c 100644 --- a/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars @@ -3,25 +3,62 @@ scenario_name = "image-pull-test" deletion_delay = "2h" owner = "telescope" +network_config_list = [ + { + role = "client" + vnet_name = "imgpull-vnet" + vnet_address_space = "10.0.0.0/9" + subnet = [ + { + name = "imgpull-subnet-1" + address_prefix = "10.0.0.0/16" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + aks_config_list = [ { role = "client" aks_name = "img-pull-10" - dns_prefix = "kperf" - subnet_name = "aks-network" + dns_prefix = "imgpull" + subnet_name = "imgpull-vnet" sku_tier = "Standard" network_profile = { network_plugin = "azure" network_plugin_mode = "overlay" + pod_cidr = "10.0.0.0/9" + service_cidr = "192.168.0.0/16" + dns_service_ip = "192.168.0.10" } default_node_pool = { - name = "userpool0" - node_count = 10 + name = "default" + node_count = 3 vm_size = "Standard_D4s_v3" os_disk_type = "Managed" - only_critical_addons_enabled = false - temporary_name_for_rotation = "temp" + only_critical_addons_enabled = true + temporary_name_for_rotation = "defaulttmp" } - extra_node_pool = [] + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + os_disk_type = "Managed" + node_labels = { "prometheus" = "true" } + }, + { + name = "userpool" + node_count = 10 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v3" + os_disk_type = "Managed" + node_labels = { "image-pull-test" = "true" } + } + ] } ] From 3d7f271e2de75affcd66445dd715dbb164b81192 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Thu, 11 Dec 2025 05:30:55 +0000 Subject: [PATCH 06/30] Rename image-pull scenario --- .../config/containerd-measurements.yaml | 9 +-- .../config/kubelet-measurement.yaml | 73 +------------------ .../clusterloader2/image_pull/image_pull.py | 21 ++++-- modules/python/tests/test_image_pull.py | 12 ++- pipelines/system/new-pipeline-test.yml | 2 +- scenarios/perf-eval/image-pull-test/README.md | 50 ------------- .../terraform-inputs/azure.tfvars | 4 +- .../terraform-test-inputs/azure.json | 0 .../clusterloader2/image_pull/execute.yml | 18 +++++ 9 files changed, 48 insertions(+), 141 deletions(-) delete mode 100644 scenarios/perf-eval/image-pull-test/README.md rename scenarios/perf-eval/{image-pull-test => image-pull-throughput}/terraform-inputs/azure.tfvars (96%) rename scenarios/perf-eval/{image-pull-test => image-pull-throughput}/terraform-test-inputs/azure.json (100%) diff --git 
a/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml b/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml index deb7e8e837..7280b34367 100644 --- a/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml +++ b/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml @@ -1,9 +1,8 @@ ---- {{$action := .action}} + steps: - name: {{$action}} Containerd Measurements measurements: - # ContainerdCriImagePullingThroughput - WORKS (has histogram buckets) - identifier: ContainerdCriImagePullingThroughput method: GenericPrometheusQuery params: @@ -12,12 +11,10 @@ steps: metricVersion: v1 unit: MB/s queries: - - name: Perc100 - query: histogram_quantile(1, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - name: Perc99 query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc75 + query: histogram_quantile(0.75, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - name: Perc50 query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - name: Sum diff --git a/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml b/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml index a3c8702fac..5fcfa2261b 100644 --- a/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml +++ b/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml @@ -3,55 +3,6 @@ steps: - name: {{$action}} Kubelet Measurements measurements: - - Identifier: KubeletPodStartupSLIDuration - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: KubeletPodStartupSLIDuration - metricVersion: v1 - unit: s - dimensions: - - node - queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(kubelet_pod_start_sli_duration_seconds_bucket[10m])) by (node, le)) - threshold: 5 - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(kubelet_pod_start_sli_duration_seconds_bucket[10m])) by (node, le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(rate(kubelet_pod_start_sli_duration_seconds_bucket[10m])) by (node, le)) - - Identifier: KubeletPodStartupDuration - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: KubeletPodStartupDuration - metricVersion: v1 - unit: s - dimensions: - - node - queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_bucket[10m])) by (node, le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(kubelet_pod_start_duration_seconds_bucket[10m])) by (node, le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(rate(kubelet_pod_start_duration_seconds_bucket[10m])) by (node, le)) - - Identifier: KubeletPodStartupTotalDuration - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: KubeletPodStartupTotalDuration - metricVersion: v1 - unit: s - dimensions: - - node - queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(kubelet_pod_start_total_duration_seconds_bucket[10m])) by (node, le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(kubelet_pod_start_total_duration_seconds_bucket[10m])) by (node, le)) - - name: 
Perc50 - query: histogram_quantile(0.50, sum(rate(kubelet_pod_start_total_duration_seconds_bucket[10m])) by (node, le)) - Identifier: KubeletRuntimeOperationDurationWithoutPullImage Method: GenericPrometheusQuery Params: @@ -65,8 +16,8 @@ steps: queries: - name: Perc99 query: histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) + - name: Perc75 + query: histogram_quantile(0.75, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - name: Perc50 query: histogram_quantile(0.50, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - name: Sum @@ -84,25 +35,9 @@ steps: queries: - name: Perc99 query: histogram_quantile(0.99, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) + - name: Perc75 + query: histogram_quantile(0.75, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - name: Perc50 query: histogram_quantile(0.50, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - name: Sum query: kubelet_runtime_operations_duration_seconds_sum{operation_type="pull_image"} - - Identifier: KubeletRunSandboxDuration - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: KubeletRunSandboxDuration - metricVersion: v1 - unit: s - dimensions: - - node - queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(kubelet_run_podsandbox_duration_seconds_bucket[10m])) by (node, le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(kubelet_run_podsandbox_duration_seconds_bucket[10m])) by (node, le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(rate(kubelet_run_podsandbox_duration_seconds_bucket[10m])) by (node, le)) diff --git a/modules/python/clusterloader2/image_pull/image_pull.py b/modules/python/clusterloader2/image_pull/image_pull.py index 65e1ed7aa9..676e9af8c8 100644 --- a/modules/python/clusterloader2/image_pull/image_pull.py +++ b/modules/python/clusterloader2/image_pull/image_pull.py @@ -12,8 +12,8 @@ logger = get_logger(__name__) -def write_overrides(cl2_config_dir: str, provider: str): - """Write CL2 override file with Prometheus configuration.""" +def override_config_clusterloader2(cl2_config_dir: str, provider: str): + """Override CL2 config file with Prometheus configuration.""" override_file = os.path.join(cl2_config_dir, "overrides.yaml") with open(override_file, "w", encoding="utf-8") as file: file.write(f"CL2_PROVIDER: {provider}\n") @@ -36,9 +36,6 @@ def execute_clusterloader2( logger.info(f"Starting image-pull test with CL2 image: {cl2_image}") logger.info(f"Config dir: {cl2_config_dir}, Report dir: {cl2_report_dir}") - # Write overrides file with Prometheus configuration - write_overrides(cl2_config_dir, provider) - run_cl2_command( kubeconfig=kubeconfig, cl2_image=cl2_image, @@ -130,6 
+127,13 @@ def main(): parser = argparse.ArgumentParser(description="Image Pull performance test") subparsers = parser.add_subparsers(dest="command") + # Override subcommand + parser_override = subparsers.add_parser("override", help="Override CL2 config file") + parser_override.add_argument("--cl2_config_dir", type=str, required=True, + help="Path to CL2 config directory") + parser_override.add_argument("--provider", type=str, required=True, + help="Cloud provider (aks, eks, gke)") + # Execute subcommand parser_execute = subparsers.add_parser("execute", help="Execute image-pull test") parser_execute.add_argument("--cl2_image", type=str, required=True, @@ -163,7 +167,12 @@ def main(): args = parser.parse_args() - if args.command == "execute": + if args.command == "override": + override_config_clusterloader2( + cl2_config_dir=args.cl2_config_dir, + provider=args.provider + ) + elif args.command == "execute": execute_clusterloader2( cl2_image=args.cl2_image, cl2_config_dir=args.cl2_config_dir, diff --git a/modules/python/tests/test_image_pull.py b/modules/python/tests/test_image_pull.py index 28e0225ee0..1f55be92a5 100644 --- a/modules/python/tests/test_image_pull.py +++ b/modules/python/tests/test_image_pull.py @@ -9,7 +9,7 @@ from clusterloader2.image_pull.image_pull import ( execute_clusterloader2, collect_clusterloader2, - write_overrides, + override_config_clusterloader2, main ) @@ -17,10 +17,10 @@ class TestImagePullFunctions(unittest.TestCase): """Test cases for image_pull execute and collect functions.""" - def test_write_overrides(self): - """Test write_overrides creates correct override file.""" + def test_override_config_clusterloader2(self): + """Test override_config_clusterloader2 creates correct override file.""" with tempfile.TemporaryDirectory() as tmpdir: - write_overrides(tmpdir, "aks") + override_config_clusterloader2(tmpdir, "aks") override_file = os.path.join(tmpdir, "overrides.yaml") self.assertTrue(os.path.exists(override_file)) @@ -33,8 +33,7 @@ def test_write_overrides(self): self.assertIn("CL2_PROMETHEUS_NODE_SELECTOR", content) @patch('clusterloader2.image_pull.image_pull.run_cl2_command') - @patch('clusterloader2.image_pull.image_pull.write_overrides') - def test_execute_clusterloader2(self, mock_write_overrides, mock_run_cl2): + def test_execute_clusterloader2(self, mock_run_cl2): """Test execute_clusterloader2 calls run_cl2_command with correct params.""" execute_clusterloader2( cl2_image="ghcr.io/azure/clusterloader2:v20250311", @@ -44,7 +43,6 @@ def test_execute_clusterloader2(self, mock_write_overrides, mock_run_cl2): provider="aks" ) - mock_write_overrides.assert_called_once_with("/tmp/config", "aks") mock_run_cl2.assert_called_once_with( kubeconfig="/tmp/kubeconfig", cl2_image="ghcr.io/azure/clusterloader2:v20250311", diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 403e2cbc69..10e795c6a9 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -2,7 +2,7 @@ trigger: none variables: SCENARIO_TYPE: perf-eval - SCENARIO_NAME: image-pull-test + SCENARIO_NAME: image-pull-throughput stages: - stage: azure_eastus2_image_pull diff --git a/scenarios/perf-eval/image-pull-test/README.md b/scenarios/perf-eval/image-pull-test/README.md deleted file mode 100644 index a9bb52d167..0000000000 --- a/scenarios/perf-eval/image-pull-test/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# Image Pull Performance Test - -## Overview - -Measures container image pull performance on AKS clusters 
using ClusterLoader2. - -## Test Scenario - -Creates 10 Deployments with 1 replica each (10 pods total), pulling a container image to measure: -- How fast images are pulled across cluster nodes -- Pod startup latency when pulling images -- Containerd throughput during parallel image pulls - -## Metrics Collected - -| Metric | Source | Description | -|--------|--------|-------------| -| Kubelet Image Pull Duration | kubelet:10250 | P50/P90/P99 latency per node | -| Containerd Throughput | containerd:10257 | MB/s, total data, pull count | -| Network Plugin Operations | containerd:10257 | Pod network setup/teardown time | -| Pod Startup Latency | API server | End-to-end pod scheduling time | - -## Configuration - -### Test Image - -The test uses `akscritelescope.azurecr.io/e2e-test-images/resource-consumer:1.13` by default. - -To change the image, edit `modules/python/clusterloader2/image_pull/config/image-pull.yaml`. - -### Cluster Settings - -Edit `scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars` for cluster configuration. - -## Pipeline - -The test runs via Azure DevOps pipeline: -- **Pipeline**: `pipelines/perf-eval/CRI Benchmark/image-pull.yml` -- **Engine**: `steps/engine/clusterloader2/image_pull/` -- **Topology**: `steps/topology/image-pull/` - -## Files - -| Path | Purpose | -|------|---------| -| `modules/python/clusterloader2/image_pull/` | Python module and CL2 config | -| `steps/engine/clusterloader2/image_pull/` | Pipeline engine steps | -| `steps/topology/image-pull/` | Pipeline topology steps | -| `pipelines/perf-eval/CRI Benchmark/image-pull.yml` | Pipeline definition | -| `scenarios/perf-eval/image-pull-test/terraform-inputs/` | Terraform configuration | diff --git a/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-throughput/terraform-inputs/azure.tfvars similarity index 96% rename from scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars rename to scenarios/perf-eval/image-pull-throughput/terraform-inputs/azure.tfvars index 4f20a2976c..e8ba6f36ae 100644 --- a/scenarios/perf-eval/image-pull-test/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/image-pull-throughput/terraform-inputs/azure.tfvars @@ -1,7 +1,7 @@ scenario_type = "perf-eval" -scenario_name = "image-pull-test" +scenario_name = "image-pull-throughput" deletion_delay = "2h" -owner = "telescope" +owner = "acr" network_config_list = [ { diff --git a/scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json b/scenarios/perf-eval/image-pull-throughput/terraform-test-inputs/azure.json similarity index 100% rename from scenarios/perf-eval/image-pull-test/terraform-test-inputs/azure.json rename to scenarios/perf-eval/image-pull-throughput/terraform-test-inputs/azure.json diff --git a/steps/engine/clusterloader2/image_pull/execute.yml b/steps/engine/clusterloader2/image_pull/execute.yml index ded0fe39e8..877a3060be 100644 --- a/steps/engine/clusterloader2/image_pull/execute.yml +++ b/steps/engine/clusterloader2/image_pull/execute.yml @@ -9,6 +9,24 @@ parameters: type: string steps: + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE override \ + --cl2_config_dir ${CL2_CONFIG_DIR} \ + --provider $CLOUD + workingDirectory: modules/python + env: + ${{ if eq(parameters.cloud, 'azure') }}: + CLOUD: aks + ${{ elseif eq(parameters.cloud, 'aws') }}: + CLOUD: eks + ${{ else }}: + CLOUD: ${{ parameters.cloud }} + PYTHON_SCRIPT_FILE: 
$(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/image_pull.py + CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/config + displayName: "Override CL2 Config" + - script: | set -eo pipefail From 0430f5a3e2254509b4cff8b50aca29247a40657f Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Fri, 12 Dec 2025 05:52:36 +0000 Subject: [PATCH 07/30] Fix image-pull-n10 scenario to follow existing conventions --- .../config/containerd-measurements.yaml | 18 +---- ...ployment.yaml => deployment_template.yaml} | 3 +- .../image_pull/config/image-pull.yaml | 79 +++++++++++-------- .../config/kubelet-measurement.yaml | 8 +- scenarios/perf-eval/image-pull-n10/README.md | 40 ++++++++++ .../terraform-inputs/azure.tfvars | 2 +- .../terraform-test-inputs/azure.json | 0 7 files changed, 97 insertions(+), 53 deletions(-) rename modules/python/clusterloader2/image_pull/config/{deployment.yaml => deployment_template.yaml} (92%) create mode 100644 scenarios/perf-eval/image-pull-n10/README.md rename scenarios/perf-eval/{image-pull-throughput => image-pull-n10}/terraform-inputs/azure.tfvars (97%) rename scenarios/perf-eval/{image-pull-throughput => image-pull-n10}/terraform-test-inputs/azure.json (100%) diff --git a/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml b/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml index 7280b34367..c84e4cf71b 100644 --- a/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml +++ b/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml @@ -13,18 +13,14 @@ steps: queries: - name: Perc99 query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc75 - query: histogram_quantile(0.75, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - name: Perc50 query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - name: Sum query: sum(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}) - name: Count query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) - - name: Average - query: sum(rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])) - - # ContainerdCriNetworkPluginOperations - Sum/Count only (histograms may not work) - identifier: ContainerdCriNetworkPluginOperations method: GenericPrometheusQuery params: @@ -39,10 +35,6 @@ steps: query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}) by (operation_type) - name: Count query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}) by (operation_type) - - name: Average - query: sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}[%v])) by (operation_type) / sum(rate(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}[%v])) by (operation_type) - - # ContainerdCriSandboxCreateNetwork - Sum/Count only - identifier: ContainerdCriSandboxCreateNetwork method: GenericPrometheusQuery params: @@ -55,10 +47,6 @@ steps: query: 
sum(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}) - name: Count query: sum(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}) - - name: Average - query: sum(rate(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}[%v])) - - # ContainerdCriSandboxDeleteNetwork - Sum/Count only - identifier: ContainerdCriSandboxDeleteNetwork method: GenericPrometheusQuery params: @@ -71,5 +59,3 @@ steps: query: sum(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}) - name: Count query: sum(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}) - - name: Average - query: sum(rate(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}[%v])) diff --git a/modules/python/clusterloader2/image_pull/config/deployment.yaml b/modules/python/clusterloader2/image_pull/config/deployment_template.yaml similarity index 92% rename from modules/python/clusterloader2/image_pull/config/deployment.yaml rename to modules/python/clusterloader2/image_pull/config/deployment_template.yaml index a355709f01..bc10f2d621 100644 --- a/modules/python/clusterloader2/image_pull/config/deployment.yaml +++ b/modules/python/clusterloader2/image_pull/config/deployment_template.yaml @@ -8,10 +8,11 @@ spec: replicas: {{.Replicas}} selector: matchLabels: - group: {{.Group}} + name: {{.Name}} template: metadata: labels: + name: {{.Name}} group: {{.Group}} spec: containers: diff --git a/modules/python/clusterloader2/image_pull/config/image-pull.yaml b/modules/python/clusterloader2/image_pull/config/image-pull.yaml index 895787b47e..4f780661ef 100644 --- a/modules/python/clusterloader2/image_pull/config/image-pull.yaml +++ b/modules/python/clusterloader2/image_pull/config/image-pull.yaml @@ -1,24 +1,39 @@ name: image-pull-test + +{{$replicasPerNamespace := DefaultParam .CL2_REPLICAS_PER_NAMESPACE 10}} +{{$replicas := DefaultParam .CL2_REPLICAS 1}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "5m"}} +{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "3m"}} +{{$image := DefaultParam .CL2_IMAGE "akscritelescope.azurecr.io/e2e-test-images/resource-consumer:1.13"}} +{{$groupName := DefaultParam .CL2_GROUP_NAME "image-pull"}} + namespace: number: 1 + prefix: image-pull + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + tuningSets: - name: UniformQPS qpsLoad: qps: 10 + steps: -- measurements: - - identifier: PodStartupLatency - method: PodStartupLatency - params: +- name: Start measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: action: start - labelSelector: group = image-pull - threshold: 3m + labelSelector: group = {{$groupName}} + threshold: {{$podStartupLatencyThreshold}} - module: - path: containerd-measurements.yaml + path: /containerd-measurements.yaml params: action: start - module: - path: kubelet-measurement.yaml + path: /kubelet-measurement.yaml params: action: start - name: Start deployment @@ -26,46 +41,48 @@ steps: - namespaceRange: min: 1 max: 1 - replicasPerNamespace: 10 + replicasPerNamespace: {{$replicasPerNamespace}} tuningSet: UniformQPS objectBundle: - basename: image-pull-deployment - objectTemplatePath: deployment.yaml + objectTemplatePath: deployment_template.yaml 
templateFillMap: - Replicas: 1 - Group: image-pull - Image: akscritelescope.azurecr.io/e2e-test-images/resource-consumer:1.13 + Replicas: {{$replicas}} + Group: {{$groupName}} + Image: {{$image}} - name: Wait for pods to start and metrics to be collected measurements: - - identifier: WaitForRunningDeployments - method: WaitForControlledPodsRunning - params: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: action: start apiVersion: apps/v1 kind: Deployment - labelSelector: group = image-pull - operationTimeout: 5m -- measurements: - - identifier: WaitForRunningDeployments - method: WaitForControlledPodsRunning - params: + labelSelector: group = {{$groupName}} + operationTimeout: {{$operationTimeout}} +- name: Waiting for pods to be running + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: action: gather - name: Wait for containerd metrics to accumulate (histogram buckets need multiple scrapes) measurements: - - identifier: Sleep - method: Sleep - params: + - Identifier: Sleep + Method: Sleep + Params: duration: 10m - module: - path: containerd-measurements.yaml + path: /containerd-measurements.yaml params: action: gather - module: - path: kubelet-measurement.yaml + path: /kubelet-measurement.yaml params: action: gather -- measurements: - - identifier: PodStartupLatency - method: PodStartupLatency - params: +- name: Gather measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: action: gather diff --git a/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml b/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml index 5fcfa2261b..d44a852d19 100644 --- a/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml +++ b/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml @@ -16,8 +16,8 @@ steps: queries: - name: Perc99 query: histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - - name: Perc75 - query: histogram_quantile(0.75, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - name: Perc50 query: histogram_quantile(0.50, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - name: Sum @@ -35,8 +35,8 @@ steps: queries: - name: Perc99 query: histogram_quantile(0.99, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - - name: Perc75 - query: histogram_quantile(0.75, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - name: Perc50 query: histogram_quantile(0.50, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - name: Sum diff --git a/scenarios/perf-eval/image-pull-n10/README.md b/scenarios/perf-eval/image-pull-n10/README.md 
new file mode 100644 index 0000000000..0bea247830 --- /dev/null +++ b/scenarios/perf-eval/image-pull-n10/README.md @@ -0,0 +1,40 @@ +# Scenario: image-pull-n10 + +## Overview + +This scenario measures container image pull throughput and performance on AKS clusters. It benchmarks how quickly nodes can pull container images from registries under various conditions. + +## Infrastructure + +| Component | Configuration | +|-----------|---------------| +| Cloud Provider | Azure | +| Cluster SKU | Standard | +| Network Plugin | Azure CNI Overlay | +| Default Node Pool | 3 x Standard_D4s_v3 | +| Prometheus Pool | 1 x Standard_D8s_v3 | +| User Pool | 10 x Standard_D4s_v3 | + +## Node Pools + +| Pool | Purpose | Node Count | VM Size | Labels | +|------|---------|------------|---------|--------| +| default | System/critical addons | 3 | Standard_D4s_v3 | - | +| prompool | Prometheus monitoring | 1 | Standard_D8s_v3 | `prometheus=true` | +| userpool | Image pull tests | 10 | Standard_D4s_v3 | `image-pull-test=true` | + +## Network Configuration + +- VNet: `10.0.0.0/9` +- Pod CIDR: `10.0.0.0/9` +- Service CIDR: `192.168.0.0/16` +- DNS Service IP: `192.168.0.10` + +## Usage + +Tests are executed on nodes labeled with `image-pull-test=true` in the user pool. + +## References + +- [Best Practices](../../../docs/best-practices.md) +- [Test Scenario Implementation Guide](../../../docs/test-scenario-implementation-guide.md) diff --git a/scenarios/perf-eval/image-pull-throughput/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars similarity index 97% rename from scenarios/perf-eval/image-pull-throughput/terraform-inputs/azure.tfvars rename to scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars index e8ba6f36ae..acfc1f2185 100644 --- a/scenarios/perf-eval/image-pull-throughput/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars @@ -1,5 +1,5 @@ scenario_type = "perf-eval" -scenario_name = "image-pull-throughput" +scenario_name = "image-pull-n10" deletion_delay = "2h" owner = "acr" diff --git a/scenarios/perf-eval/image-pull-throughput/terraform-test-inputs/azure.json b/scenarios/perf-eval/image-pull-n10/terraform-test-inputs/azure.json similarity index 100% rename from scenarios/perf-eval/image-pull-throughput/terraform-test-inputs/azure.json rename to scenarios/perf-eval/image-pull-n10/terraform-test-inputs/azure.json From ef74ffba757e0cb234f487ef73541e1fe789216c Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Fri, 12 Dec 2025 06:34:14 +0000 Subject: [PATCH 08/30] Update pipeline to use image-pull-n10 scenario name --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 10e795c6a9..61ec9db132 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -2,7 +2,7 @@ trigger: none variables: SCENARIO_TYPE: perf-eval - SCENARIO_NAME: image-pull-throughput + SCENARIO_NAME: image-pull-n10 stages: - stage: azure_eastus2_image_pull From 2940ef897a738a7d50beac908a4d8641349b8b64 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Sun, 14 Dec 2025 23:33:04 +0000 Subject: [PATCH 09/30] Add image-pull-n10 pipeline and revert new-pipeline-test.yml --- .../Image Pull Benchmark/image-pull-n10.yml | 28 +++++++++++++++ pipelines/system/new-pipeline-test.yml | 35 +++++++++---------- scenarios/perf-eval/image-pull-n10/README.md | 4 +-- 3 files 
changed, 46 insertions(+), 21 deletions(-)
 create mode 100644 pipelines/perf-eval/Image Pull Benchmark/image-pull-n10.yml

diff --git a/pipelines/perf-eval/Image Pull Benchmark/image-pull-n10.yml b/pipelines/perf-eval/Image Pull Benchmark/image-pull-n10.yml
new file mode 100644
index 0000000000..61ec9db132
--- /dev/null
+++ b/pipelines/perf-eval/Image Pull Benchmark/image-pull-n10.yml
@@ -0,0 +1,28 @@
+trigger: none
+
+variables:
+  SCENARIO_TYPE: perf-eval
+  SCENARIO_NAME: image-pull-n10
+
+stages:
+  - stage: azure_eastus2_image_pull
+    dependsOn: []
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250311"
+          topology: image-pull
+          matrix:
+            image-pull-10pods:
+              deployment_count: 10
+              replicas: 1
+          kubernetes_version: "1.33"
+          max_parallel: 1
+          credential_type: service_connection
+          ssh_key_enabled: false
+          timeout_in_minutes: 60
diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index 61ec9db132..63d55f02d9 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -1,28 +1,25 @@
 trigger: none
 
 variables:
-  SCENARIO_TYPE: perf-eval
-  SCENARIO_NAME: image-pull-n10
+  SCENARIO_TYPE:
+  SCENARIO_NAME:
 
 stages:
-  - stage: azure_eastus2_image_pull
+  - stage: # format: <cloud>[_<region>]+ (e.g. azure_eastus2, aws_eastus_westus)
     dependsOn: []
     jobs:
-      - template: /jobs/competitive-test.yml
+      - template: /jobs/competitive-test.yml # must keep as is
         parameters:
-          cloud: azure
-          regions:
-            - eastus2
-          engine: clusterloader2
-          engine_input:
-            image: "ghcr.io/azure/clusterloader2:v20250311"
-          topology: image-pull
-          matrix:
-            image-pull-10pods:
-              deployment_count: 10
-              replicas: 1
-          kubernetes_version: "1.33"
-          max_parallel: 1
-          credential_type: service_connection
+          cloud: # e.g. azure, aws
+          regions: # list of regions
+            - region1 # e.g. eastus2
+          topology: # e.g. cluster-autoscaler
+          engine: # e.g. clusterloader2
+          matrix: # list of test parameters to customize the provisioned resources
+            <test_name>:
+              <parameter1>: <value1>
+              <parameter2>: <value2>
+          max_parallel: # required
+          credential_type: service_connection # required
           ssh_key_enabled: false
-          timeout_in_minutes: 60
+          timeout_in_minutes: 60 # if not specified, default is 60
diff --git a/scenarios/perf-eval/image-pull-n10/README.md b/scenarios/perf-eval/image-pull-n10/README.md
index 0bea247830..a71fae787e 100644
--- a/scenarios/perf-eval/image-pull-n10/README.md
+++ b/scenarios/perf-eval/image-pull-n10/README.md
@@ -1,8 +1,8 @@
-# Scenario: image-pull-n10
+# image-pull-n10
 
 ## Overview
 
-This scenario measures container image pull throughput and performance on AKS clusters. It benchmarks how quickly nodes can pull container images from registries under various conditions.
+Deploys 10 pods with `imagePullPolicy: Always` to trigger image pulls, and collects containerd image pulling throughput (MB/s) and kubelet runtime operation duration metrics using CL2.
## Infrastructure From 59bf82f38d9edc8fb9304061f6ec4b8eebe90992 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 15 Dec 2025 04:38:33 +0000 Subject: [PATCH 10/30] refactor: reuse CRI module with scrape_containerd toggle for image-pull scenario --- .../clusterloader2/cri/config/config.yaml | 15 ++ .../cri/config/containerd-measurements.yaml | 59 +++++ modules/python/clusterloader2/cri/cri.py | 23 +- .../config/containerd-measurements.yaml | 61 ----- .../config/deployment_template.yaml | 32 --- .../image_pull/config/image-pull.yaml | 88 ------- .../config/kubelet-measurement.yaml | 43 ---- .../clusterloader2/image_pull/image_pull.py | 198 ---------------- modules/python/tests/test_cri.py | 17 +- modules/python/tests/test_image_pull.py | 215 ------------------ .../image-pull-n10.yml | 8 +- pipelines/system/new-pipeline-test.yml | 35 +-- scenarios/perf-eval/image-pull-n10/README.md | 24 +- steps/engine/clusterloader2/cri/execute.yml | 4 +- .../clusterloader2/image_pull/collect.yml | 32 --- .../clusterloader2/image_pull/execute.yml | 52 ----- .../image-pull/collect-clusterloader2.yml | 17 -- .../image-pull/execute-clusterloader2.yml | 17 -- .../image-pull/validate-resources.yml | 19 -- 19 files changed, 137 insertions(+), 822 deletions(-) create mode 100644 modules/python/clusterloader2/cri/config/containerd-measurements.yaml delete mode 100644 modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml delete mode 100644 modules/python/clusterloader2/image_pull/config/deployment_template.yaml delete mode 100644 modules/python/clusterloader2/image_pull/config/image-pull.yaml delete mode 100644 modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml delete mode 100644 modules/python/clusterloader2/image_pull/image_pull.py delete mode 100644 modules/python/tests/test_image_pull.py rename pipelines/perf-eval/{Image Pull Benchmark => CRI Benchmark}/image-pull-n10.yml (77%) delete mode 100644 steps/engine/clusterloader2/image_pull/collect.yml delete mode 100644 steps/engine/clusterloader2/image_pull/execute.yml delete mode 100644 steps/topology/image-pull/collect-clusterloader2.yml delete mode 100644 steps/topology/image-pull/execute-clusterloader2.yml delete mode 100644 steps/topology/image-pull/validate-resources.yml diff --git a/modules/python/clusterloader2/cri/config/config.yaml b/modules/python/clusterloader2/cri/config/config.yaml index 745175fc64..1696b68d45 100644 --- a/modules/python/clusterloader2/cri/config/config.yaml +++ b/modules/python/clusterloader2/cri/config/config.yaml @@ -20,6 +20,7 @@ name: resource-consumer {{$provider := DefaultParam .CL2_PROVIDER "aks"}} {{$osType := DefaultParam .CL2_OS_TYPE "linux"}} {{$scrapeKubelets := DefaultParam .CL2_SCRAPE_KUBELETS false}} +{{$scrapeContainerd := DefaultParam .CL2_SCRAPE_CONTAINERD false}} {{$hostNetwork := DefaultParam .CL2_HOST_NETWORK "true"}} namespace: @@ -67,6 +68,13 @@ steps: action: start {{end}} + {{if $scrapeContainerd}} + - module: + path: /containerd-measurements.yaml + params: + action: start + {{end}} + {{range $j := Loop $steps}} - name: Create deployment {{$j}} phases: @@ -141,6 +149,13 @@ steps: action: gather {{end}} + {{if $scrapeContainerd}} + - module: + path: /containerd-measurements.yaml + params: + action: gather + {{end}} + {{range $j := Loop $steps}} - name: Deleting deployments {{$j}} phases: diff --git a/modules/python/clusterloader2/cri/config/containerd-measurements.yaml b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml new file mode 100644 
index 0000000000..2c33ffae1f --- /dev/null +++ b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml @@ -0,0 +1,59 @@ +{{$action := .action}} # start, gather + +steps: + - name: {{$action}} Containerd Measurements + measurements: + - Identifier: ContainerdCriImagePullingThroughput + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ContainerdCriImagePullingThroughput + metricVersion: v1 + unit: MB/s + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Count + query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) + - Identifier: ContainerdCriNetworkPluginOperations + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ContainerdCriNetworkPluginOperations + metricVersion: v1 + unit: s + dimensions: + - operation_type + queries: + - name: Sum + query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}) by (operation_type) + - name: Count + query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}) by (operation_type) + - Identifier: ContainerdCriSandboxCreateNetwork + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ContainerdCriSandboxCreateNetwork + metricVersion: v1 + unit: s + queries: + - name: Sum + query: sum(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}) + - name: Count + query: sum(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}) + - Identifier: ContainerdCriSandboxDeleteNetwork + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ContainerdCriSandboxDeleteNetwork + metricVersion: v1 + unit: s + queries: + - name: Sum + query: sum(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}) + - name: Count + query: sum(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}) diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index 9cebebf502..85b4e725fd 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -17,7 +17,7 @@ def override_config_clusterloader2( node_count, node_per_step, max_pods, repeats, operation_timeout, load_type, scale_enabled, pod_startup_latency_threshold, provider, - os_type, scrape_kubelets, host_network, override_file): + os_type, scrape_kubelets, scrape_containerd, host_network, override_file): client = KubernetesClient(os.path.expanduser("~/.kube/config")) nodes = client.get_nodes(label_selector="cri-resource-consume=true") if len(nodes) == 0: @@ -90,13 +90,14 @@ def override_config_clusterloader2( file.write(f"CL2_PROVIDER: {provider}\n") file.write(f"CL2_OS_TYPE: {os_type}\n") file.write(f"CL2_SCRAPE_KUBELETS: {str(scrape_kubelets).lower()}\n") + file.write(f"CL2_SCRAPE_CONTAINERD: {str(scrape_containerd).lower()}\n") file.write(f"CL2_HOST_NETWORK: {str(host_network).lower()}\n") file.close() -def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider, scrape_kubelets): +def 
execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider, scrape_kubelets, scrape_containerd): run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, overrides=True, enable_prometheus=True, - tear_down_prometheus=False, scrape_kubelets=scrape_kubelets) + tear_down_prometheus=False, scrape_kubelets=scrape_kubelets, scrape_containerd=scrape_containerd) def verify_measurement(): client = KubernetesClient(os.path.expanduser("~/.kube/config")) @@ -258,6 +259,13 @@ def main(): default=False, help="Whether to scrape kubelets", ) + parser_override.add_argument( + "--scrape_containerd", + type=str2bool, + choices=[True, False], + default=False, + help="Whether to scrape containerd", + ) parser_override.add_argument( "--host_network", type=str2bool, @@ -291,6 +299,13 @@ def main(): default=False, help="Whether to scrape kubelets", ) + parser_execute.add_argument( + "--scrape_containerd", + type=str2bool, + choices=[True, False], + default=False, + help="Whether to scrape containerd", + ) # Sub-command for collect_clusterloader2 parser_collect = subparsers.add_parser( @@ -344,6 +359,7 @@ def main(): args.provider, args.os_type, args.scrape_kubelets, + args.scrape_containerd, args.host_network, args.cl2_override_file, ) @@ -355,6 +371,7 @@ def main(): args.kubeconfig, args.provider, args.scrape_kubelets, + args.scrape_containerd, ) elif args.command == "collect": collect_clusterloader2( diff --git a/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml b/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml deleted file mode 100644 index c84e4cf71b..0000000000 --- a/modules/python/clusterloader2/image_pull/config/containerd-measurements.yaml +++ /dev/null @@ -1,61 +0,0 @@ -{{$action := .action}} - -steps: - - name: {{$action}} Containerd Measurements - measurements: - - identifier: ContainerdCriImagePullingThroughput - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriImagePullingThroughput - metricVersion: v1 - unit: MB/s - queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Sum - query: sum(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) - - identifier: ContainerdCriNetworkPluginOperations - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriNetworkPluginOperations - metricVersion: v1 - unit: s - dimensions: - - operation_type - queries: - - name: Sum - query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}) by (operation_type) - - name: Count - query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}) by (operation_type) - - identifier: ContainerdCriSandboxCreateNetwork - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriSandboxCreateNetwork - metricVersion: v1 - unit: s - queries: - - name: Sum - query: 
sum(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}) - - identifier: ContainerdCriSandboxDeleteNetwork - method: GenericPrometheusQuery - params: - action: {{$action}} - metricName: ContainerdCriSandboxDeleteNetwork - metricVersion: v1 - unit: s - queries: - - name: Sum - query: sum(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}) diff --git a/modules/python/clusterloader2/image_pull/config/deployment_template.yaml b/modules/python/clusterloader2/image_pull/config/deployment_template.yaml deleted file mode 100644 index bc10f2d621..0000000000 --- a/modules/python/clusterloader2/image_pull/config/deployment_template.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{.Name}} - labels: - group: {{.Group}} -spec: - replicas: {{.Replicas}} - selector: - matchLabels: - name: {{.Name}} - template: - metadata: - labels: - name: {{.Name}} - group: {{.Group}} - spec: - containers: - - name: test-container - image: {{.Image}} - imagePullPolicy: Always - command: ["sleep"] - args: ["3600"] - resources: - requests: - cpu: 100m - memory: 100Mi - readinessProbe: - exec: - command: ["true"] - initialDelaySeconds: 5 - periodSeconds: 10 diff --git a/modules/python/clusterloader2/image_pull/config/image-pull.yaml b/modules/python/clusterloader2/image_pull/config/image-pull.yaml deleted file mode 100644 index 4f780661ef..0000000000 --- a/modules/python/clusterloader2/image_pull/config/image-pull.yaml +++ /dev/null @@ -1,88 +0,0 @@ -name: image-pull-test - -{{$replicasPerNamespace := DefaultParam .CL2_REPLICAS_PER_NAMESPACE 10}} -{{$replicas := DefaultParam .CL2_REPLICAS 1}} -{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "5m"}} -{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "3m"}} -{{$image := DefaultParam .CL2_IMAGE "akscritelescope.azurecr.io/e2e-test-images/resource-consumer:1.13"}} -{{$groupName := DefaultParam .CL2_GROUP_NAME "image-pull"}} - -namespace: - number: 1 - prefix: image-pull - deleteStaleNamespaces: true - deleteAutomanagedNamespaces: true - enableExistingNamespaces: false - -tuningSets: -- name: UniformQPS - qpsLoad: - qps: 10 - -steps: -- name: Start measurements - measurements: - - Identifier: PodStartupLatency - Method: PodStartupLatency - Params: - action: start - labelSelector: group = {{$groupName}} - threshold: {{$podStartupLatencyThreshold}} -- module: - path: /containerd-measurements.yaml - params: - action: start -- module: - path: /kubelet-measurement.yaml - params: - action: start -- name: Start deployment - phases: - - namespaceRange: - min: 1 - max: 1 - replicasPerNamespace: {{$replicasPerNamespace}} - tuningSet: UniformQPS - objectBundle: - - basename: image-pull-deployment - objectTemplatePath: deployment_template.yaml - templateFillMap: - Replicas: {{$replicas}} - Group: {{$groupName}} - Image: {{$image}} -- name: Wait for pods to start and metrics to be collected - measurements: - - Identifier: WaitForRunningDeployments - Method: WaitForControlledPodsRunning - Params: - action: start - apiVersion: apps/v1 - kind: Deployment - labelSelector: group = {{$groupName}} - operationTimeout: {{$operationTimeout}} -- name: Waiting for pods to be running - measurements: - - Identifier: WaitForRunningDeployments - Method: 
WaitForControlledPodsRunning - Params: - action: gather -- name: Wait for containerd metrics to accumulate (histogram buckets need multiple scrapes) - measurements: - - Identifier: Sleep - Method: Sleep - Params: - duration: 10m -- module: - path: /containerd-measurements.yaml - params: - action: gather -- module: - path: /kubelet-measurement.yaml - params: - action: gather -- name: Gather measurements - measurements: - - Identifier: PodStartupLatency - Method: PodStartupLatency - Params: - action: gather diff --git a/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml b/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml deleted file mode 100644 index d44a852d19..0000000000 --- a/modules/python/clusterloader2/image_pull/config/kubelet-measurement.yaml +++ /dev/null @@ -1,43 +0,0 @@ -{{$action := .action}} # start, gather - -steps: - - name: {{$action}} Kubelet Measurements - measurements: - - Identifier: KubeletRuntimeOperationDurationWithoutPullImage - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: KubeletRuntimeOperationDurationWithoutPullImage - metricVersion: v1 - unit: s - dimensions: - - node - - operation_type - queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type!="pull_image"}[10m])) by (node, operation_type, le)) - - name: Sum - query: kubelet_runtime_operations_duration_seconds_sum{operation_type!="pull_image"} - - Identifier: KubeletRuntimeOperationDurationWithPullImage - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: KubeletRuntimeOperationDurationWithPullImage - metricVersion: v1 - unit: s - dimensions: - - node - - operation_type - queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(avg_over_time(kubelet_runtime_operations_duration_seconds_bucket{operation_type="pull_image"}[5m])) by (node, operation_type, le)) - - name: Sum - query: kubelet_runtime_operations_duration_seconds_sum{operation_type="pull_image"} diff --git a/modules/python/clusterloader2/image_pull/image_pull.py b/modules/python/clusterloader2/image_pull/image_pull.py deleted file mode 100644 index 676e9af8c8..0000000000 --- a/modules/python/clusterloader2/image_pull/image_pull.py +++ /dev/null @@ -1,198 +0,0 @@ -"""Image Pull performance test using ClusterLoader2.""" - -import argparse -import json -import os -from datetime import datetime, timezone - -from clusterloader2.utils import parse_xml_to_json, run_cl2_command, get_measurement -from utils.logger_config import get_logger, setup_logging - -setup_logging() -logger = get_logger(__name__) - - -def override_config_clusterloader2(cl2_config_dir: str, provider: str): - """Override CL2 config file with Prometheus configuration.""" - override_file = 
os.path.join(cl2_config_dir, "overrides.yaml") - with open(override_file, "w", encoding="utf-8") as file: - file.write(f"CL2_PROVIDER: {provider}\n") - file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") - file.write("CL2_PROMETHEUS_CPU_SCALE_FACTOR: 30.0\n") - file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 30.0\n") - file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n") - file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n") - logger.info(f"Wrote overrides file: {override_file}") - - -def execute_clusterloader2( - cl2_image: str, - cl2_config_dir: str, - cl2_report_dir: str, - kubeconfig: str, - provider: str -): - """Execute ClusterLoader2 image-pull test.""" - logger.info(f"Starting image-pull test with CL2 image: {cl2_image}") - logger.info(f"Config dir: {cl2_config_dir}, Report dir: {cl2_report_dir}") - - run_cl2_command( - kubeconfig=kubeconfig, - cl2_image=cl2_image, - cl2_config_dir=cl2_config_dir, - cl2_report_dir=cl2_report_dir, - provider=provider, - cl2_config_file="image-pull.yaml", - overrides=True, - enable_prometheus=True, - scrape_kubelets=True, - scrape_containerd=True, - tear_down_prometheus=False - ) - - logger.info(f"Test completed. Results in: {cl2_report_dir}") - - -def collect_clusterloader2( - cl2_report_dir: str, - cloud_info: str, - run_id: str, - run_url: str, - result_file: str, - deployment_count: int = 10, - replicas: int = 1 -): - """Collect and format image-pull test results for Kusto ingestion.""" - logger.info(f"Collecting results from: {cl2_report_dir}") - - details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2) - json_data = json.loads(details) - testsuites = json_data["testsuites"] - - if testsuites: - status = "success" if testsuites[0]["failures"] == 0 else "failure" - else: - raise ValueError(f"No testsuites found in the report! 
Raw data: {details}") - - template = { - "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), - "deployment_count": deployment_count, - "replicas": replicas, - "total_pods": deployment_count * replicas, - "status": status, - "group": None, - "measurement": None, - "percentile": None, - "data": None, - "cloud_info": cloud_info, - "run_id": run_id, - "run_url": run_url - } - - content = "" - for f in os.listdir(cl2_report_dir): - file_path = os.path.join(cl2_report_dir, f) - if not file_path.endswith('.json'): - continue - - with open(file_path, 'r', encoding='utf-8') as file: - measurement, group_name = get_measurement(file_path) - if not measurement: - continue - - logger.info(f"Processing measurement: {measurement}, group: {group_name}") - data = json.loads(file.read()) - - if "dataItems" in data: - items = data["dataItems"] - if not items: - logger.info(f"No data items found in {file_path}") - continue - for item in items: - template["measurement"] = measurement - template["group"] = group_name - template["percentile"] = "dataItems" - template["data"] = item - content += json.dumps(template) + "\n" - - os.makedirs(os.path.dirname(result_file), exist_ok=True) - with open(result_file, 'w', encoding='utf-8') as file: - file.write(content) - - logger.info(f"Results written to: {result_file}") - - -def main(): - """CLI entry point with subcommands.""" - parser = argparse.ArgumentParser(description="Image Pull performance test") - subparsers = parser.add_subparsers(dest="command") - - # Override subcommand - parser_override = subparsers.add_parser("override", help="Override CL2 config file") - parser_override.add_argument("--cl2_config_dir", type=str, required=True, - help="Path to CL2 config directory") - parser_override.add_argument("--provider", type=str, required=True, - help="Cloud provider (aks, eks, gke)") - - # Execute subcommand - parser_execute = subparsers.add_parser("execute", help="Execute image-pull test") - parser_execute.add_argument("--cl2_image", type=str, required=True, - help="CL2 Docker image") - parser_execute.add_argument("--cl2_config_dir", type=str, required=True, - help="Path to CL2 config directory") - parser_execute.add_argument("--cl2_report_dir", type=str, required=True, - help="Path to CL2 report directory") - parser_execute.add_argument("--kubeconfig", type=str, - default=os.path.expanduser("~/.kube/config"), - help="Path to kubeconfig file") - parser_execute.add_argument("--provider", type=str, required=True, - help="Cloud provider (aks, eks, gke)") - - # Collect subcommand - parser_collect = subparsers.add_parser("collect", help="Collect test results") - parser_collect.add_argument("--cl2_report_dir", type=str, required=True, - help="Path to CL2 report directory") - parser_collect.add_argument("--cloud_info", type=str, required=True, - help="Cloud information JSON") - parser_collect.add_argument("--run_id", type=str, required=True, - help="Pipeline run ID") - parser_collect.add_argument("--run_url", type=str, required=True, - help="Pipeline run URL") - parser_collect.add_argument("--result_file", type=str, required=True, - help="Path to output result file") - parser_collect.add_argument("--deployment_count", type=int, default=10, - help="Number of deployments") - parser_collect.add_argument("--replicas", type=int, default=1, - help="Replicas per deployment") - - args = parser.parse_args() - - if args.command == "override": - override_config_clusterloader2( - cl2_config_dir=args.cl2_config_dir, - provider=args.provider - ) - elif args.command 
== "execute": - execute_clusterloader2( - cl2_image=args.cl2_image, - cl2_config_dir=args.cl2_config_dir, - cl2_report_dir=args.cl2_report_dir, - kubeconfig=args.kubeconfig, - provider=args.provider - ) - elif args.command == "collect": - collect_clusterloader2( - cl2_report_dir=args.cl2_report_dir, - cloud_info=args.cloud_info, - run_id=args.run_id, - run_url=args.run_url, - result_file=args.result_file, - deployment_count=args.deployment_count, - replicas=args.replicas - ) - else: - parser.print_help() - - -if __name__ == "__main__": - main() diff --git a/modules/python/tests/test_cri.py b/modules/python/tests/test_cri.py index 4c1ab63709..7f3daa22d9 100644 --- a/modules/python/tests/test_cri.py +++ b/modules/python/tests/test_cri.py @@ -63,6 +63,7 @@ def test_override_config_clusterloader2(self, mock_kubernetes_client, mock_open) os_type="linux", scrape_kubelets=True, host_network=True, + scrape_containerd=False, override_file="/mock/override.yaml" ) @@ -117,6 +118,7 @@ def test_override_config_clusterloader2_host_network_false(self, mock_kubernetes os_type="linux", scrape_kubelets=False, host_network=False, + scrape_containerd=False, override_file="/mock/override.yaml" ) @@ -135,13 +137,14 @@ def test_execute_clusterloader2(self, mock_run_cl2_command): cl2_report_dir="/mock/report", kubeconfig="/mock/kubeconfig", provider="aks", - scrape_kubelets=True + scrape_kubelets=True, + scrape_containerd=False ) # Verify the command execution mock_run_cl2_command.assert_called_once_with( "/mock/kubeconfig", "mock-image", "/mock/config", "/mock/report", "aks", - overrides=True, enable_prometheus=True, tear_down_prometheus=False, scrape_kubelets=True + overrides=True, enable_prometheus=True, tear_down_prometheus=False, scrape_kubelets=True, scrape_containerd=False ) @patch('clusterloader2.cri.cri.KubernetesClient') @@ -231,12 +234,13 @@ def test_override_command(self, mock_override): "--os_type", "linux", "--scrape_kubelets", "False", "--host_network", "False", + "--scrape_containerd", "False", "--cl2_override_file", "/tmp/override.yaml" ] with patch.object(sys, 'argv', test_args): main() mock_override.assert_called_once_with( - 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "linux", False, False, "/tmp/override.yaml" + 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "linux", False, False, False, "/tmp/override.yaml" ) @patch("clusterloader2.cri.cri.override_config_clusterloader2") @@ -260,7 +264,7 @@ def test_override_command_default_host_network(self, mock_override): with patch.object(sys, 'argv', test_args): main() mock_override.assert_called_once_with( - 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "linux", False, True, "/tmp/override.yaml" + 5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "linux", False, False, True, "/tmp/override.yaml" ) @patch("clusterloader2.cri.cri.execute_clusterloader2") @@ -272,13 +276,14 @@ def test_execute_command(self, mock_execute): "--cl2_report_dir", "/reports", "--kubeconfig", "/home/user/.kube/config", "--provider", "gcp", - "--scrape_kubelets", "True" + "--scrape_kubelets", "True", + "--scrape_containerd", "False" ] with patch.object(sys, 'argv', test_args): main() mock_execute.assert_called_once_with( "gcr.io/cl2:latest", "/configs", "/reports", - "/home/user/.kube/config", "gcp", True + "/home/user/.kube/config", "gcp", True, False ) @patch("clusterloader2.cri.cri.collect_clusterloader2") diff --git a/modules/python/tests/test_image_pull.py b/modules/python/tests/test_image_pull.py deleted file mode 100644 index 1f55be92a5..0000000000 --- 
a/modules/python/tests/test_image_pull.py +++ /dev/null @@ -1,215 +0,0 @@ -"""Unit tests for image_pull module.""" - -import json -import os -import tempfile -import unittest -from unittest.mock import patch - -from clusterloader2.image_pull.image_pull import ( - execute_clusterloader2, - collect_clusterloader2, - override_config_clusterloader2, - main -) - - -class TestImagePullFunctions(unittest.TestCase): - """Test cases for image_pull execute and collect functions.""" - - def test_override_config_clusterloader2(self): - """Test override_config_clusterloader2 creates correct override file.""" - with tempfile.TemporaryDirectory() as tmpdir: - override_config_clusterloader2(tmpdir, "aks") - - override_file = os.path.join(tmpdir, "overrides.yaml") - self.assertTrue(os.path.exists(override_file)) - - with open(override_file, 'r', encoding='utf-8') as f: - content = f.read() - - self.assertIn("CL2_PROVIDER: aks", content) - self.assertIn("CL2_PROMETHEUS_TOLERATE_MASTER: true", content) - self.assertIn("CL2_PROMETHEUS_NODE_SELECTOR", content) - - @patch('clusterloader2.image_pull.image_pull.run_cl2_command') - def test_execute_clusterloader2(self, mock_run_cl2): - """Test execute_clusterloader2 calls run_cl2_command with correct params.""" - execute_clusterloader2( - cl2_image="ghcr.io/azure/clusterloader2:v20250311", - cl2_config_dir="/tmp/config", - cl2_report_dir="/tmp/report", - kubeconfig="/tmp/kubeconfig", - provider="aks" - ) - - mock_run_cl2.assert_called_once_with( - kubeconfig="/tmp/kubeconfig", - cl2_image="ghcr.io/azure/clusterloader2:v20250311", - cl2_config_dir="/tmp/config", - cl2_report_dir="/tmp/report", - provider="aks", - cl2_config_file="image-pull.yaml", - overrides=True, - enable_prometheus=True, - scrape_kubelets=True, - scrape_containerd=True, - tear_down_prometheus=False - ) - - @patch('clusterloader2.image_pull.image_pull.get_measurement') - @patch('clusterloader2.image_pull.image_pull.parse_xml_to_json') - def test_collect_clusterloader2_success(self, mock_parse_xml, mock_get_measurement): - """Test collect_clusterloader2 with successful test results.""" - # Mock junit.xml parsing - success case - mock_parse_xml.return_value = json.dumps({ - "testsuites": [{"failures": 0, "tests": 1}] - }) - - with tempfile.TemporaryDirectory() as tmpdir: - report_dir = os.path.join(tmpdir, "report") - os.makedirs(report_dir) - - # Create a mock measurement file - measurement_file = os.path.join(report_dir, "ImagePullLatency_test.json") - with open(measurement_file, 'w', encoding='utf-8') as f: - json.dump({ - "dataItems": [ - {"labels": {"node": "node1"}, "data": {"P50": 1.5, "P99": 3.0}} - ] - }, f) - - # Create junit.xml (required by parse_xml_to_json) - junit_file = os.path.join(report_dir, "junit.xml") - with open(junit_file, 'w', encoding='utf-8') as f: - f.write("") - - mock_get_measurement.return_value = ("ImagePullLatency", "test") - - result_file = os.path.join(tmpdir, "results", "output.json") - - collect_clusterloader2( - cl2_report_dir=report_dir, - cloud_info='{"cloud": "azure", "region": "eastus2"}', - run_id="12345", - run_url="https://dev.azure.com/run/12345", - result_file=result_file, - deployment_count=10, - replicas=1 - ) - - # Verify result file was created - self.assertTrue(os.path.exists(result_file)) - - # Verify content - with open(result_file, 'r', encoding='utf-8') as f: - content = f.read() - self.assertIn("ImagePullLatency", content) - self.assertIn("success", content) - self.assertIn("12345", content) - - 
@patch('clusterloader2.image_pull.image_pull.parse_xml_to_json') - def test_collect_clusterloader2_failure(self, mock_parse_xml): - """Test collect_clusterloader2 with failed test results.""" - # Mock junit.xml parsing - failure case - mock_parse_xml.return_value = json.dumps({ - "testsuites": [{"failures": 1, "tests": 1}] - }) - - with tempfile.TemporaryDirectory() as tmpdir: - report_dir = os.path.join(tmpdir, "report") - os.makedirs(report_dir) - - # Create junit.xml - junit_file = os.path.join(report_dir, "junit.xml") - with open(junit_file, 'w', encoding='utf-8') as f: - f.write("") - - result_file = os.path.join(tmpdir, "results", "output.json") - - collect_clusterloader2( - cl2_report_dir=report_dir, - cloud_info='{"cloud": "azure"}', - run_id="12345", - run_url="https://dev.azure.com/run/12345", - result_file=result_file - ) - - # Result file should exist even for failures - self.assertTrue(os.path.exists(result_file)) - - @patch('clusterloader2.image_pull.image_pull.parse_xml_to_json') - def test_collect_clusterloader2_no_testsuites(self, mock_parse_xml): - """Test collect_clusterloader2 raises error when no testsuites found.""" - # Mock junit.xml with empty testsuites - mock_parse_xml.return_value = json.dumps({"testsuites": []}) - - with tempfile.TemporaryDirectory() as tmpdir: - report_dir = os.path.join(tmpdir, "report") - os.makedirs(report_dir) - - junit_file = os.path.join(report_dir, "junit.xml") - with open(junit_file, 'w', encoding='utf-8') as f: - f.write("") - - result_file = os.path.join(tmpdir, "results", "output.json") - - with self.assertRaises(ValueError) as context: - collect_clusterloader2( - cl2_report_dir=report_dir, - cloud_info='{"cloud": "azure"}', - run_id="12345", - run_url="https://dev.azure.com/run/12345", - result_file=result_file - ) - - self.assertIn("No testsuites found", str(context.exception)) - - -class TestImagePullMain(unittest.TestCase): - """Test cases for CLI main function.""" - - @patch('clusterloader2.image_pull.image_pull.execute_clusterloader2') - def test_main_execute_command(self, mock_execute): - """Test main function with execute subcommand.""" - test_args = [ - 'image_pull.py', 'execute', - '--cl2_image', 'ghcr.io/azure/clusterloader2:v20250311', - '--cl2_config_dir', '/tmp/config', - '--cl2_report_dir', '/tmp/report', - '--provider', 'aks' - ] - - with patch('sys.argv', test_args): - main() - - mock_execute.assert_called_once() - call_kwargs = mock_execute.call_args[1] - self.assertEqual(call_kwargs['cl2_image'], 'ghcr.io/azure/clusterloader2:v20250311') - self.assertEqual(call_kwargs['provider'], 'aks') - - @patch('clusterloader2.image_pull.image_pull.collect_clusterloader2') - def test_main_collect_command(self, mock_collect): - """Test main function with collect subcommand.""" - test_args = [ - 'image_pull.py', 'collect', - '--cl2_report_dir', '/tmp/report', - '--cloud_info', '{"cloud": "azure"}', - '--run_id', '12345', - '--run_url', 'https://dev.azure.com/run/12345', - '--result_file', '/tmp/result.json', - '--deployment_count', '10', - '--replicas', '1' - ] - - with patch('sys.argv', test_args): - main() - - mock_collect.assert_called_once() - call_kwargs = mock_collect.call_args[1] - self.assertEqual(call_kwargs['deployment_count'], 10) - self.assertEqual(call_kwargs['replicas'], 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/pipelines/perf-eval/Image Pull Benchmark/image-pull-n10.yml b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml similarity index 77% rename from pipelines/perf-eval/Image Pull 
Benchmark/image-pull-n10.yml
rename to pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml
index 61ec9db132..168dbcf6ac 100644
--- a/pipelines/perf-eval/Image Pull Benchmark/image-pull-n10.yml
+++ b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml
@@ -15,12 +15,12 @@ stages:
             - eastus2
           engine: clusterloader2
           engine_input:
-            image: "ghcr.io/azure/clusterloader2:v20250311"
-          topology: image-pull
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          topology: cri-resource-consume
           matrix:
             image-pull-10pods:
-              deployment_count: 10
-              replicas: 1
+              node_count: 10
+              scrape_containerd: True
           kubernetes_version: "1.33"
           max_parallel: 1
           credential_type: service_connection
diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index 63d55f02d9..168dbcf6ac 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -1,25 +1,28 @@
 trigger: none
 
 variables:
-  SCENARIO_TYPE:
-  SCENARIO_NAME:
+  SCENARIO_TYPE: perf-eval
+  SCENARIO_NAME: image-pull-n10
 
 stages:
-  - stage: # format: <cloud>[_<region>]+ (e.g. azure_eastus2, aws_eastus_westus)
+  - stage: azure_eastus2_image_pull
     dependsOn: []
     jobs:
-      - template: /jobs/competitive-test.yml # must keep as is
+      - template: /jobs/competitive-test.yml
        parameters:
-          cloud: # e.g. azure, aws
-          regions: # list of regions
-            - region1 # e.g. eastus2
-          topology: # e.g. cluster-autoscaler
-          engine: # e.g. clusterloader2
-          matrix: # list of test parameters to customize the provisioned resources
-            <test_name>:
-              <parameter1>: <value1>
-              <parameter2>: <value2>
-          max_parallel: # required
-          credential_type: service_connection # required
+          cloud: azure
+          regions:
+            - eastus2
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          topology: cri-resource-consume
+          matrix:
+            image-pull-10pods:
+              node_count: 10
+              scrape_containerd: True
+          kubernetes_version: "1.33"
+          max_parallel: 1
+          credential_type: service_connection
           ssh_key_enabled: false
-          timeout_in_minutes: 60 # if not specified, default is 60
+          timeout_in_minutes: 60
diff --git a/scenarios/perf-eval/image-pull-n10/README.md b/scenarios/perf-eval/image-pull-n10/README.md
index a71fae787e..6f0f29c2ca 100644
--- a/scenarios/perf-eval/image-pull-n10/README.md
+++ b/scenarios/perf-eval/image-pull-n10/README.md
@@ -2,7 +2,7 @@
 
 ## Overview
 
-Deploys 10 pods with `imagePullPolicy: Always` to trigger image pulls, and collects containerd image pulling throughput (MB/s) and kubelet runtime operation duration metrics using CL2.
+Measures containerd image pulling throughput (MB/s) and network plugin operation metrics using the CRI module with `scrape_containerd: True`. Uses the `cri-resource-consume` topology.
 
 ## Infrastructure
 
@@ -15,24 +15,12 @@ Deploys 10 pods with `imagePullPolicy: Always` to trigger image pulls, and colle
 | Prometheus Pool | 1 x Standard_D8s_v3 |
 | User Pool | 10 x Standard_D4s_v3 |
 
-## Node Pools
+## Metrics Collected
 
-| Pool | Purpose | Node Count | VM Size | Labels |
-|------|---------|------------|---------|--------|
-| default | System/critical addons | 3 | Standard_D4s_v3 | - |
-| prompool | Prometheus monitoring | 1 | Standard_D8s_v3 | `prometheus=true` |
-| userpool | Image pull tests | 10 | Standard_D4s_v3 | `image-pull-test=true` |
-
-## Network Configuration
-
-- VNet: `10.0.0.0/9`
-- Pod CIDR: `10.0.0.0/9`
-- Service CIDR: `192.168.0.0/16`
-- DNS Service IP: `192.168.0.10`
-
-## Usage
-
-Tests are executed on nodes labeled with `image-pull-test=true` in the user pool.
+- `ContainerdCriImagePullingThroughput` - Image pull throughput (MB/s) +- `ContainerdCriNetworkPluginOperations` - Network plugin operation duration +- `ContainerdCriSandboxCreateNetwork` - Sandbox network creation time +- `ContainerdCriSandboxDeleteNetwork` - Sandbox network deletion time ## References diff --git a/steps/engine/clusterloader2/cri/execute.yml b/steps/engine/clusterloader2/cri/execute.yml index 45456ee0b2..fc91138cd8 100644 --- a/steps/engine/clusterloader2/cri/execute.yml +++ b/steps/engine/clusterloader2/cri/execute.yml @@ -24,6 +24,7 @@ steps: --provider $CLOUD \ --os_type ${OS_TYPE:-linux} \ --scrape_kubelets ${SCRAPE_KUBELETS:-False} \ + --scrape_containerd ${SCRAPE_CONTAINERD:-False} \ --host_network ${HOST_NETWORK:-True} \ --cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ @@ -32,7 +33,8 @@ steps: --cl2_report_dir $CL2_REPORT_DIR \ --kubeconfig ${HOME}/.kube/config \ --provider $CLOUD \ - --scrape_kubelets ${SCRAPE_KUBELETS:-False} + --scrape_kubelets ${SCRAPE_KUBELETS:-False} \ + --scrape_containerd ${SCRAPE_CONTAINERD:-False} workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: diff --git a/steps/engine/clusterloader2/image_pull/collect.yml b/steps/engine/clusterloader2/image_pull/collect.yml deleted file mode 100644 index 980ca997c3..0000000000 --- a/steps/engine/clusterloader2/image_pull/collect.yml +++ /dev/null @@ -1,32 +0,0 @@ -parameters: - - name: cloud - type: string - default: "" - - name: engine_input - type: object - default: {} - - name: region - type: string - -steps: - - template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml - parameters: - region: ${{ parameters.region }} - - script: | - set -eo pipefail - - PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ - --cl2_report_dir $CL2_REPORT_DIR \ - --cloud_info "$CLOUD_INFO" \ - --run_id $RUN_ID \ - --run_url $RUN_URL \ - --result_file $TEST_RESULTS_FILE \ - --deployment_count ${DEPLOYMENT_COUNT:-10} \ - --replicas ${REPLICAS:-1} - workingDirectory: modules/python - env: - CLOUD: ${{ parameters.cloud }} - RUN_URL: $(RUN_URL) - PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/image_pull.py - CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/results - displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/image_pull/execute.yml b/steps/engine/clusterloader2/image_pull/execute.yml deleted file mode 100644 index 877a3060be..0000000000 --- a/steps/engine/clusterloader2/image_pull/execute.yml +++ /dev/null @@ -1,52 +0,0 @@ -parameters: - - name: cloud - type: string - default: "" - - name: engine_input - type: object - default: {} - - name: region - type: string - -steps: - - script: | - set -eo pipefail - - PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE override \ - --cl2_config_dir ${CL2_CONFIG_DIR} \ - --provider $CLOUD - workingDirectory: modules/python - env: - ${{ if eq(parameters.cloud, 'azure') }}: - CLOUD: aks - ${{ elseif eq(parameters.cloud, 'aws') }}: - CLOUD: eks - ${{ else }}: - CLOUD: ${{ parameters.cloud }} - PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/image_pull.py - CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/config - displayName: "Override CL2 Config" - - - script: | - set -eo pipefail - - PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ - --cl2_image ${CL2_IMAGE} \ - 
--cl2_config_dir ${CL2_CONFIG_DIR} \ - --cl2_report_dir $CL2_REPORT_DIR \ - --kubeconfig ${HOME}/.kube/config \ - --provider $CLOUD - workingDirectory: modules/python - env: - ${{ if eq(parameters.cloud, 'azure') }}: - CLOUD: aks - ${{ elseif eq(parameters.cloud, 'aws') }}: - CLOUD: eks - ${{ else }}: - CLOUD: ${{ parameters.cloud }} - REGION: ${{ parameters.region }} - PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/image_pull.py - CL2_IMAGE: ${{ parameters.engine_input.image }} - CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/config - CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/image_pull/results - displayName: "Run Image Pull Benchmark" diff --git a/steps/topology/image-pull/collect-clusterloader2.yml b/steps/topology/image-pull/collect-clusterloader2.yml deleted file mode 100644 index 4e1fb387c6..0000000000 --- a/steps/topology/image-pull/collect-clusterloader2.yml +++ /dev/null @@ -1,17 +0,0 @@ -parameters: -- name: cloud - type: string - default: '' -- name: engine_input - type: object - default: {} -- name: regions - type: object - default: {} - -steps: -- template: /steps/engine/clusterloader2/image_pull/collect.yml - parameters: - cloud: ${{ parameters.cloud }} - engine_input: ${{ parameters.engine_input }} - region: ${{ parameters.regions[0] }} diff --git a/steps/topology/image-pull/execute-clusterloader2.yml b/steps/topology/image-pull/execute-clusterloader2.yml deleted file mode 100644 index dfa809aff1..0000000000 --- a/steps/topology/image-pull/execute-clusterloader2.yml +++ /dev/null @@ -1,17 +0,0 @@ -parameters: -- name: cloud - type: string - default: '' -- name: engine_input - type: object - default: {} -- name: regions - type: object - default: {} - -steps: -- template: /steps/engine/clusterloader2/image_pull/execute.yml - parameters: - cloud: ${{ parameters.cloud }} - engine_input: ${{ parameters.engine_input }} - region: ${{ parameters.regions[0] }} diff --git a/steps/topology/image-pull/validate-resources.yml b/steps/topology/image-pull/validate-resources.yml deleted file mode 100644 index 35b420318a..0000000000 --- a/steps/topology/image-pull/validate-resources.yml +++ /dev/null @@ -1,19 +0,0 @@ -parameters: -- name: cloud - type: string -- name: engine - type: string -- name: regions - type: object - -steps: -- template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml - parameters: - role: client - region: ${{ parameters.regions[0] }} -- script: | - set -eo pipefail - echo "Validating cluster resources..." 
- kubectl get nodes -o wide - echo "Cluster is ready for image-pull test" - displayName: "Validate Resources" From 57d043a321859cfb01d3bebaa9580a92b1721375 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 15 Dec 2025 05:00:19 +0000 Subject: [PATCH 11/30] fix: add missing CRI matrix parameters --- pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml | 7 ++++++- pipelines/system/new-pipeline-test.yml | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml index 168dbcf6ac..8187729ca2 100644 --- a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml +++ b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml @@ -20,8 +20,13 @@ stages: matrix: image-pull-10pods: node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: memory scrape_containerd: True - kubernetes_version: "1.33" + scrape_kubelets: False + kubernetes_version: "1.31" max_parallel: 1 credential_type: service_connection ssh_key_enabled: false diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 168dbcf6ac..8187729ca2 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -20,8 +20,13 @@ stages: matrix: image-pull-10pods: node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: memory scrape_containerd: True - kubernetes_version: "1.33" + scrape_kubelets: False + kubernetes_version: "1.31" max_parallel: 1 credential_type: service_connection ssh_key_enabled: false From b106a5bafe3e98e4975720f7356446f55aa0bf33 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 15 Dec 2025 05:05:36 +0000 Subject: [PATCH 12/30] fix: enable scrape_kubelets for kubelet metrics --- pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml | 2 +- pipelines/system/new-pipeline-test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml index 8187729ca2..246d1d7e99 100644 --- a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml +++ b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml @@ -25,7 +25,7 @@ stages: operation_timeout: 3m load_type: memory scrape_containerd: True - scrape_kubelets: False + scrape_kubelets: True kubernetes_version: "1.31" max_parallel: 1 credential_type: service_connection diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 8187729ca2..246d1d7e99 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -25,7 +25,7 @@ stages: operation_timeout: 3m load_type: memory scrape_containerd: True - scrape_kubelets: False + scrape_kubelets: True kubernetes_version: "1.31" max_parallel: 1 credential_type: service_connection From 1cff4492e4a8209c847771eef9c8c7dcb8872a6a Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 15 Dec 2025 05:46:03 +0000 Subject: [PATCH 13/30] fix: use kubernetes_version 1.33 --- pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml | 2 +- pipelines/system/new-pipeline-test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml index 246d1d7e99..773fdfcd60 100644 --- a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml +++ b/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml @@ -26,7 +26,7 @@ stages: load_type: 
memory scrape_containerd: True scrape_kubelets: True - kubernetes_version: "1.31" + kubernetes_version: "1.33" max_parallel: 1 credential_type: service_connection ssh_key_enabled: false diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 246d1d7e99..773fdfcd60 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -26,7 +26,7 @@ stages: load_type: memory scrape_containerd: True scrape_kubelets: True - kubernetes_version: "1.31" + kubernetes_version: "1.33" max_parallel: 1 credential_type: service_connection ssh_key_enabled: false From 31d18c7d63d35b232e599878453e377cb77c6a55 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 15 Dec 2025 05:58:52 +0000 Subject: [PATCH 14/30] fix: use cri-resource-consume label expected by CRI module --- .../perf-eval/image-pull-n10/terraform-inputs/azure.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars index acfc1f2185..f23ade6d84 100644 --- a/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars @@ -57,7 +57,7 @@ aks_config_list = [ auto_scaling_enabled = false vm_size = "Standard_D4s_v3" os_disk_type = "Managed" - node_labels = { "image-pull-test" = "true" } + node_labels = { "cri-resource-consume" = "true" } } ] } From 24157e5e5b0d002ee73056dbe8b990f84744b180 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 16 Dec 2025 06:13:46 +0000 Subject: [PATCH 15/30] Remove irrelevant CNI metrics from CRI containerd measurements, use avg throughput instead of percentiles --- .../cri/config/containerd-measurements.yaml | 46 +------------------ 1 file changed, 2 insertions(+), 44 deletions(-) diff --git a/modules/python/clusterloader2/cri/config/containerd-measurements.yaml b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml index 2c33ffae1f..b0102a3afe 100644 --- a/modules/python/clusterloader2/cri/config/containerd-measurements.yaml +++ b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml @@ -11,49 +11,7 @@ steps: metricVersion: v1 unit: MB/s queries: - - name: Perc99 - query: histogram_quantile(0.99, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc90 - query: histogram_quantile(0.90, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) - - name: Perc50 - query: histogram_quantile(0.50, sum(rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[%v])) by (le)) + - name: Avg + query: sum(rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])) - name: Count query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) - - Identifier: ContainerdCriNetworkPluginOperations - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: ContainerdCriNetworkPluginOperations - metricVersion: v1 - unit: s - dimensions: - - operation_type - queries: - - name: Sum - query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_sum{nodepool=~"userpool.*"}) by (operation_type) - - name: Count - query: sum(containerd_cri_network_plugin_operations_duration_seconds_seconds_count{nodepool=~"userpool.*"}) by (operation_type) - - 
Identifier: ContainerdCriSandboxCreateNetwork - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: ContainerdCriSandboxCreateNetwork - metricVersion: v1 - unit: s - queries: - - name: Sum - query: sum(containerd_cri_sandbox_create_network_seconds_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_sandbox_create_network_seconds_count{nodepool=~"userpool.*"}) - - Identifier: ContainerdCriSandboxDeleteNetwork - Method: GenericPrometheusQuery - Params: - action: {{$action}} - metricName: ContainerdCriSandboxDeleteNetwork - metricVersion: v1 - unit: s - queries: - - name: Sum - query: sum(containerd_cri_sandbox_delete_network_seconds_sum{nodepool=~"userpool.*"}) - - name: Count - query: sum(containerd_cri_sandbox_delete_network_seconds_count{nodepool=~"userpool.*"}) From 612a54794a34a71f3577d41d40a9dfe2d244bccb Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Fri, 19 Dec 2025 01:15:30 +0000 Subject: [PATCH 16/30] Add percentile metrics for image pull throughput --- .../cri/config/containerd-measurements.yaml | 12 ++++++++++++ scenarios/perf-eval/image-pull-n10/README.md | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/modules/python/clusterloader2/cri/config/containerd-measurements.yaml b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml index b0102a3afe..57079ce806 100644 --- a/modules/python/clusterloader2/cri/config/containerd-measurements.yaml +++ b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml @@ -11,7 +11,19 @@ steps: metricVersion: v1 unit: MB/s queries: + # Weighted average throughput per image pull (nodes with more pulls have more weight) - name: Avg query: sum(rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])) + # Unweighted average - each node contributes equally regardless of pull count + - name: AvgPerNode + query: avg(sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v]))) + # Total number of image pulls - name: Count query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) + # Cluster level percentiles - throughput distribution across nodes + - name: Perc50 + query: quantile(0.5, sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v]))) + - name: Perc90 + query: quantile(0.9, sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v]))) + - name: Perc99 + query: quantile(0.99, sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v]))) diff --git a/scenarios/perf-eval/image-pull-n10/README.md b/scenarios/perf-eval/image-pull-n10/README.md index 6f0f29c2ca..cf09991eb0 100644 --- a/scenarios/perf-eval/image-pull-n10/README.md +++ b/scenarios/perf-eval/image-pull-n10/README.md @@ -17,10 +17,18 @@ Measures containerd image pulling throughput (MB/s) and network plugin operation ## Metrics Collected -- `ContainerdCriImagePullingThroughput` - Image pull throughput 
(MB/s) -- `ContainerdCriNetworkPluginOperations` - Network plugin operation duration -- `ContainerdCriSandboxCreateNetwork` - Sandbox network creation time -- `ContainerdCriSandboxDeleteNetwork` - Sandbox network deletion time +### ContainerdCriImagePullingThroughput + +Image pull throughput (MB/s) with the following aggregations: + +| Metric | Description | +|--------|-------------| +| **Avg** | Weighted average throughput per image pull | +| **AvgPerNode** | Unweighted average - each node contributes equally | +| **Count** | Total number of image pulls | +| **Perc50** | 50th percentile (median) throughput across nodes | +| **Perc90** | 90th percentile throughput across nodes | +| **Perc99** | 99th percentile throughput across nodes | ## References From 6d1eaa1ca64605f674a14e1aeab8fda67e694c5a Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Sun, 21 Dec 2025 22:34:48 +0000 Subject: [PATCH 17/30] Revert new-pipeline-test.yml --- pipelines/system/new-pipeline-test.yml | 40 +++++++++++--------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 773fdfcd60..63d55f02d9 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,33 +1,25 @@ trigger: none variables: - SCENARIO_TYPE: perf-eval - SCENARIO_NAME: image-pull-n10 + SCENARIO_TYPE: + SCENARIO_NAME: stages: - - stage: azure_eastus2_image_pull + - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) dependsOn: [] jobs: - - template: /jobs/competitive-test.yml + - template: /jobs/competitive-test.yml # must keep as is parameters: - cloud: azure - regions: - - eastus2 - engine: clusterloader2 - engine_input: - image: "ghcr.io/azure/clusterloader2:v20250513" - topology: cri-resource-consume - matrix: - image-pull-10pods: - node_count: 10 - max_pods: 30 - repeats: 1 - operation_timeout: 3m - load_type: memory - scrape_containerd: True - scrape_kubelets: True - kubernetes_version: "1.33" - max_parallel: 1 - credential_type: service_connection + cloud: # e.g. azure, aws + regions: # list of regions + - region1 # e.g. eastus2 + topology: # e.g. cluster-autoscaler + engine: # e.g. 
clusterloader2 + matrix: # list of test parameters to customize the provisioned resources + : + : + : + max_parallel: # required + credential_type: service_connection # required ssh_key_enabled: false - timeout_in_minutes: 60 + timeout_in_minutes: 60 # if not specified, default is 60 From 860c7e69b4569d79f0e5ebe1885938d9c29f43a3 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 30 Dec 2025 01:25:10 +0000 Subject: [PATCH 18/30] Add scrape_containerd parameter to collect phase for CRI engine --- modules/python/clusterloader2/cri/cri.py | 13 +++++-- modules/python/tests/test_cri.py | 11 +++--- pipelines/system/new-pipeline-test.yml | 40 ++++++++++++--------- steps/engine/clusterloader2/cri/collect.yml | 3 +- 4 files changed, 44 insertions(+), 23 deletions(-) diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index 85b4e725fd..637c49396c 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -142,9 +142,10 @@ def collect_clusterloader2( run_id, run_url, result_file, - scrape_kubelets + scrape_kubelets, + scrape_containerd ): - if scrape_kubelets: + if scrape_kubelets or scrape_containerd: verify_measurement() details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2) @@ -343,6 +344,13 @@ def main(): default=False, help="Whether to scrape kubelets", ) + parser_collect.add_argument( + "--scrape_containerd", + type=str2bool, + choices=[True, False], + default=False, + help="Whether to scrape containerd", + ) args = parser.parse_args() @@ -385,6 +393,7 @@ def main(): args.run_url, args.result_file, args.scrape_kubelets, + args.scrape_containerd, ) if __name__ == "__main__": diff --git a/modules/python/tests/test_cri.py b/modules/python/tests/test_cri.py index 7f3daa22d9..c2d001e98e 100644 --- a/modules/python/tests/test_cri.py +++ b/modules/python/tests/test_cri.py @@ -188,7 +188,8 @@ def test_collect_clusterloader2(self): run_id="12345", run_url="http://example.com", result_file=result_file, - scrape_kubelets=False + scrape_kubelets=False, + scrape_containerd=False ) self.assertTrue(os.path.exists(result_file)) @@ -213,7 +214,8 @@ def test_collect_clusterloader2_no_testsuites(self, mock_parse_xml_to_json): run_id="12345", run_url="http://example.com", result_file="/mock/result.json", - scrape_kubelets=False + scrape_kubelets=False, + scrape_containerd=False ) self.assertIn("No testsuites found in the report", str(context.exception)) @@ -299,13 +301,14 @@ def test_collect_command(self, mock_collect): "--run_id", "run-123", "--run_url", "https://run.url", "--result_file", "/tmp/results.json", - "--scrape_kubelets", "False" + "--scrape_kubelets", "False", + "--scrape_containerd", "False" ] with patch.object(sys, 'argv', test_args): main() mock_collect.assert_called_once_with( 3, 100, 5, "memory", "/reports", "gcp-zone", "run-123", - "https://run.url", "/tmp/results.json", False + "https://run.url", "/tmp/results.json", False, False ) if __name__ == '__main__': diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..5f22b06901 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,33 @@ trigger: none variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: image-pull-n10 stages: - - stage: # format: [_]+ (e.g. 
azure_eastus2, aws_eastus_westus) + - stage: azure_eastus2_image_pull_test dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + topology: cri-resource-consume + matrix: + image-pull-10pods: + node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: memory + scrape_containerd: True + scrape_kubelets: True + kubernetes_version: "1.33" + max_parallel: 1 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + timeout_in_minutes: 60 diff --git a/steps/engine/clusterloader2/cri/collect.yml b/steps/engine/clusterloader2/cri/collect.yml index 19e47820ea..528995e3e9 100644 --- a/steps/engine/clusterloader2/cri/collect.yml +++ b/steps/engine/clusterloader2/cri/collect.yml @@ -25,7 +25,8 @@ steps: --run_id $RUN_ID \ --run_url $RUN_URL \ --result_file $TEST_RESULTS_FILE \ - --scrape_kubelets ${SCRAPE_KUBELETS:-False} + --scrape_kubelets ${SCRAPE_KUBELETS:-False} \ + --scrape_containerd ${SCRAPE_CONTAINERD:-False} workingDirectory: modules/python env: CLOUD: ${{ parameters.cloud }} From 52463abe73d3fd0e395e25b56b91d36814e24ac0 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 30 Dec 2025 05:34:43 +0000 Subject: [PATCH 19/30] Add CONTAINERD_SCRAPE_INTERVAL: 30s for CRI scenario --- modules/python/clusterloader2/cri/cri.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index 637c49396c..4f9dcb17cb 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -91,6 +91,8 @@ def override_config_clusterloader2( file.write(f"CL2_OS_TYPE: {os_type}\n") file.write(f"CL2_SCRAPE_KUBELETS: {str(scrape_kubelets).lower()}\n") file.write(f"CL2_SCRAPE_CONTAINERD: {str(scrape_containerd).lower()}\n") + if scrape_containerd: + file.write("CONTAINERD_SCRAPE_INTERVAL: 30s\n") file.write(f"CL2_HOST_NETWORK: {str(host_network).lower()}\n") file.close() From 7b297212d6b280740f0b2b0a0cf2c68009b17209 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 5 Jan 2026 03:30:27 +0000 Subject: [PATCH 20/30] Remove scrape_containerd from collect phase as it has no effect --- modules/python/clusterloader2/cri/cri.py | 13 ++----------- modules/python/tests/test_cri.py | 11 ++++------- steps/engine/clusterloader2/cri/collect.yml | 3 +-- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index 4f9dcb17cb..12d89b23a5 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -144,10 +144,9 @@ def collect_clusterloader2( run_id, run_url, result_file, - scrape_kubelets, - scrape_containerd + scrape_kubelets ): - if scrape_kubelets or scrape_containerd: + if scrape_kubelets: verify_measurement() details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2) @@ -346,13 +345,6 @@ def main(): default=False, help="Whether to scrape kubelets", ) - 
parser_collect.add_argument( - "--scrape_containerd", - type=str2bool, - choices=[True, False], - default=False, - help="Whether to scrape containerd", - ) args = parser.parse_args() @@ -395,7 +387,6 @@ def main(): args.run_url, args.result_file, args.scrape_kubelets, - args.scrape_containerd, ) if __name__ == "__main__": diff --git a/modules/python/tests/test_cri.py b/modules/python/tests/test_cri.py index c2d001e98e..7f3daa22d9 100644 --- a/modules/python/tests/test_cri.py +++ b/modules/python/tests/test_cri.py @@ -188,8 +188,7 @@ def test_collect_clusterloader2(self): run_id="12345", run_url="http://example.com", result_file=result_file, - scrape_kubelets=False, - scrape_containerd=False + scrape_kubelets=False ) self.assertTrue(os.path.exists(result_file)) @@ -214,8 +213,7 @@ def test_collect_clusterloader2_no_testsuites(self, mock_parse_xml_to_json): run_id="12345", run_url="http://example.com", result_file="/mock/result.json", - scrape_kubelets=False, - scrape_containerd=False + scrape_kubelets=False ) self.assertIn("No testsuites found in the report", str(context.exception)) @@ -301,14 +299,13 @@ def test_collect_command(self, mock_collect): "--run_id", "run-123", "--run_url", "https://run.url", "--result_file", "/tmp/results.json", - "--scrape_kubelets", "False", - "--scrape_containerd", "False" + "--scrape_kubelets", "False" ] with patch.object(sys, 'argv', test_args): main() mock_collect.assert_called_once_with( 3, 100, 5, "memory", "/reports", "gcp-zone", "run-123", - "https://run.url", "/tmp/results.json", False, False + "https://run.url", "/tmp/results.json", False ) if __name__ == '__main__': diff --git a/steps/engine/clusterloader2/cri/collect.yml b/steps/engine/clusterloader2/cri/collect.yml index 528995e3e9..19e47820ea 100644 --- a/steps/engine/clusterloader2/cri/collect.yml +++ b/steps/engine/clusterloader2/cri/collect.yml @@ -25,8 +25,7 @@ steps: --run_id $RUN_ID \ --run_url $RUN_URL \ --result_file $TEST_RESULTS_FILE \ - --scrape_kubelets ${SCRAPE_KUBELETS:-False} \ - --scrape_containerd ${SCRAPE_CONTAINERD:-False} + --scrape_kubelets ${SCRAPE_KUBELETS:-False} workingDirectory: modules/python env: CLOUD: ${{ parameters.cloud }} From 4fb92058fb1327de7f52b0e76d1f40fee4f7ee16 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 6 Jan 2026 00:55:52 +0000 Subject: [PATCH 21/30] Add comment explaining why containerd metrics are not in verify_measurement --- modules/python/clusterloader2/cri/cri.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py index 12d89b23a5..31712f033d 100644 --- a/modules/python/clusterloader2/cri/cri.py +++ b/modules/python/clusterloader2/cri/cri.py @@ -101,6 +101,8 @@ def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, overrides=True, enable_prometheus=True, tear_down_prometheus=False, scrape_kubelets=scrape_kubelets, scrape_containerd=scrape_containerd) +# Note: verify_measurement only checks kubelet metrics (accessible via node proxy endpoint). +# Containerd metrics are only available via Prometheus and cannot be verified here. 
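+# For a quick manual spot check, the kubelet metrics that verify_measurement reads
+# are reachable through the API server proxy, for example:
+#   kubectl get --raw "/api/v1/nodes/<node-name>/proxy/metrics"
+# (illustrative command; substitute any node from the labeled user pool)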
def verify_measurement(): client = KubernetesClient(os.path.expanduser("~/.kube/config")) nodes = client.get_nodes(label_selector="cri-resource-consume=true") From e267e0e9459823b6cc9f5670fb385f9677a73f64 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 6 Jan 2026 03:09:21 +0000 Subject: [PATCH 22/30] Add test workload details to image-pull-n10 README --- scenarios/perf-eval/image-pull-n10/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scenarios/perf-eval/image-pull-n10/README.md b/scenarios/perf-eval/image-pull-n10/README.md index cf09991eb0..67dc068796 100644 --- a/scenarios/perf-eval/image-pull-n10/README.md +++ b/scenarios/perf-eval/image-pull-n10/README.md @@ -15,6 +15,14 @@ Measures containerd image pulling throughput (MB/s) and network plugin operation | Prometheus Pool | 1 x Standard_D8s_v3 | | User Pool | 10 x Standard_D4s_v3 | +## Test Workload + +| Component | Value | +|-----------|-------| +| Registry | Azure Container Registry (`akscritelescope.azurecr.io`) | +| Image | `e2e-test-images/resource-consumer:1.13` | +| Image Size | ~50MB | + ## Metrics Collected ### ContainerdCriImagePullingThroughput From 64fdd4ed03cfbfcc9f523fab4b3c243f0a81f1b0 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 6 Jan 2026 04:05:46 +0000 Subject: [PATCH 23/30] Move pipeline to ACR Benchmark folder and revert new-pipeline-test.yml --- .../image-pull-n10.yml | 2 +- pipelines/system/new-pipeline-test.yml | 40 ++++++++----------- 2 files changed, 17 insertions(+), 25 deletions(-) rename pipelines/perf-eval/{CRI Benchmark => ACR Benchmark}/image-pull-n10.yml (95%) diff --git a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml similarity index 95% rename from pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml rename to pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml index 773fdfcd60..93b9c02436 100644 --- a/pipelines/perf-eval/CRI Benchmark/image-pull-n10.yml +++ b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml @@ -26,7 +26,7 @@ stages: load_type: memory scrape_containerd: True scrape_kubelets: True - kubernetes_version: "1.33" + kubernetes_version: "1.34" max_parallel: 1 credential_type: service_connection ssh_key_enabled: false diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 5f22b06901..63d55f02d9 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,33 +1,25 @@ trigger: none variables: - SCENARIO_TYPE: perf-eval - SCENARIO_NAME: image-pull-n10 + SCENARIO_TYPE: + SCENARIO_NAME: stages: - - stage: azure_eastus2_image_pull_test + - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) dependsOn: [] jobs: - - template: /jobs/competitive-test.yml + - template: /jobs/competitive-test.yml # must keep as is parameters: - cloud: azure - regions: - - eastus2 - engine: clusterloader2 - engine_input: - image: "ghcr.io/azure/clusterloader2:v20250513" - topology: cri-resource-consume - matrix: - image-pull-10pods: - node_count: 10 - max_pods: 30 - repeats: 1 - operation_timeout: 3m - load_type: memory - scrape_containerd: True - scrape_kubelets: True - kubernetes_version: "1.33" - max_parallel: 1 - credential_type: service_connection + cloud: # e.g. azure, aws + regions: # list of regions + - region1 # e.g. eastus2 + topology: # e.g. cluster-autoscaler + engine: # e.g. 
clusterloader2 + matrix: # list of test parameters to customize the provisioned resources + : + : + : + max_parallel: # required + credential_type: service_connection # required ssh_key_enabled: false - timeout_in_minutes: 60 + timeout_in_minutes: 60 # if not specified, default is 60 From 0ee61fb7734e54923a6bad2259d668fb6d2997f9 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 6 Jan 2026 07:23:09 +0000 Subject: [PATCH 24/30] Add 20s pod_startup_latency_threshold --- .../ACR Benchmark/image-pull-n10.yml | 1 + pipelines/system/new-pipeline-test.yml | 41 +++++++++++-------- scenarios/perf-eval/image-pull-n10/README.md | 37 +++++++++++++++++ 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml index 93b9c02436..67e20fa692 100644 --- a/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml +++ b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml @@ -27,6 +27,7 @@ stages: scrape_containerd: True scrape_kubelets: True kubernetes_version: "1.34" + pod_startup_latency_threshold: 20s max_parallel: 1 credential_type: service_connection ssh_key_enabled: false diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..67e20fa692 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,34 @@ trigger: none variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: image-pull-n10 stages: - - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) + - stage: azure_eastus2_image_pull dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + topology: cri-resource-consume + matrix: + image-pull-10pods: + node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: memory + scrape_containerd: True + scrape_kubelets: True + kubernetes_version: "1.34" + pod_startup_latency_threshold: 20s + max_parallel: 1 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + timeout_in_minutes: 60 diff --git a/scenarios/perf-eval/image-pull-n10/README.md b/scenarios/perf-eval/image-pull-n10/README.md index 67dc068796..5b9df159e6 100644 --- a/scenarios/perf-eval/image-pull-n10/README.md +++ b/scenarios/perf-eval/image-pull-n10/README.md @@ -38,6 +38,43 @@ Image pull throughput (MB/s) with the following aggregations: | **Perc90** | 90th percentile throughput across nodes | | **Perc99** | 99th percentile throughput across nodes | +## Known Limitations + +### Cannot Use histogram_quantile() Per Node + +Using Prometheus `histogram_quantile()` on per-node throughput data always returns `10` (the maximum bucket boundary) regardless of actual throughput values. 
This happens because:
+
+- The histogram has fixed bucket boundaries: `0.5, 1, 2, 4, 6, 8, 10` MB/s
+- When actual throughput exceeds 10 MB/s, all samples fall into the `+Inf` bucket
+- `histogram_quantile()` can only interpolate within defined buckets, so it caps at `10`
+
+**Current Approach**: Instead of `histogram_quantile()` per node, we use a weighted average (`_sum / _count`) per node, then compute percentiles across the node averages (see the query sketch at the end of this section).
+
+### Per-Node Metrics May Return "no samples"
+
+The per-node metrics (`AvgPerNode`, `Perc50`, `Perc90`, `Perc99`) may return "no samples" while aggregate metrics (`Avg`, `Count`) work correctly. This is caused by the Prometheus `rate()` function requiring **at least 2 data points** within the query window.
+
+**Root Cause**: If image pulls complete faster than the Prometheus scrape interval (default 15s), only one data point is collected per pull operation. The `rate()` function cannot compute a rate from a single sample, resulting in empty per-node results.
+
+**Why Aggregate Metrics Work**: `Avg` and `Count` use `sum()`, which aggregates samples across all pods/nodes before applying `rate()`, accumulating enough data points within the window.
+
+**Workaround Options**:
+- Increase scrape frequency (may impact cluster performance)
+- Use larger images that take longer to pull
+- Rely on aggregate metrics (`Avg`, `Count`) for throughput analysis
+
+### Metric Includes Unpack Time
+
+The `containerd_cri_image_pulling_throughput` metric measures **total image size divided by total pull time**, which includes both:
+- Image layer download time
+- Image layer decompression/unpack time
+
+This is not a pure network throughput metric. See [containerd source](https://github.com/containerd/containerd/blob/main/internal/cri/server/images/image_pull.go).
+
+### verify_measurement() Cannot Check Containerd Metrics
+
+The CRI module's `verify_measurement()` function only validates kubelet metrics (accessible via the Kubernetes node proxy endpoint at `/api/v1/nodes/{node}/proxy/metrics`). Containerd metrics are only available through the Prometheus server and cannot be verified through this endpoint.
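+
+As a concrete sketch of the `histogram_quantile()` limitation above, compare the capped query with the per-node weighted-average pattern used in `containerd-measurements.yaml` (illustrative only: a fixed `5m` window stands in for the templated `%v`, and the `userpool` selector matches the queries above):
+
+```promql
+# Caps at 10 MB/s: any sample above the largest finite bucket lands in +Inf
+histogram_quantile(0.90, sum by (le) (rate(containerd_cri_image_pulling_throughput_bucket{nodepool=~"userpool.*"}[5m])))
+
+# Used instead: weighted average per node, then a quantile across node averages
+quantile(0.90,
+    sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[5m]))
+  / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[5m]))
+)
+```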
+ ## References - [Best Practices](../../../docs/best-practices.md) From a4a5f7ad7b709a51b2eba3477e9c25a82c64698d Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 6 Jan 2026 23:10:19 +0000 Subject: [PATCH 25/30] test with scrape_containerd=False --- .../clusterloader2/cri/config/containerd-measurements.yaml | 2 +- pipelines/system/new-pipeline-test.yml | 2 +- .../perf-eval/image-pull-n10/terraform-inputs/azure.tfvars | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/python/clusterloader2/cri/config/containerd-measurements.yaml b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml index 57079ce806..c64324c525 100644 --- a/modules/python/clusterloader2/cri/config/containerd-measurements.yaml +++ b/modules/python/clusterloader2/cri/config/containerd-measurements.yaml @@ -17,7 +17,7 @@ steps: # Unweighted average - each node contributes equally regardless of pull count - name: AvgPerNode query: avg(sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v]))) - # Total number of image pulls + # Number of successful image pull observations - name: Count query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}) # Cluster level percentiles - throughput distribution across nodes diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 67e20fa692..e961f920ea 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -24,7 +24,7 @@ stages: repeats: 1 operation_timeout: 3m load_type: memory - scrape_containerd: True + scrape_containerd: False scrape_kubelets: True kubernetes_version: "1.34" pod_startup_latency_threshold: 20s diff --git a/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars index f23ade6d84..6e5d65fac4 100644 --- a/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars @@ -1,6 +1,6 @@ scenario_type = "perf-eval" scenario_name = "image-pull-n10" -deletion_delay = "2h" +deletion_delay = "1h" owner = "acr" network_config_list = [ From 12fc728036011e50ed4deda35ff8f10db2862822 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 6 Jan 2026 23:59:02 +0000 Subject: [PATCH 26/30] Revert new-pipeline-test.yml to template --- pipelines/system/new-pipeline-test.yml | 41 ++++++++++---------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index e961f920ea..63d55f02d9 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,34 +1,25 @@ trigger: none variables: - SCENARIO_TYPE: perf-eval - SCENARIO_NAME: image-pull-n10 + SCENARIO_TYPE: + SCENARIO_NAME: stages: - - stage: azure_eastus2_image_pull + - stage: # format: [_]+ (e.g. 
azure_eastus2, aws_eastus_westus)
     dependsOn: []
     jobs:
-      - template: /jobs/competitive-test.yml
+      - template: /jobs/competitive-test.yml # must keep as is
         parameters:
-          cloud: azure
-          regions:
-            - eastus2
-          engine: clusterloader2
-          engine_input:
-            image: "ghcr.io/azure/clusterloader2:v20250513"
-            topology: cri-resource-consume
-          matrix:
-            image-pull-10pods:
-              node_count: 10
-              max_pods: 30
-              repeats: 1
-              operation_timeout: 3m
-              load_type: memory
-              scrape_containerd: False
-              scrape_kubelets: True
-              kubernetes_version: "1.34"
-              pod_startup_latency_threshold: 20s
-          max_parallel: 1
-          credential_type: service_connection
+          cloud: # e.g. azure, aws
+          regions: # list of regions
+            - region1 # e.g. eastus2
+          topology: # e.g. cluster-autoscaler
+          engine: # e.g. clusterloader2
+          matrix: # list of test parameters to customize the provisioned resources
+            :
+              :
+              :
+          max_parallel: # required
+          credential_type: service_connection # required
           ssh_key_enabled: false
-          timeout_in_minutes: 60
+          timeout_in_minutes: 60 # if not specified, default is 60

From 8d285c579b28543711f3e5853c760640a085b2ff Mon Sep 17 00:00:00 2001
From: Jasmine Tang
Date: Wed, 7 Jan 2026 04:04:14 +0000
Subject: [PATCH 27/30] feat: add 4-hour schedule trigger to image-pull-n10 pipeline

---
 pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml
index 67e20fa692..b4c97f0813 100644
--- a/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml
+++ b/pipelines/perf-eval/ACR Benchmark/image-pull-n10.yml
@@ -1,4 +1,11 @@
 trigger: none
+schedules:
+  - cron: "0 */4 * * *"
+    displayName: "Every 4 Hours"
+    branches:
+      include:
+        - main
+    always: true

 variables:
   SCENARIO_TYPE: perf-eval

From 78c933879b84f39e2b48c569889246c9339cbf98 Mon Sep 17 00:00:00 2001
From: Jasmine Tang
Date: Wed, 7 Jan 2026 05:56:24 +0000
Subject: [PATCH 28/30] Modify containerd scrape interval and node pool VM sizes

---
 modules/python/clusterloader2/cri/cri.py                   | 2 +-
 .../perf-eval/image-pull-n10/terraform-inputs/azure.tfvars | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py
index 31712f033d..fc7f322b48 100644
--- a/modules/python/clusterloader2/cri/cri.py
+++ b/modules/python/clusterloader2/cri/cri.py
@@ -92,7 +92,7 @@ def override_config_clusterloader2(
         file.write(f"CL2_SCRAPE_KUBELETS: {str(scrape_kubelets).lower()}\n")
         file.write(f"CL2_SCRAPE_CONTAINERD: {str(scrape_containerd).lower()}\n")
         if scrape_containerd:
-            file.write("CONTAINERD_SCRAPE_INTERVAL: 30s\n")
+            file.write("CONTAINERD_SCRAPE_INTERVAL: 15s\n")
         file.write(f"CL2_HOST_NETWORK: {str(host_network).lower()}\n")
     file.close()

diff --git a/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars
index 6e5d65fac4..e4699c3d8f 100644
--- a/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars
+++ b/scenarios/perf-eval/image-pull-n10/terraform-inputs/azure.tfvars
@@ -37,7 +37,7 @@ aks_config_list = [
     default_node_pool = {
       name = "default"
       node_count = 3
-      vm_size = "Standard_D4s_v3"
+      vm_size = "Standard_D4ds_v5"
       os_disk_type = "Managed"
       only_critical_addons_enabled = true
       temporary_name_for_rotation = "defaulttmp"
@@ -47,7 +47,7 @@ aks_config_list = [
       name = "prompool"
       node_count = 1
       auto_scaling_enabled = false
-      vm_size = "Standard_D8s_v3"
+      vm_size = "Standard_D4ds_v5"
       os_disk_type = "Managed"
node_labels = { "prometheus" = "true" } }, @@ -55,7 +55,7 @@ aks_config_list = [ name = "userpool" node_count = 10 auto_scaling_enabled = false - vm_size = "Standard_D4s_v3" + vm_size = "Standard_D4ds_v5" os_disk_type = "Managed" node_labels = { "cri-resource-consume" = "true" } } From 2b4af6ef49dc9720e2648162e15bed2b0f275e4e Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 12 Jan 2026 00:21:23 +0000 Subject: [PATCH 29/30] Test image-pull-n10 scenario in new-pipeline-test --- pipelines/system/new-pipeline-test.yml | 41 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..67e20fa692 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,34 @@ trigger: none variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: image-pull-n10 stages: - - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) + - stage: azure_eastus2_image_pull dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + topology: cri-resource-consume + matrix: + image-pull-10pods: + node_count: 10 + max_pods: 30 + repeats: 1 + operation_timeout: 3m + load_type: memory + scrape_containerd: True + scrape_kubelets: True + kubernetes_version: "1.34" + pod_startup_latency_threshold: 20s + max_parallel: 1 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + timeout_in_minutes: 60 From 87f847af3b1850ed3e54b4b86af42165995ae337 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 12 Jan 2026 00:51:39 +0000 Subject: [PATCH 30/30] Revert new-pipeline-test.yml to template --- pipelines/system/new-pipeline-test.yml | 41 ++++++++++---------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 67e20fa692..63d55f02d9 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,34 +1,25 @@ trigger: none variables: - SCENARIO_TYPE: perf-eval - SCENARIO_NAME: image-pull-n10 + SCENARIO_TYPE: + SCENARIO_NAME: stages: - - stage: azure_eastus2_image_pull + - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) dependsOn: [] jobs: - - template: /jobs/competitive-test.yml + - template: /jobs/competitive-test.yml # must keep as is parameters: - cloud: azure - regions: - - eastus2 - engine: clusterloader2 - engine_input: - image: "ghcr.io/azure/clusterloader2:v20250513" - topology: cri-resource-consume - matrix: - image-pull-10pods: - node_count: 10 - max_pods: 30 - repeats: 1 - operation_timeout: 3m - load_type: memory - scrape_containerd: True - scrape_kubelets: True - kubernetes_version: "1.34" - pod_startup_latency_threshold: 20s - max_parallel: 1 - credential_type: service_connection + cloud: # e.g. azure, aws + regions: # list of regions + - region1 # e.g. eastus2 + topology: # e.g. 
cluster-autoscaler + engine: # e.g. clusterloader2 + matrix: # list of test parameters to customize the provisioned resources + : + : + : + max_parallel: # required + credential_type: service_connection # required ssh_key_enabled: false - timeout_in_minutes: 60 + timeout_in_minutes: 60 # if not specified, default is 60