Azure · jasminetMSFT · Jan 12, 2026 · Dec 8, 2025 · Dec 10, 2025 · Dec 10, 2025
@@ -21,6 +21,7 @@ name: resource-consumer
 {{$registryEndpoint := DefaultParam .CL2_REGISTRY_ENDPOINT "akscritelescope.azure.io" }}
 {{$osType := DefaultParam .CL2_OS_TYPE "linux"}}
 {{$scrapeKubelets := DefaultParam .CL2_SCRAPE_KUBELETS false}}
+{{$scrapeContainerd := DefaultParam .CL2_SCRAPE_CONTAINERD false}}
 {{$hostNetwork := DefaultParam .CL2_HOST_NETWORK "true"}}
 
 namespace:
@@ -68,6 +69,13 @@ steps:
         action: start
   {{end}}
 
+  {{if $scrapeContainerd}}
+  - module:
+      path: /containerd-measurements.yaml
+      params:
+        action: start
+  {{end}}
+
   {{range $j := Loop $steps}}
   - name: Create deployment {{$j}}
     phases:
@@ -143,6 +151,13 @@ steps:
         action: gather
   {{end}}
 
+  {{if $scrapeContainerd}}
+  - module:
+      path: /containerd-measurements.yaml
+      params:
+        action: gather
+  {{end}}
+
   {{range $j := Loop $steps}}
   - name: Deleting deployments {{$j}}
     phases:

@@ -0,0 +1,29 @@
+{{$action := .action}} # start, gather
+
+steps:
+  - name: {{$action}} Containerd Measurements
+    measurements:
+    - Identifier: ContainerdCriImagePullingThroughput
+      Method: GenericPrometheusQuery
+      Params:
+        action: {{$action}}
+        metricName: ContainerdCriImagePullingThroughput
+        metricVersion: v1
+        unit: MB/s
+        queries:
+        # Weighted average throughput per image pull (nodes with more pulls have more weight)
+        - name: Avg
+          query: sum(rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum(rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v]))
+        # Unweighted average - each node contributes equally regardless of pull count
+        - name: AvgPerNode
+          query: avg(sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])))
+        # Number of successful image pull observations
+        - name: Count
+          query: sum(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"})
+        # Cluster level percentiles - throughput distribution across nodes
+        - name: Perc50
+          query: quantile(0.5, sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])))
+        - name: Perc90
+          query: quantile(0.9, sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])))
+        - name: Perc99
+          query: quantile(0.99, sum by (instance) (rate(containerd_cri_image_pulling_throughput_sum{nodepool=~"userpool.*"}[%v])) / sum by (instance) (rate(containerd_cri_image_pulling_throughput_count{nodepool=~"userpool.*"}[%v])))
@@ -17,7 +17,7 @@
 def override_config_clusterloader2(
     node_count, node_per_step, max_pods, repeats, operation_timeout,
     load_type, scale_enabled, pod_startup_latency_threshold, provider,
-    registry_endpoint, os_type, scrape_kubelets, host_network, override_file):
+    registry_endpoint, os_type, scrape_kubelets, scrape_containerd, host_network, override_file):
     client = KubernetesClient(os.path.expanduser("~/.kube/config"))
     nodes = client.get_nodes(label_selector="cri-resource-consume=true")
     if len(nodes) == 0:
@@ -91,14 +91,19 @@ def override_config_clusterloader2(
         file.write(f"CL2_REGISTRY_ENDPOINT: {registry_endpoint}\n")
         file.write(f"CL2_OS_TYPE: {os_type}\n")
         file.write(f"CL2_SCRAPE_KUBELETS: {str(scrape_kubelets).lower()}\n")
+        file.write(f"CL2_SCRAPE_CONTAINERD: {str(scrape_containerd).lower()}\n")
+        if scrape_containerd:
+            file.write("CONTAINERD_SCRAPE_INTERVAL: 15s\n")
         file.write(f"CL2_HOST_NETWORK: {str(host_network).lower()}\n")
 
     file.close()
 
-def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider, scrape_kubelets):
+def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider, scrape_kubelets, scrape_containerd):
     run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, overrides=True, enable_prometheus=True,
-                    tear_down_prometheus=False, scrape_kubelets=scrape_kubelets)
+                    tear_down_prometheus=False, scrape_kubelets=scrape_kubelets, scrape_containerd=scrape_containerd)
 
+# Note: verify_measurement only checks kubelet metrics (accessible via node proxy endpoint).
+# Containerd metrics are only available via Prometheus and cannot be verified here.
 def verify_measurement():
     client = KubernetesClient(os.path.expanduser("~/.kube/config"))
     nodes = client.get_nodes(label_selector="cri-resource-consume=true")
@@ -266,6 +271,13 @@ def main():
         default=False,
         help="Whether to scrape kubelets",
     )
+    parser_override.add_argument(
+        "--scrape_containerd",
+        type=str2bool,
+        choices=[True, False],
+        default=False,
+        help="Whether to scrape containerd",
+    )
     parser_override.add_argument(
         "--host_network",
         type=str2bool,
@@ -302,6 +314,13 @@ def main():
         default=False,
         help="Whether to scrape kubelets",
     )
+    parser_execute.add_argument(
+        "--scrape_containerd",
+        type=str2bool,
+        choices=[True, False],
+        default=False,
+        help="Whether to scrape containerd",
+    )
 
     # Sub-command for collect_clusterloader2
     parser_collect = subparsers.add_parser(
@@ -366,6 +385,7 @@ def main():
             args.registry_endpoint,
             args.os_type,
             args.scrape_kubelets,
+            args.scrape_containerd,
             args.host_network,
             args.cl2_override_file,
         )
@@ -377,6 +397,7 @@ def main():
             args.kubeconfig,
             args.provider,
             args.scrape_kubelets,
+            args.scrape_containerd,
         )
     elif args.command == "collect":
         collect_clusterloader2(

@@ -64,6 +64,7 @@ def test_override_config_clusterloader2(self, mock_kubernetes_client, mock_open)
             os_type="linux",
             scrape_kubelets=True,
             host_network=True,
+            scrape_containerd=False,
             override_file="/mock/override.yaml"
         )
 
@@ -120,6 +121,7 @@ def test_override_config_clusterloader2_host_network_false(self, mock_kubernetes
             os_type="linux",
             scrape_kubelets=False,
             host_network=False,
+            scrape_containerd=False,
             override_file="/mock/override.yaml"
         )
 
@@ -138,13 +140,14 @@ def test_execute_clusterloader2(self, mock_run_cl2_command):
             cl2_report_dir="/mock/report",
             kubeconfig="/mock/kubeconfig",
             provider="aks",
-            scrape_kubelets=True
+            scrape_kubelets=True,
+            scrape_containerd=False
         )
 
         # Verify the command execution
         mock_run_cl2_command.assert_called_once_with(
             "/mock/kubeconfig", "mock-image", "/mock/config", "/mock/report", "aks",
-            overrides=True, enable_prometheus=True, tear_down_prometheus=False, scrape_kubelets=True
+            overrides=True, enable_prometheus=True, tear_down_prometheus=False, scrape_kubelets=True, scrape_containerd=False
         )
 
     @patch('clusterloader2.cri.cri.KubernetesClient')
@@ -235,12 +238,13 @@ def test_override_command(self, mock_override):
             "--os_type", "linux", 
             "--scrape_kubelets", "False", 
             "--host_network", "False",
+            "--scrape_containerd", "False",
             "--cl2_override_file", "/tmp/override.yaml"
         ]
         with patch.object(sys, 'argv', test_args):
             main()
             mock_override.assert_called_once_with(
-                5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "linux", False, False, "/tmp/override.yaml"
+                5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "linux", False, False, False, "/tmp/override.yaml"
             )
 
     @patch("clusterloader2.cri.cri.override_config_clusterloader2")
@@ -265,7 +269,7 @@ def test_override_command_default_host_network(self, mock_override):
         with patch.object(sys, 'argv', test_args):
             main()
             mock_override.assert_called_once_with(
-                5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "linux", False, True, "/tmp/override.yaml"
+                5, 1, 110, 3, "2m", "cpu", True, "10s", "aws", "test registry endpoint", "linux", False, False, True, "/tmp/override.yaml"
             )
 
     @patch("clusterloader2.cri.cri.execute_clusterloader2")
@@ -277,13 +281,14 @@ def test_execute_command(self, mock_execute):
             "--cl2_report_dir", "/reports",
             "--kubeconfig", "/home/user/.kube/config", 
             "--provider", "gcp", 
-            "--scrape_kubelets", "True"
+            "--scrape_kubelets", "True",
+            "--scrape_containerd", "False"
         ]
         with patch.object(sys, 'argv', test_args):
             main()
             mock_execute.assert_called_once_with(
                 "gcr.io/cl2:latest", "/configs", "/reports",
-                "/home/user/.kube/config", "gcp", True
+                "/home/user/.kube/config", "gcp", True, False
             )
 
     @patch("clusterloader2.cri.cri.collect_clusterloader2")

@@ -0,0 +1,41 @@
+trigger: none
+schedules:
+  - cron: "0 */4 * * *"
+    displayName: "Every 4 Hour"
+    branches:
+      include:
+        - main
+    always: true
+
+variables:
+  SCENARIO_TYPE: perf-eval
+  SCENARIO_NAME: image-pull-n10
+
+stages:
+  - stage: azure_eastus2_image_pull
+    dependsOn: []
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - eastus2
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20250513"
+          topology: cri-resource-consume
+          matrix:
+            image-pull-10pods:
+              node_count: 10
+              max_pods: 30
+              repeats: 1
+              operation_timeout: 3m
+              load_type: memory
+              scrape_containerd: True
+              scrape_kubelets: True
+              kubernetes_version: "1.34"
+              pod_startup_latency_threshold: 20s
+          max_parallel: 1
+          credential_type: service_connection
+          ssh_key_enabled: false
+          timeout_in_minutes: 60
@@ -0,0 +1,81 @@
+# image-pull-n10
+
+## Overview
+
+Measures containerd image pulling throughput (MB/s) and network plugin operation metrics using the CRI module with `scrape_containerd: True`. Uses the `cri-resource-consume` topology.
+
+## Infrastructure
+
+| Component | Configuration |
+|-----------|---------------|
+| Cloud Provider | Azure |
+| Cluster SKU | Standard |
+| Network Plugin | Azure CNI Overlay |
+| Default Node Pool | 3 x Standard_D4s_v3 |
+| Prometheus Pool | 1 x Standard_D8s_v3 |
+| User Pool | 10 x Standard_D4s_v3 |
+
+## Test Workload
+
+| Component | Value |
+|-----------|-------|
+| Registry | Azure Container Registry (`akscritelescope.azurecr.io`) |
+| Image | `e2e-test-images/resource-consumer:1.13` |
+| Image Size | ~50MB |
+
+## Metrics Collected
+
+### ContainerdCriImagePullingThroughput
+
+Image pull throughput (MB/s) with the following aggregations:
+
+| Metric | Description |
+|--------|-------------|
+| **Avg** | Weighted average throughput per image pull |
+| **AvgPerNode** | Unweighted average - each node contributes equally |
+| **Count** | Total number of image pulls |
+| **Perc50** | 50th percentile (median) throughput across nodes |
+| **Perc90** | 90th percentile throughput across nodes |
+| **Perc99** | 99th percentile throughput across nodes |
+
+## Known Limitations
+
+### Cannot Use histogram_quantile() Per Node
+
+Using Prometheus `histogram_quantile()` on per-node throughput data returns `10` (the highest finite bucket boundary) whenever actual throughput exceeds 10 MB/s, regardless of how much higher the real values are. This happens because:
+
+- The histogram has fixed bucket boundaries: `0.5, 1, 2, 4, 6, 8, 10` MB/s
+- When actual throughput exceeds 10 MB/s, all samples fall into the `+Inf` bucket
+- `histogram_quantile()` can only interpolate within defined buckets, so it caps at `10`
+
+**Current Approach**: Instead of `histogram_quantile()` per node, we use weighted average (`_sum / _count`) per node, then compute percentiles across the node averages.
+
+### Per-Node Metrics May Return "no samples"
+
+The per-node metrics (`AvgPerNode`, `Perc50`, `Perc90`, `Perc99`) may return "no samples" while aggregate metrics (`Avg`, `Count`) work correctly. This is caused by the Prometheus `rate()` function requiring **at least 2 data points** per series within the query window.
+
+**Root Cause**: If image pulls complete faster than the Prometheus scrape interval (default 15s), only one data point is collected per pull operation. The `rate()` function cannot compute a rate from a single sample, resulting in empty per-node results.
+
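+**Why Aggregate Metrics Work**: `Count` sums the raw `_count` counter and does not use `rate()` at all, so it is unaffected. `Avg` applies `rate()` to each series and then uses `sum()` to combine the results across all nodes before dividing, so it produces a value as long as at least one node's series has enough data points within the window.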
+
+**Workaround Options**:
+- Increase scrape frequency (may impact cluster performance)
+- Use larger images that take longer to pull
+- Rely on aggregate metrics (`Avg`, `Count`) for throughput analysis
+
+### Metric Includes Unpack Time
+
+The `containerd_cri_image_pulling_throughput` metric measures **total image size divided by total pull time**, which includes both:
+- Image layer download time
+- Image layer decompression/unpack time
+
+This is not a pure network throughput metric. See [containerd source](https://github.com/containerd/containerd/blob/main/internal/cri/server/images/image_pull.go).
+
+### verify_measurement() Cannot Check Containerd Metrics
+
+The CRI module's `verify_measurement()` function only validates kubelet metrics (accessible via Kubernetes node proxy endpoint at `/api/v1/nodes/{node}/proxy/metrics`). Containerd metrics are only available through the Prometheus server and cannot be verified through this endpoint.
+
+## References
+
+- [Best Practices](../../../docs/best-practices.md)
+- [Test Scenario Implementation Guide](../../../docs/test-scenario-implementation-guide.md)