diff --git a/kcl/ccp_team/hyperscale_pod_scheduling_h8/cl2.yaml b/kcl/ccp_team/hyperscale_pod_scheduling_h8/cl2.yaml
new file mode 100644
index 0000000000..6e17b31e51
--- /dev/null
+++ b/kcl/ccp_team/hyperscale_pod_scheduling_h8/cl2.yaml
@@ -0,0 +1,216 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cl2-override
+  namespace: clusterloader2
+data:
+  override.yaml: |
+    Nodes: 8000
+    CL2_SCHEDULER_THROUGHPUT_TOTAL_PODS: 800000
+    CL2_SCHEDULER_THROUGHPUT_PODS_PER_DEPLOYMENT: 1000 # A high pod count per deployment reduces the number of API calls, keeping the focus on scheduler performance rather than API server performance.
+    CL2_DEFAULT_QPS: 2 # Low QPS avoids overwhelming the API server and causing timeouts; this test measures scheduler throughput, not API server performance.
+    CL2_RUN_ON_ARM_NODES: true # Hack: makes CL2 test pods tolerate the arm64 taint carried by the fake KWOK nodes (see kwok-node.yaml).
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: cl2
+  namespace: clusterloader2
+spec:
+  completions: 1 # Run a single instance of the job, since we are measuring scheduler throughput and not apiserver performance.
+  parallelism: 1 # Run a single instance of the job, since we are measuring scheduler throughput and not apiserver performance.
+  backoffLimit: 0 # Don't retry failed CL2 runs.
+  template:
+    spec:
+      containers:
+      - args:
+        - '--provider=aks'
+        - '--run-from-cluster=true'
+        - '--v=2'
+        - '--testoverrides=/override/override.yaml'
+        - '--testconfig=testing/load/cl2-config.yaml'
+        image: ghcr.io/azure/clusterloader2:v20260220
+        name: cl2
+        resources:
+          requests:
+            cpu: '6'
+            memory: '24Gi'
+        volumeMounts:
+        - mountPath: /override
+          name: cl2-override
+        - mountPath: /root/perf-tests/clusterloader2/testing/load/cl2-config.yaml
+          name: cl2-config
+          subPath: config.yaml
+        - mountPath: /root/perf-tests/clusterloader2/testing/load/modules/scheduler-throughput.yaml
+          name: cl2-scheduler-throughput
+          subPath: scheduler-throughput.yaml
+      nodeSelector:
+        agentpool: cl2pool
+      restartPolicy: Never
+      serviceAccountName: cl2
+      tolerations:
+      - effect: NoSchedule
+        key: cl2pool
+        operator: Exists
+      volumes:
+      - configMap:
+          name: cl2-override
+        name: cl2-override
+      - configMap:
+          name: cl2-config
+        name: cl2-config
+      - configMap:
+          name: cl2-scheduler-throughput
+        name: cl2-scheduler-throughput
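+# The subPath mounts above overlay single files inside the CL2 image's
+# perf-tests checkout, so the stock image picks up this custom config.
+# Hedged sketch for following a run once the Job is applied:
+#   kubectl -n clusterloader2 logs -f job/cl2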
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cl2-scheduler-throughput
+  namespace: clusterloader2
+data:
+  scheduler-throughput.yaml: |
+    ## Input params
+    # Valid actions: "create", "delete"
+    {{$action := .action}}
+    {{$namespaces := .namespaces}}
+    {{$replicasPerNamespace := .replicasPerNamespace}}
+    {{$schedulerThroughputNamespaces := .schedulerThroughputNamespaces}}
+    {{$schedulerThroughputPodsPerDeployment := .schedulerThroughputPodsPerDeployment}}
+    ## Derivative variables
+    {{$is_creating := (eq .action "create")}}
+    ## CL2 params
+    {{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam .CL2_SCHEDULER_THROUGHPUT_THRESHOLD 100}}
+    {{$CHECK_IF_PODS_ARE_UPDATED := DefaultParam .CL2_CHECK_IF_PODS_ARE_UPDATED true}}
+    {{$deploymentImage := DefaultParam .deploymentImage "registry.k8s.io/pause:3.9"}}
+
+    steps:
+    {{if $is_creating}}
+    - name: Creating scheduler throughput measurements
+      measurements:
+      - Identifier: WaitForSchedulerThroughputDeployments
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: start
+          checkIfPodsAreUpdated: {{$CHECK_IF_PODS_ARE_UPDATED}}
+          apiVersion: apps/v1
+          kind: Deployment
+          labelSelector: group = scheduler-throughput
+          operationTimeout: 5h
+      - Identifier: SchedulingThroughput
+        Method: SchedulingThroughput
+        Params:
+          action: start
+          labelSelector: group = scheduler-throughput
+          measurmentInterval: 1s # (sic: parameter name matches the upstream CL2 spelling)
+    {{end}}
+    - name: {{$action}} scheduler throughput pods
+      phases:
+      - namespaceRange:
+          min: {{AddInt $namespaces 1}}
+          max: {{AddInt $namespaces $schedulerThroughputNamespaces}}
+        replicasPerNamespace: {{$replicasPerNamespace}}
+        tuningSet: default
+        objectBundle:
+        - basename: scheduler-throughput-deployment
+          objectTemplatePath: simple-deployment.yaml
+          templateFillMap:
+            Replicas: {{$schedulerThroughputPodsPerDeployment}}
+            Group: scheduler-throughput
+            Image: {{$deploymentImage}}
+    - name: Waiting for scheduler throughput pods to be {{$action}}d
+      measurements:
+      - Identifier: WaitForSchedulerThroughputDeployments
+        Method: WaitForControlledPodsRunning
+        Params:
+          action: gather
+    {{if $is_creating}}
+    - name: Collecting scheduler throughput measurements
+      measurements:
+      - Identifier: SchedulingThroughput
+        Method: SchedulingThroughput
+        Params:
+          action: gather
+          enableViolations: true
+          threshold: {{$SCHEDULER_THROUGHPUT_THRESHOLD}}
+    {{end}}
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: cl2
+  namespace: clusterloader2
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: cl2
+rules:
+- apiGroups: ["*"]
+  resources: ["*"]
+  verbs: ["*"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: cl2
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cl2
+subjects:
+- kind: ServiceAccount
+  name: cl2
+  namespace: clusterloader2
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cl2-config
+  namespace: clusterloader2
+data:
+  config.yaml: |
+    # Minimal config that runs only the scheduler-throughput module.
+    # Required parameters come from the cl2-override ConfigMap, wired in
+    # through the Job's --testoverrides flag.
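+    # Worked example with the values from the cl2-override ConfigMap above
+    # (hedged; the DefaultParam fallbacks below apply only when an override
+    # is absent):
+    #   CL2_SCHEDULER_THROUGHPUT_TOTAL_PODS = 800000
+    #   CL2_SCHEDULER_THROUGHPUT_PODS_PER_DEPLOYMENT = 1000
+    #   => schedulerThroughputNamespaces = 800000 / 1000 = 800 namespaces,
+    #      each holding a single 1000-replica Deployment.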
+
+    # BEGIN scheduler-throughput section
+
+    {{$totalSchedulerThroughputPods := DefaultParam .CL2_SCHEDULER_THROUGHPUT_TOTAL_PODS (MaxInt 1000 .Nodes)}}
+    {{$schedulerThroughputPodsPerDeployment := DefaultParam .CL2_SCHEDULER_THROUGHPUT_PODS_PER_DEPLOYMENT $totalSchedulerThroughputPods}}
+    {{$schedulerThroughputNamespaces := DivideInt $totalSchedulerThroughputPods $schedulerThroughputPodsPerDeployment}}
+    # END scheduler-throughput section
+
+    {{$defaultQps := DefaultParam .CL2_DEFAULT_QPS (IfThenElse (le .Nodes 500) 10 100)}}
+    {{$registry := DefaultParam .CL2_LATENCY_POD_REGISTRY "registry.k8s.io"}}
+    {{$latencyPodImage := DefaultParam .CL2_LATENCY_POD_IMAGE (Concat $registry "/pause:3.9")}}
+
+    name: scheduler-throughput-only
+    namespace:
+      number: {{$schedulerThroughputNamespaces}}
+    tuningSets:
+    - name: default
+      globalQPSLoad:
+        qps: {{$defaultQps}}
+        burst: 1
+
+    steps:
+    # BEGIN scheduler throughput
+    - module:
+        path: modules/scheduler-throughput.yaml
+        params:
+          action: create
+          namespaces: 0
+          replicasPerNamespace: 1
+          schedulerThroughputNamespaces: {{$schedulerThroughputNamespaces}}
+          schedulerThroughputPodsPerDeployment: {{$schedulerThroughputPodsPerDeployment}}
+          deploymentImage: {{$latencyPodImage}}
+
+    - module:
+        path: modules/scheduler-throughput.yaml
+        params:
+          action: delete
+          namespaces: 0
+          replicasPerNamespace: 0
+          schedulerThroughputNamespaces: {{$schedulerThroughputNamespaces}}
+          schedulerThroughputPodsPerDeployment: {{$schedulerThroughputPodsPerDeployment}}
+    # END scheduler throughput
diff --git a/kcl/ccp_team/hyperscale_pod_scheduling_h8/kwok-node.yaml b/kcl/ccp_team/hyperscale_pod_scheduling_h8/kwok-node.yaml
new file mode 100644
index 0000000000..115e4c83c9
--- /dev/null
+++ b/kcl/ccp_team/hyperscale_pod_scheduling_h8/kwok-node.yaml
@@ -0,0 +1,56 @@
+apiVersion: v1
+kind: Node
+metadata:
+  name: {{node_name}}
+  annotations:
+    node.alpha.kubernetes.io/ttl: "0"
+    kwok.x-k8s.io/node: fake
+  labels:
+    beta.kubernetes.io/arch: amd64
+    beta.kubernetes.io/os: linux
+    kubernetes.io/arch: amd64
+    kubernetes.io/hostname: {{node_name}}
+    kubernetes.io/os: linux
+    kubernetes.io/role: agent
+    node-role.kubernetes.io/agent: ""
+    kwok-controller-group: "{{controller_group}}"
+    kwok.x-k8s.io/node: "fake"
+    type: kwok
+spec:
+  providerID: "kwok://{{node_name}}"
+  unschedulable: false
+  taints: # Keep real workloads off the fake node
+  - effect: NoSchedule
+    key: kubernetes.io/arch
+    value: arm64 # Hack: pairs with CL2_RUN_ON_ARM_NODES in cl2.yaml so CL2-created test pods tolerate this taint and schedule onto the fake nodes.
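+  # Hedged sketch of the toleration CL2 is expected to inject into its pod
+  # templates when CL2_RUN_ON_ARM_NODES=true, matching the taint above:
+  #   tolerations:
+  #   - key: kubernetes.io/arch
+  #     operator: Equal
+  #     value: arm64
+  #     effect: NoSchedule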
+status:
+  addresses:
+  - type: InternalIP
+    address: {{node_ip}}
+  allocatable:
+    cpu: {{node_cpu}}
+    memory: {{node_memory}}
+    pods: {{node_pods}}
+    nvidia.com/gpu: {{node_gpu}}
+  capacity:
+    cpu: {{node_cpu}}
+    memory: {{node_memory}}
+    pods: {{node_pods}}
+    nvidia.com/gpu: {{node_gpu}}
+  conditions:
+  - type: "Ready"
+    status: "True"
+    reason: "KubeletReady"
+    message: "kubelet is posting ready status"
+  nodeInfo:
+    architecture: amd64
+    bootID: ""
+    containerRuntimeVersion: ""
+    kernelVersion: ""
+    kubeProxyVersion: fake
+    kubeletVersion: fake
+    machineID: ""
+    operatingSystem: linux
+    osImage: ""
+    systemUUID: ""
+  phase: Running
diff --git a/kcl/ccp_team/hyperscale_pod_scheduling_h8/pipeline.k b/kcl/ccp_team/hyperscale_pod_scheduling_h8/pipeline.k
new file mode 100644
index 0000000000..1896059a02
--- /dev/null
+++ b/kcl/ccp_team/hyperscale_pod_scheduling_h8/pipeline.k
@@ -0,0 +1,174 @@
+import azure_pipelines.ap
+import azure_pipelines.ap.jobs.job
+import lib.const
+import lib.steps.azure
+import lib.steps.common
+import lib.steps.k8s
+import lib.util
+
+SUBSCRIPTION_ID = "b8ceb4e5-f05b-4562-a9f5-14acb1f24219"
+RESOURCE_GROUP = "$(RUN_ID)"
+LOCATION = "southeastasia"
+CLUSTER = "stg-H2-hyperscale-pod-scheduling-rate"
+NODE_COUNT = 8000
+KWOK_NODES_PER_CONTROLLER = 100
+KWOK_POOL = "kwokpool"
+KWOK_POOL_VM_SIZE = "Standard_D4_v3"
+KWOK_POOL_COUNT = 50
+CL2_IMAGE = "ghcr.io/azure/clusterloader2:v20260220"
+CL2_NAMESPACE = "clusterloader2"
+CL2_POOL = "cl2pool"
+CL2_TAINT_PREFIX = "cl2pool"
+requestBody = util.escapeStr("""
+{
+    "location": "${LOCATION}",
+    "identity": { "type": "SystemAssigned" },
+    "sku": {
+        "name": "Base",
+        "tier": "Standard"
+    },
+    "tags": {
+        "SkipAKSCluster": "true",
+        "SkipASMAzSecPackAutoConfig": "true",
+        "SkipLinuxAzSecPack": "true"
+    },
+    "properties": {
+        "controlPlaneScalingProfile": {"scalingSize": "H8"},
+        "kubernetesVersion": "1.33.0",
+        "dnsPrefix": "${CLUSTER}-dns",
+        "agentPoolProfiles": [
+            {
+                "name": "systempool",
+                "mode": "System",
+                "count": 3,
+                "vmSize": "Standard_D8S_v4",
+                "osType": "Linux",
+                "maxPods": 250
+            }
+        ],
+        "networkProfile": {
+            "networkPlugin": "azure",
+            "networkPluginMode": "overlay",
+            "podCidr": "10.64.0.0/10",
+            "serviceCidr": "10.0.0.0/16",
+            "dnsServiceIP": "10.0.0.10",
+            "outboundType": "managedNATGateway",
+            "natGatewayProfile": {
+                "managedOutboundIPProfile": {
+                    "count": 10
+                }
+            }
+        }
+    }
+}""")
+
+createClusterScript = """
+az rest \\
+    --method put \\
+    --uri "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.ContainerService/managedClusters/${CLUSTER}?api-version=2026-01-02-preview" \\
+    --body "${requestBody}"
+"""
+
+output = ap.Pipeline {
+    name = "Hyperscale Pod Scheduling Rate"
+
+    trigger = ["v2"]
+    pool = const.DEFAULT_POOL
+
+    parameters = [
+        ap.Parameter {
+            name = "run_id"
+            displayName = "Run ID (leave empty to auto-generate)"
+            type = "string"
+            default = "default"
+        }
+    ]
+
+    jobs = [
+        job.Job {
+            job = "benchmarking"
+            displayName = "Benchmarking"
+            timeoutInMinutes = 1440
+
+            steps = [
+                common.SetRunId(),
+                common.InstallPythonDependencies(),
+                azure.Login(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    SUBSCRIPTION_ID,
+                    LOCATION
+                ),
+                azure.CreateResourceGroup(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    RESOURCE_GROUP,
+                    LOCATION,
+                    SUBSCRIPTION_ID
+                ),
+                azure.AzCli(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    "Create cluster",
+                    createClusterScript),
+                azure.WaitForClusterSucceeded(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CLUSTER,
+                    RESOURCE_GROUP,
+                    SUBSCRIPTION_ID),
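+                # Hedged sketch of roughly what azure.CreateNodePool below
+                # expands to (compare the generated pipeline.yaml later in
+                # this diff):
+                #   az aks nodepool add --cluster-name $CLUSTER \
+                #     --resource-group $RESOURCE_GROUP --name cl2pool \
+                #     --node-count 4 --node-vm-size Standard_D8S_v4 \
+                #     --mode User --node-taints cl2pool=true:NoSchedule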
+                azure.CreateNodePool(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CLUSTER,
+                    RESOURCE_GROUP,
+                    SUBSCRIPTION_ID,
+                    CL2_POOL,
+                    "Standard_D8S_v4",
+                    4,
+                    CL2_TAINT_PREFIX),
+                azure.WaitForNodePoolSucceeded(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CLUSTER,
+                    RESOURCE_GROUP,
+                    SUBSCRIPTION_ID,
+                    CL2_POOL),
+                azure.GetCredentials(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CLUSTER,
+                    RESOURCE_GROUP,
+                    SUBSCRIPTION_ID),
+                azure.CreateNodePool(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CLUSTER,
+                    RESOURCE_GROUP,
+                    SUBSCRIPTION_ID,
+                    KWOK_POOL,
+                    KWOK_POOL_VM_SIZE,
+                    KWOK_POOL_COUNT,
+                    "kwok",
+                    "kwok=true"),
+                azure.WaitForNodePoolSucceeded(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CLUSTER,
+                    RESOURCE_GROUP,
+                    SUBSCRIPTION_ID,
+                    KWOK_POOL),
+                k8s.CreateKwokNodes(
+                    NODE_COUNT,
+                    {
+                        "node-manifest-path": "$(Pipeline.Workspace)/s/kcl/ccp_team/hyperscale_pod_scheduling_h8/kwok-node.yaml"
+                        "nodes-per-controller": "${KWOK_NODES_PER_CONTROLLER}"
+                        "node-selector": "kwok=true"
+                        "node-lease-duration-seconds": "100"
+                    }),
+                k8s.RunClusterLoader2(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CL2_NAMESPACE,
+                    manifest = "kcl/ccp_team/hyperscale_pod_scheduling_h8/cl2.yaml"),
+                k8s.PrintCl2PodLogs(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    CL2_NAMESPACE),
+                azure.DeleteResourceGroup(
+                    const.DEFAULT_SERVICE_CONNECTION,
+                    RESOURCE_GROUP,
+                    SUBSCRIPTION_ID)
+            ]
+        }
+    ]
+}
diff --git a/kcl/ccp_team/hyperscale_pod_scheduling_h8/pipeline.yaml b/kcl/ccp_team/hyperscale_pod_scheduling_h8/pipeline.yaml
new file mode 100755
index 0000000000..431c0a9743
--- /dev/null
+++ b/kcl/ccp_team/hyperscale_pod_scheduling_h8/pipeline.yaml
@@ -0,0 +1,306 @@
+name: Hyperscale Pod Scheduling Rate
+pool: AKS-Telescope-Airlock
+trigger:
+- v2
+parameters:
+- name: run_id
+  displayName: Run ID (leave empty to auto-generate)
+  type: string
+  default: default
+jobs:
+- job: benchmarking
+  displayName: Benchmarking
+  timeoutInMinutes: 1440
+  steps:
+  - bash: |2
+
+      set -exo pipefail
+      if [ -n "$RUN_ID" ] && [ "$RUN_ID" != "default" ]; then
+        echo "Using provided Run ID: $RUN_ID"
+      else
+        job_id="$(System.JobId)"
+        RUN_ID=$(Build.BuildId)-${job_id:0:8}
+        echo "Run ID: $RUN_ID"
+      fi
+      echo "##vso[task.setvariable variable=RUN_ID]$RUN_ID"
+    displayName: Set Run ID
+    env:
+      RUN_ID: ${{ parameters.run_id }}
+  - bash: |-
+      set -exo pipefail
+      pip3 install --upgrade "pip<24"
+      pip3 install -r $(Pipeline.Workspace)/s/modules/python/requirements.txt
+    displayName: Install Python dependencies
+  - task: AzureCLI@2
+    inputs:
+      azureSubscription: Azure-for-Telescope-internal
+      scriptType: bash
+      scriptLocation: inlineScript
+      inlineScript: |
+        set -exo pipefail
+
+        az account set --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219"
+        az config set defaults.location="southeastasia"
+        az account show
+    displayName: Login to Azure
+  - task: AzureCLI@2
+    inputs:
+      azureSubscription: Azure-for-Telescope-internal
+      scriptType: bash
+      scriptLocation: inlineScript
+      inlineScript: |
+        set -exo pipefail
+
+        az group create --name "$(RUN_ID)" --location "southeastasia" --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219"
+    displayName: Create resource group $(RUN_ID) in southeastasia (b8ceb4e5-f05b-4562-a9f5-14acb1f24219)
+  - task: AzureCLI@2
+    inputs:
+      azureSubscription: Azure-for-Telescope-internal
+      scriptType: bash
+      scriptLocation: inlineScript
+      inlineScript: |
+        set -exo pipefail
+
+        az rest \
+          --method put \
+          --uri "/subscriptions/b8ceb4e5-f05b-4562-a9f5-14acb1f24219/resourceGroups/$(RUN_ID)/providers/Microsoft.ContainerService/managedClusters/stg-H2-hyperscale-pod-scheduling-rate?api-version=2026-01-02-preview" \
+          --body "
+          {
+            \"location\": \"southeastasia\",
+            \"identity\": { \"type\": \"SystemAssigned\" },
+            \"sku\": {
+              \"name\": \"Base\",
+              \"tier\": \"Standard\"
+            },
+            \"tags\": {
+              \"SkipAKSCluster\": \"true\",
+              \"SkipASMAzSecPackAutoConfig\": \"true\",
+              \"SkipLinuxAzSecPack\": \"true\"
+            },
+            \"properties\": {
+              \"controlPlaneScalingProfile\": {\"scalingSize\": \"H8\"},
+              \"kubernetesVersion\": \"1.33.0\",
+              \"dnsPrefix\": \"stg-H2-hyperscale-pod-scheduling-rate-dns\",
+              \"agentPoolProfiles\": [
+                {
+                  \"name\": \"systempool\",
+                  \"mode\": \"System\",
+                  \"count\": 3,
+                  \"vmSize\": \"Standard_D8S_v4\",
+                  \"osType\": \"Linux\",
+                  \"maxPods\": 250
+                }
+              ],
+              \"networkProfile\": {
+                \"networkPlugin\": \"azure\",
+                \"networkPluginMode\": \"overlay\",
+                \"podCidr\": \"10.64.0.0/10\",
+                \"serviceCidr\": \"10.0.0.0/16\",
+                \"dnsServiceIP\": \"10.0.0.10\",
+                \"outboundType\": \"managedNATGateway\",
+                \"natGatewayProfile\": {
+                  \"managedOutboundIPProfile\": {
+                    \"count\": 10
+                  }
+                }
+              }
+            }
+          }"
+    displayName: Create cluster
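+  # az rest returns once the PUT is accepted; provisioning continues
+  # asynchronously, which is why the next step polls provisioningState.
+  # Hedged one-off check of the same state:
+  #   az aks show -g "$(RUN_ID)" -n "stg-H2-hyperscale-pod-scheduling-rate" \
+  #     --query provisioningState -o tsv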
"/subscriptions/b8ceb4e5-f05b-4562-a9f5-14acb1f24219/resourceGroups/$(RUN_ID)/providers/Microsoft.ContainerService/managedClusters/stg-H2-hyperscale-pod-scheduling-rate?api-version=2026-01-02-preview" \ + --body " + { + \"location\": \"southeastasia\", + \"identity\": { \"type\": \"SystemAssigned\" }, + \"sku\": { + \"name\": \"Base\", + \"tier\": \"Standard\" + }, + \"tags\": { + \"SkipAKSCluster\": \"true\", + \"SkipASMAzSecPackAutoConfig\": \"true\", + \"SkipLinuxAzSecPack\": \"true\" + }, + \"properties\": { + \"controlPlaneScalingProfile\": {\"scalingSize\": \"H8\"}, + \"kubernetesVersion\": \"1.33.0\", + \"dnsPrefix\": \"stg-H2-hyperscale-pod-scheduling-rate-dns\", + \"agentPoolProfiles\": [ + { + \"name\": \"systempool\", + \"mode\": \"System\", + \"count\": 3, + \"vmSize\": \"Standard_D8S_v4\", + \"osType\": \"Linux\", + \"maxPods\": 250 + } + ], + \"networkProfile\": { + \"networkPlugin\": \"azure\", + \"networkPluginMode\": \"overlay\", + \"podCidr\": \"10.64.0.0/10\", + \"serviceCidr\": \"10.0.0.0/16\", + \"dnsServiceIP\": \"10.0.0.10\", + \"outboundType\": \"managedNATGateway\", + \"natGatewayProfile\": { + \"managedOutboundIPProfile\": { + \"count\": 10 + } + } + } + } + }" + displayName: Create cluster + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + while true; do + STATE=$(az aks show \ + --name "stg-H2-hyperscale-pod-scheduling-rate" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --query "provisioningState" \ + --output tsv) + echo "Cluster provisioning state: $STATE" + if [ "$STATE" = "Succeeded" ]; then + echo "Cluster is ready." + break + elif [ "$STATE" = "Failed" ] || [ "$STATE" = "Canceled" ]; then + echo "Cluster failed with state: $STATE." + exit 1 + else + echo "Provisioning state: $STATE. Retry in 30 seconds" + fi + sleep 30 + done + displayName: Wait for cluster to succeed + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: "set -exo pipefail\n\naz aks nodepool add \\\n --cluster-name \"stg-H2-hyperscale-pod-scheduling-rate\" \\\n --resource-group \"$(RUN_ID)\" \\\n --subscription \"b8ceb4e5-f05b-4562-a9f5-14acb1f24219\" \\\n --name \"cl2pool\" \\\n --node-count 4 \\\n --node-vm-size \"Standard_D8S_v4\" \\\n --mode User \\\n --node-taints cl2pool=true:NoSchedule \\\n \n" + displayName: Create node pool cl2pool + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + while true; do + STATE=$(az aks nodepool show \ + --cluster-name "stg-H2-hyperscale-pod-scheduling-rate" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --name "cl2pool" \ + --query "provisioningState" \ + --output tsv) + echo "Node pool provisioning state: $STATE" + if [ "$STATE" = "Succeeded" ]; then + echo "Node pool is ready." + break + elif [ "$STATE" = "Failed" ] || [ "$STATE" = "Canceled" ]; then + echo "Node pool failed with state: $STATE." + exit 1 + else + echo "Provisioning state: $STATE. 
Retry in 30 seconds" + fi + sleep 30 + done + displayName: Wait for node pool cl2pool to succeed + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az aks get-credentials \ + --name "stg-H2-hyperscale-pod-scheduling-rate" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --overwrite-existing + displayName: Get credentials for stg-H2-hyperscale-pod-scheduling-rate + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az aks nodepool add \ + --cluster-name "stg-H2-hyperscale-pod-scheduling-rate" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --name "kwokpool" \ + --node-count 50 \ + --node-vm-size "Standard_D4_v3" \ + --mode User \ + --node-taints kwok=true:NoSchedule \ + --labels "kwok=true" + displayName: Create node pool kwokpool + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + while true; do + STATE=$(az aks nodepool show \ + --cluster-name "stg-H2-hyperscale-pod-scheduling-rate" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --name "kwokpool" \ + --query "provisioningState" \ + --output tsv) + echo "Node pool provisioning state: $STATE" + if [ "$STATE" = "Succeeded" ]; then + echo "Node pool is ready." + break + elif [ "$STATE" = "Failed" ] || [ "$STATE" = "Canceled" ]; then + echo "Node pool failed with state: $STATE." + exit 1 + else + echo "Provisioning state: $STATE. 
Retry in 30 seconds" + fi + sleep 30 + done + displayName: Wait for node pool kwokpool to succeed + - script: |2- + + set -exo pipefail + export PYTHONPATH=$PYTHONPATH:$(pwd) + python3 kwok/kwok.py --action create --node-count 8000 --node-manifest-path $(Pipeline.Workspace)/s/kcl/ccp_team/hyperscale_pod_scheduling/kwok-node.yaml --nodes-per-controller 100 --node-selector kwok=true --node-lease-duration-seconds 100 + python3 kwok/kwok.py --action validate --node-count 8000 --node-manifest-path $(Pipeline.Workspace)/s/kcl/ccp_team/hyperscale_pod_scheduling/kwok-node.yaml --nodes-per-controller 100 --node-selector kwok=true --node-lease-duration-seconds 100 + workingDirectory: modules/python + displayName: Create and Validate KWOK Nodes + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + kubectl create namespace "clusterloader2" || true + kubectl apply -f $(Pipeline.Workspace)/s/kcl/ccp_team/hyperscale_pod_scheduling/cl2.yaml + # Wait + while true; do + phases="$(kubectl get pods --namespace="clusterloader2" -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}')" + total=0 + terminal=0 + + if [ -n "$phases" ]; then + while IFS= read -r phase; do + if [ -z "$phase" ]; then + continue + fi + + total=$((total + 1)) + if [ "$phase" = "Succeeded" ] || [ "$phase" = "Failed" ]; then + terminal=$((terminal + 1)) + fi + done <<< "$phases" + fi + + if [ "$total" -gt 0 ] && [ "$terminal" -eq "$total" ]; then + echo "All cl2 pods reached terminal state (Succeeded or Failed)" + kubectl get pods --namespace="clusterloader2" -l job-name=cl2 -o wide + + break + fi + + sleep 30 + done + displayName: Run cluster loader 2 + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + for pod in $(kubectl get pods --namespace="clusterloader2" -l job-name=cl2 -o jsonpath='{.items[*].metadata.name}'); do + echo "===== Logs for pod: $pod =====" + kubectl logs --namespace="clusterloader2" "$pod" || true + done + displayName: Print cl2 pod logs + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az group delete --name "$(RUN_ID)" --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" --yes + condition: always() + displayName: Delete resource group $(RUN_ID) (b8ceb4e5-f05b-4562-a9f5-14acb1f24219)