Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 152 additions & 90 deletions modules/python/clusterloader2/autoscale/autoscale.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# clusterloader2 autoscale test — tunable parameters, all overridable via CL2_* env vars.
{{$deploymentTemplatePath := DefaultParam .CL2_DEPLOYMENT_TEMPLATE_PATH "deployment_template.yaml"}}
{{$deploymentSize := DefaultParam .CL2_DEPLOYMENT_SIZE 100}}
{{$deploymentCpu := DefaultParam .CL2_DEPLOYMENT_CPU "346m"}}
{{$deploymentMemory := DefaultParam .CL2_DEPLOYMENT_MEMORY "100Mi"}}
{{$nodeSelector := DefaultParam .CL2_NODE_SELECTOR "{karpenter.sh/nodepool: default}"}}
{{$podLabelSelector := DefaultParam .CL2_POD_LABEL_SELECTOR "app = inflate"}}
{{$scaleUpTimeout := DefaultParam .CL2_SCALE_UP_TIMEOUT "30m"}}
{{$scaleDownTimeout := DefaultParam .CL2_SCALE_DOWN_TIMEOUT "10m"}}
{{$refreshInterval := DefaultParam .CL2_REFRESH_INTERVAL "5s"}}
{{$loopCount := DefaultParam .CL2_LOOP_COUNT 1}}
{{$coolDownTime := DefaultParam .CL2_COOLDOWN_TIME "120s"}}
{{$osType := DefaultParam .CL2_OS_TYPE "linux"}}
# Allow a 1% margin on the expected pod count. Derive it from $deploymentSize
# (which carries the DefaultParam fallback) rather than the raw
# .CL2_DEPLOYMENT_SIZE env var, so the template still renders when the env var
# is unset; DivideInt keeps the arithmetic integer-only instead of passing a
# float fraction to MultiplyInt.
{{$countErrorMargin := DivideInt $deploymentSize 100}}

# Test metadata: everything runs in a single automanaged namespace (autoscale-1),
# which is reused if it already exists and cleaned up afterwards.
name: autoscale
namespace:
  number: 1
  prefix: autoscale
  deleteStaleNamespaces: true
  deleteAutomanagedNamespaces: true
  enableExistingNamespaces: true

tuningSets:
# Rate-limits object creation/deletion phases.
# NOTE(review): the name says 1 qps but the configured rate is 20 — confirm
# which is intended before renaming (the name is referenced by every phase).
- name: Uniform1qps
  qpsLoad:
    qps: 20

steps:
# Repeat the full scale-up / scale-down cycle $loopCount times.
{{range $i := Loop $loopCount}}
- name: Start Measurements {{$i}}
  measurements:
  # Cluster resource usage over the whole iteration (gathered at the end).
  - Identifier: ResourceUsageSummary
    Method: ResourceUsageSummary
    Params:
      action: start
  # Startup latency of the inflate pods; the measurement fails if startup
  # exceeds the scale-up timeout.
  - Identifier: PodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: {{$podLabelSelector}}
      threshold: {{$scaleUpTimeout}}
  # Scheduler throughput while the inflate pods are being placed.
  - Identifier: SchedulingThroughput
    Method: SchedulingThroughput
    Params:
      action: start
      labelSelector: {{$podLabelSelector}}
- name: Create deployment {{$i}}
  phases:
  # Create exactly one inflate deployment in namespace 1, throttled by the
  # Uniform1qps tuning set.
  - namespaceRange:
      min: 1
      max: 1
    replicasPerNamespace: 1
    tuningSet: Uniform1qps
    objectBundle:
    - basename: inflate
      objectTemplatePath: {{$deploymentTemplatePath}}
      templateFillMap:
        Replicas: {{$deploymentSize}}
        CPUperJob: {{$deploymentCpu}}
        MemoryRequest: {{$deploymentMemory}}
        # StructuralData renders the selector string as a YAML mapping rather
        # than a plain scalar.
        NodeSelector: {{ (StructuralData $nodeSelector) }}
        OSType: {{$osType}}
- name: Measure nodes and pods scale up {{$i}}
  measurements:
  # Block until the autoscaler has brought up the inflate pods, within the
  # configured count error margin and scale-up timeout.
  - Identifier: WaitForRunningPodsUp {{$i}}
    Method: WaitForRunningPods
    Params:
      action: start
      desiredPodCount: {{$deploymentSize}}
      countErrorMargin: {{$countErrorMargin}}
      labelSelector: {{$podLabelSelector}}
      timeout: {{$scaleUpTimeout}}
      refreshInterval: {{$refreshInterval}}
- name: Capture Metrics After Scale Up {{$i}}
  measurements:
  # Snapshot of node/pod capacity and usage via Prometheus, taken once the
  # scale-up has completed; gathered together with the other measurements.
  - Identifier: ResourceMetrics{{$i}}
    Method: GenericPrometheusQuery
    Params:
      action: start
      metricName: Resource Metrics Summary
      metricVersion: v1
      unit: mixed
      queries:
      # Node Level Summary
      - name: TotalNodes
        query: count(kube_node_status_allocatable{resource="cpu"})
      - name: NodeCPUAllocatable
        query: sum(kube_node_status_allocatable{resource="cpu"})
      - name: NodeMemoryAllocatable
        query: sum(kube_node_status_allocatable{resource="memory"})
      # Node CPU Usage Stats (from kubelet/cAdvisor - container metrics aggregated by node)
      # id="/" selects the root cgroup, i.e. whole-node usage per instance.
      - name: NodeCPUUsageAvg
        query: avg(sum by (instance) (rate(container_cpu_usage_seconds_total{id="/"}[2m])))
      - name: NodeCPUUsageMax
        query: max(sum by (instance) (rate(container_cpu_usage_seconds_total{id="/"}[2m])))
      - name: NodeCPUUsageMin
        query: min(sum by (instance) (rate(container_cpu_usage_seconds_total{id="/"}[2m])))
      # Node Memory Usage Stats (from kubelet/cAdvisor - container metrics aggregated by node)
      - name: NodeMemoryUsageAvg
        query: avg(sum by (instance) (container_memory_working_set_bytes{id="/"}))
      - name: NodeMemoryUsageMax
        query: max(sum by (instance) (container_memory_working_set_bytes{id="/"}))
      - name: NodeMemoryUsageMin
        query: min(sum by (instance) (container_memory_working_set_bytes{id="/"}))
      # Pod Level Summary
      - name: TotalPods
        query: count(kube_pod_status_phase{phase="Running"})
      # Pod Distribution Summary
      - name: PodsPerNodeAvg
        query: avg(count by (node) (kube_pod_info{node!=""}))
      - name: PodsPerNodeMax
        query: max(count by (node) (kube_pod_info{node!=""}))
      - name: PodsPerNodeMin
        query: min(count by (node) (kube_pod_info{node!=""}))
# Collect everything started earlier in this iteration, in one step, after the
# scale-up has been observed.
- name: Gather Measurements {{$i}}
  measurements:
  - Identifier: PodStartupLatency
    Method: PodStartupLatency
    Params:
      action: gather
  - Identifier: SchedulingThroughput
    Method: SchedulingThroughput
    Params:
      action: gather
  - Identifier: ResourceUsageSummary
    Method: ResourceUsageSummary
    Params:
      action: gather
  - Identifier: ResourceMetrics{{$i}}
    Method: GenericPrometheusQuery
    Params:
      action: gather
# Idle for the cool-down period so the cluster settles before deletion starts.
# The step name and measurement identifier carry {{$i}} like every other step
# in this loop, so they stay unique when CL2_LOOP_COUNT > 1.
- name: WaitBeforeDelete {{$i}}
  measurements:
  - Identifier: WaitBeforeDelete{{$i}}
    Method: Sleep
    Params:
      action: start
      duration: {{$coolDownTime}}
- name: Delete deployment {{$i}}
  phases:
  # Scale the object bundle to zero replicas per namespace, which deletes the
  # inflate deployment created earlier.
  - namespaceRange:
      min: 1
      max: 1
    replicasPerNamespace: 0
    tuningSet: Uniform1qps
    objectBundle:
    - basename: inflate
      objectTemplatePath: {{$deploymentTemplatePath}}
      templateFillMap:
        Replicas: {{$deploymentSize}}
        CPUperJob: {{$deploymentCpu}}
        MemoryRequest: {{$deploymentMemory}}
        OSType: {{$osType}}
- name: Measure nodes and pods scale down {{$i}}
  measurements:
  # Wait until every inflate pod is gone, within the scale-down timeout.
  - Identifier: WaitForRunningPodsDown {{$i}}
    Method: WaitForRunningPods
    Params:
      action: start
      desiredPodCount: 0
      labelSelector: {{$podLabelSelector}}
      timeout: {{$scaleDownTimeout}}
      refreshInterval: {{$refreshInterval}}
{{end}}
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@ stages:
topology: karpenter
matrix:
complex-nap:
cpu_per_node: 2
node_count: 5
pod_count: 5
pod_count: 5000
pod_cpu_request: 16
pod_memory_request: "60Gi"
scale_up_timeout: "15m"
scale_down_timeout: "15m"
node_label_selector: "karpenter.sh/nodepool = default"
node_selector: "{karpenter.sh/nodepool: default}"
loop_count: 1
warmup_deployment: true
warmup_deployment_template: warmup_deployment.yaml
vm_size: Standard_D2s_v4
capacity_type: on-demand
cl2_config_file: "ms_complex_config.yaml"
karpenter_nodepool_file: "karpenter_complex_nodepool.azure.yml"
max_parallel: 1
timeout_in_minutes: 60
timeout_in_minutes: 120
credential_type: service_connection
ssh_key_enabled: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Shared AKSNodeClass (common for both Spot and On-Demand)
# All NodePools below reference this single node class by name.
---
apiVersion: karpenter.azure.com/v1alpha2
kind: AKSNodeClass
metadata:
  name: default
  annotations:
    kubernetes.io/description: "General purpose AKSNodeClass for running Ubuntu2204 nodes"
spec:
  # Only the OS image family is pinned; all other settings use provider defaults.
  imageFamily: Ubuntu2204

# On-Demand NodePool (default)
---
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
  annotations:
    kubernetes.io/description: "General purpose On-Demand NodePool"
spec:
  disruption:
    # Remove nodes once they are empty for 1 minute; the 100% budget allows
    # all eligible nodes to be disrupted at once (fast scale-down).
    consolidationPolicy: WhenEmpty
    consolidateAfter: 1m
    budgets:
    - nodes: "100%"
  template:
    spec:
      nodeClassRef:
        group: karpenter.azure.com
        kind: AKSNodeClass
        name: default
      # Nodes never expire; only consolidation removes them.
      expireAfter: Never
      requirements:
      - key: kubernetes.io/os
        operator: In
        values: ["linux"]
      - key: karpenter.sh/capacity-type
        operator: In
        values: ["on-demand"]
      # Restrict provisioning to large 96-vCPU D-series SKUs.
      - key: karpenter.azure.com/sku-name
        operator: In
        values:
        - "Standard_D96ds_v5" # 55k DDSv5
        - "Standard_D96d_v5"
        - "Standard_D96_v5" # 100k Dv5
        - "Standard_D96s_v5"
      # Spread across all three eastus2 availability zones.
      - key: topology.kubernetes.io/zone
        operator: In
        values:
        - eastus2-1
        - eastus2-2
        - eastus2-3

# Spot NodePool
---
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: spot
  annotations:
    kubernetes.io/description: "Spot NodePool for burstable cost-efficient workloads"
spec:
  disruption:
    # Remove empty spot nodes almost immediately to minimise cost; the 100%
    # budget allows all eligible nodes to be disrupted at once.
    consolidationPolicy: WhenEmpty
    consolidateAfter: 1s
    budgets:
    - nodes: "100%"
  template:
    spec:
      nodeClassRef:
        group: karpenter.azure.com
        kind: AKSNodeClass
        name: default
      # Nodes never expire; only consolidation removes them.
      expireAfter: Never
      requirements:
      - key: kubernetes.io/os
        operator: In
        values: ["linux"]
      - key: karpenter.sh/capacity-type
        operator: In
        values: ["spot"]
      # Quoted block style for the SKU list, matching the sku-name lists of
      # the sibling pools (the original used an unquoted flow sequence).
      - key: karpenter.azure.com/sku-name
        operator: In
        values:
        - "Standard_D2_v5"
# system-surge NodePool
---
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: system-surge
  annotations:
    kubernetes.io/description: "Surge capacity pool for system pod pressure"
spec:
  disruption:
    # Disrupt at most one node at a time, and only after it has been empty
    # for 1 minute — conservative, since this pool hosts system pods.
    budgets:
    - nodes: "1"
    consolidateAfter: 1m
    consolidationPolicy: WhenEmpty
  template:
    metadata:
      # Mark nodes as AKS system-mode nodes running the Cilium dataplane.
      labels:
        kubernetes.azure.com/ebpf-dataplane: "cilium"
        kubernetes.azure.com/mode: "system"
    spec:
      # Nodes never expire; only consolidation removes them.
      expireAfter: Never
      nodeClassRef:
        group: karpenter.azure.com
        kind: AKSNodeClass
        name: default
      requirements:
      - key: kubernetes.io/arch
        operator: In
        values: ["amd64"]
      - key: kubernetes.io/os
        operator: In
        values: ["linux"]
      - key: karpenter.sh/capacity-type
        operator: In
        values: ["on-demand"]
      - key: karpenter.azure.com/sku-name
        operator: In
        values:
        - Standard_D16_v3
      # Spread across all three eastus2 availability zones.
      - key: topology.kubernetes.io/zone
        operator: In
        values:
        - eastus2-1
        - eastus2-2
        - eastus2-3
      # Keep pods off the node until the Cilium agent is ready...
      startupTaints:
      - effect: NoExecute
        key: node.cilium.io/agent-not-ready
        value: "true"
      # ...and permanently restrict the pool to critical system addons.
      taints:
      - effect: NoSchedule
        key: CriticalAddonsOnly
        value: "true"
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ public_ip_config_list = [
}
]


network_config_list = [
{
role = "crud"
Expand Down Expand Up @@ -118,7 +117,6 @@ route_table_config_list = [
}
]


aks_cli_config_list = [
{
role = "nap"
Expand Down
4 changes: 2 additions & 2 deletions steps/engine/clusterloader2/autoscale/collect.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ steps:
- script: |
set -eo pipefail

PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect $CPU_PER_NODE ${CAPACITY_TYPE:-on-demand} $NODE_COUNT $POD_COUNT \
$CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE
PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect ${CPU_PER_NODE:-0} ${CAPACITY_TYPE:-on-demand} ${NODE_COUNT:-0} ${POD_COUNT:-0} \
$CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file ${CL2_CONFIG_FILE} --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""}
workingDirectory: modules/python
env:
CLOUD: ${{ parameters.cloud }}
Expand Down
7 changes: 4 additions & 3 deletions steps/engine/clusterloader2/autoscale/execute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ steps:
set -eo pipefail

PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE override \
$CPU_PER_NODE $NODE_COUNT $POD_COUNT \
${CPU_PER_NODE:-0} ${NODE_COUNT:-0} ${POD_COUNT:-0} \
$SCALE_UP_TIMEOUT $SCALE_DOWN_TIMEOUT \
$LOOP_COUNT "$NODE_LABEL_SELECTOR" "$NODE_SELECTOR" ${CL2_CONFIG_DIR}/overrides.yaml ${WARMUP_DEPLOYMENT:-false} ${CL2_CONFIG_DIR} --os_type ${OS_TYPE:-linux} --warmup_deployment_template ${WARMUP_DEPLOYMENT_TEMPLATE:-""} --deployment_template ${DEPLOYMENT_TEMPLATE:-""}
$LOOP_COUNT "${NODE_LABEL_SELECTOR:-""}" "$NODE_SELECTOR" ${CL2_CONFIG_DIR}/overrides.yaml ${WARMUP_DEPLOYMENT:-false} ${CL2_CONFIG_DIR} --os_type ${OS_TYPE:-linux} --warmup_deployment_template ${WARMUP_DEPLOYMENT_TEMPLATE:-""} --deployment_template ${DEPLOYMENT_TEMPLATE:-""} \
--pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} --cl2_config_file ${CL2_CONFIG_FILE:-config.yaml}
PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \
${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD
${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD --cl2_config_file ${CL2_CONFIG_FILE:-config.yaml}
workingDirectory: modules/python
env:
${{ if eq(parameters.cloud, 'azure') }}:
Expand Down
Loading
Loading