Skip to content

Commit 5873c7f

Browse files
committed
Add Intel GPU (Habana Gaudi) autoscaler support
Add support for Intel Habana Gaudi GPUs in the cluster autoscaler by: (1) defining the ResourceIntelGaudi resource name (habana.ai/gaudi); (2) adding ResourceIntelGaudi to the GPUVendorResourceNames list; and (3) refactoring the GPU detection logic to iterate through all GPU vendor resource names instead of checking vendors individually. This enables the autoscaler to properly detect and handle Intel GPU nodes alongside the existing NVIDIA, AMD, and DirectX GPU support.
1 parent 7b95cb0 commit 5873c7f

File tree

2 files changed

+20
-8
lines changed

2 files changed

+20
-8
lines changed

cluster-autoscaler/processors/customresources/gpu_processor.go

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,15 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(autosca
4848
}
4949

5050
_, hasGpuLabel := node.Labels[autoscalingCtx.CloudProvider.GPULabel()]
51-
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
52-
directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
53-
if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
51+
hasAnyGpuAllocatable := false
52+
for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
53+
gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName]
54+
if hasGpuAllocatable && !gpuAllocatable.IsZero() {
55+
hasAnyGpuAllocatable = true
56+
break
57+
}
58+
}
59+
if hasGpuLabel && !hasAnyGpuAllocatable {
5460
klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
5561
node.Name)
5662
nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
@@ -88,9 +94,10 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
8894
return CustomResourceTarget{}, nil
8995
}
9096

91-
gpuAllocatable, found := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
92-
if found && gpuAllocatable.Value() > 0 {
93-
return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
97+
for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
98+
if gpuAllocatable, found := node.Status.Allocatable[gpuVendorResourceName]; found && gpuAllocatable.Value() > 0 {
99+
return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
100+
}
94101
}
95102

96103
// A node is supposed to have GPUs (based on label), but they're not available yet
@@ -115,8 +122,10 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
115122
klog.Errorf("Failed to build template for getting GPU estimation for node %v: %v", node.Name, err)
116123
return CustomResourceTarget{}, errors.ToAutoscalerError(errors.CloudProviderError, err)
117124
}
118-
if gpuCapacity, found := template.Node().Status.Capacity[gpu.ResourceNvidiaGPU]; found {
119-
return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
125+
for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
126+
if gpuCapacity, found := template.Node().Status.Capacity[gpuVendorResourceName]; found {
127+
return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
128+
}
120129
}
121130

122131
// If the template does not define GPUs, we assume the node will not have any even if it has the GPU label.

cluster-autoscaler/utils/gpu/gpu.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626
)
2727

2828
const (
29+
// ResourceIntelGaudi is the name of the Intel Gaudi resource.
30+
ResourceIntelGaudi = "habana.ai/gaudi"
2931
// ResourceAMDGPU is the name of the AMD GPU resource.
3032
ResourceAMDGPU = "amd.com/gpu"
3133
// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
@@ -41,6 +43,7 @@ const (
4143
// Extend this slice if new vendor resource names are added.
4244
var GPUVendorResourceNames = []apiv1.ResourceName{
4345
ResourceNvidiaGPU,
46+
ResourceIntelGaudi,
4447
ResourceAMDGPU,
4548
ResourceDirectX,
4649
}

0 commit comments

Comments
 (0)