
Commit ffcbfee

Merge pull request #8853 from DorWeinstock/add-intel-gaudi-support
Add Intel GPU (Habana Gaudi) autoscaler support
2 parents 7b95cb0 + cc49907 commit ffcbfee

2 files changed: +22 −12 lines changed

cluster-autoscaler/processors/customresources/gpu_processor.go

Lines changed: 8 additions & 8 deletions
@@ -48,9 +48,8 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(autosca
 		}

 		_, hasGpuLabel := node.Labels[autoscalingCtx.CloudProvider.GPULabel()]
-		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
-		directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
-		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
+		_, hasAnyGpuAllocatable := gpu.NodeHasGpuAllocatable(node)
+		if hasGpuLabel && !hasAnyGpuAllocatable {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
 			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
@@ -88,9 +87,8 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
 		return CustomResourceTarget{}, nil
 	}

-	gpuAllocatable, found := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
-	if found && gpuAllocatable.Value() > 0 {
-		return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
+	if gpuAllocatableValue, hasGpuAllocatable := gpu.NodeHasGpuAllocatable(node); hasGpuAllocatable {
+		return CustomResourceTarget{gpuLabel, gpuAllocatableValue}, nil
 	}

 	// A node is supposed to have GPUs (based on label), but they're not available yet
@@ -115,8 +113,10 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
 		klog.Errorf("Failed to build template for getting GPU estimation for node %v: %v", node.Name, err)
 		return CustomResourceTarget{}, errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
-	if gpuCapacity, found := template.Node().Status.Capacity[gpu.ResourceNvidiaGPU]; found {
-		return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
+	for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
+		if gpuCapacity, found := template.Node().Status.Capacity[gpuVendorResourceName]; found {
+			return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
+		}
 	}

 	// if template does not define gpus we assume node will not have any even if ith has gpu label
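
For illustration only (not part of this commit), here is a minimal sketch of how the simplified readiness check above behaves once gpu.NodeHasGpuAllocatable is available. It assumes the merged cluster-autoscaler module is importable, and the label key "example.com/accelerator" is a hypothetical stand-in for the cloud provider's GPULabel(); a node carrying the GPU label but exposing no non-zero vendor resource in status.allocatable is treated as having an unready GPU.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

// hasUnreadyGpu mirrors the condition used in FilterOutNodesWithUnreadyResources:
// the node is labeled as a GPU node, but no vendor resource is allocatable yet.
func hasUnreadyGpu(gpuLabel string, node *apiv1.Node) bool {
	_, hasGpuLabel := node.Labels[gpuLabel]
	_, hasAnyGpuAllocatable := gpu.NodeHasGpuAllocatable(node)
	return hasGpuLabel && !hasAnyGpuAllocatable
}

func main() {
	// "example.com/accelerator" is a hypothetical label, not a real provider's GPULabel().
	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "gaudi-node",
			Labels: map[string]string{"example.com/accelerator": "gaudi2"},
		},
	}
	// The device plugin has not advertised allocatable capacity yet,
	// so the node should be flagged as having an unready GPU.
	fmt.Println(hasUnreadyGpu("example.com/accelerator", node)) // true
}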

cluster-autoscaler/utils/gpu/gpu.go

Lines changed: 14 additions & 4 deletions
@@ -26,6 +26,8 @@ import (
 )

 const (
+	// ResourceIntelGaudi is the name of the Intel Gaudi resource.
+	ResourceIntelGaudi = "habana.ai/gaudi"
 	// ResourceAMDGPU is the name of the AMD GPU resource.
 	ResourceAMDGPU = "amd.com/gpu"
 	// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
@@ -41,6 +43,7 @@ const (
 // Extend this slice if new vendor resource names are added.
 var GPUVendorResourceNames = []apiv1.ResourceName{
 	ResourceNvidiaGPU,
+	ResourceIntelGaudi,
 	ResourceAMDGPU,
 	ResourceDirectX,
 }
@@ -123,13 +126,20 @@ func NodeHasGpu(GPULabel string, node *apiv1.Node) bool {
 		return true
 	}
 	// Check for extended resources as well
+	_, hasGpuAllocatable := NodeHasGpuAllocatable(node)
+	return hasGpuAllocatable
+}
+
+// NodeHasGpuAllocatable returns the GPU allocatable value and whether the node has GPU allocatable resources.
+// It checks all known GPU vendor resource names and returns the first non-zero allocatable GPU value found.
+func NodeHasGpuAllocatable(node *apiv1.Node) (gpuAllocatableValue int64, hasGpuAllocatable bool) {
 	for _, gpuVendorResourceName := range GPUVendorResourceNames {
-		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName]
-		if hasGpuAllocatable && !gpuAllocatable.IsZero() {
-			return true
+		gpuAllocatable, found := node.Status.Allocatable[gpuVendorResourceName]
+		if found && !gpuAllocatable.IsZero() {
+			return gpuAllocatable.Value(), true
 		}
 	}
-	return false
+	return 0, false
 }

 // PodRequestsGpu returns true if a given pod has GPU request.
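
As a quick usage sketch (again not part of the diff, and assuming the merged cluster-autoscaler module is importable), the new helper can be exercised against a fake node that advertises a single habana.ai/gaudi device through status.allocatable; it reports the first non-zero vendor resource it finds.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	// Fake node whose device plugin has advertised one Intel Gaudi accelerator.
	gaudiNode := &apiv1.Node{
		Status: apiv1.NodeStatus{
			Allocatable: apiv1.ResourceList{
				gpu.ResourceIntelGaudi: resource.MustParse("1"),
			},
		},
	}
	count, ok := gpu.NodeHasGpuAllocatable(gaudiNode)
	fmt.Printf("allocatable=%d found=%t\n", count, ok) // expected: allocatable=1 found=true

	// A node with no vendor resources yields (0, false).
	emptyNode := &apiv1.Node{}
	count, ok = gpu.NodeHasGpuAllocatable(emptyNode)
	fmt.Printf("allocatable=%d found=%t\n", count, ok) // expected: allocatable=0 found=false
}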

0 commit comments
