Add Intel GPU (Habana Gaudi) autoscaler support

DorWeinstock · DorWeinstock · commit 5873c7f81461 · 2025-11-25T19:41:46.000+02:00
Add support for Intel Habana Gaudi GPUs in the cluster autoscaler by:
- Defining the ResourceIntelGaudi resource name (habana.ai/gaudi)
- Adding ResourceIntelGaudi to the GPUVendorResourceNames list
- Refactoring GPU detection logic to iterate through all GPU vendor resource names
    instead of checking vendors individually

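This enables the autoscaler to properly detect and handle Intel Gaudi nodes
alongside existing NVIDIA, AMD, and DirectX GPU support. Because the GPU
processor now iterates over GPUVendorResourceNames, checks that previously
looked only at NVIDIA (or only NVIDIA and DirectX) resources now cover every
vendor in that list.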
diff --git a/cluster-autoscaler/processors/customresources/gpu_processor.go b/cluster-autoscaler/processors/customresources/gpu_processor.go
@@ -48,9 +48,15 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(autosca
 		}
 
 		_, hasGpuLabel := node.Labels[autoscalingCtx.CloudProvider.GPULabel()]
-		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
-		directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
-		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
+		hasAnyGpuAllocatable := false
+		for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
+			gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName]
+			if hasGpuAllocatable && !gpuAllocatable.IsZero() {
+				hasAnyGpuAllocatable = true
+				break
+			}
+		}
+		if hasGpuLabel && !hasAnyGpuAllocatable {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
 			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
@@ -88,9 +94,10 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
 		return CustomResourceTarget{}, nil
 	}
 
-	gpuAllocatable, found := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
-	if found && gpuAllocatable.Value() > 0 {
-		return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
+	for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
+		if gpuAllocatable, found := node.Status.Allocatable[gpuVendorResourceName]; found && gpuAllocatable.Value() > 0 {
+			return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
+		}
 	}
 
 	// A node is supposed to have GPUs (based on label), but they're not available yet
@@ -115,8 +122,10 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
 		klog.Errorf("Failed to build template for getting GPU estimation for node %v: %v", node.Name, err)
 		return CustomResourceTarget{}, errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
-	if gpuCapacity, found := template.Node().Status.Capacity[gpu.ResourceNvidiaGPU]; found {
-		return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
+	for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
+		if gpuCapacity, found := template.Node().Status.Capacity[gpuVendorResourceName]; found {
+			return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
+		}
 	}
 
 	// if template does not define gpus we assume node will not have any even if ith has gpu label
diff --git a/cluster-autoscaler/utils/gpu/gpu.go b/cluster-autoscaler/utils/gpu/gpu.go
@@ -26,6 +26,8 @@ import (
 )
 
 const (
+	// ResourceIntelGaudi is the name of the Intel Gaudi resource.
+	ResourceIntelGaudi = "habana.ai/gaudi"
 	// ResourceAMDGPU is the name of the AMD GPU resource.
 	ResourceAMDGPU = "amd.com/gpu"
 	// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
@@ -41,6 +43,7 @@ const (
 // Extend this slice if new vendor resource names are added.
 var GPUVendorResourceNames = []apiv1.ResourceName{
 	ResourceNvidiaGPU,
+	ResourceIntelGaudi,
 	ResourceAMDGPU,
 	ResourceDirectX,
 }