
Commit ffcbfee

Merge pull request #8853 from DorWeinstock/add-intel-gaudi-support
Add Intel GPU (Habana Gaudi) autoscaler support
2 parents 7b95cb0 + cc49907 commit ffcbfee

2 files changed: +22 −12 lines changed

cluster-autoscaler/processors/customresources/gpu_processor.go

Lines changed: 8 additions & 8 deletions
@@ -48,9 +48,8 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(autosca
 		}

 		_, hasGpuLabel := node.Labels[autoscalingCtx.CloudProvider.GPULabel()]
-		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
-		directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
-		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
+		_, hasAnyGpuAllocatable := gpu.NodeHasGpuAllocatable(node)
+		if hasGpuLabel && !hasAnyGpuAllocatable {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
 			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
@@ -88,9 +87,8 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
 		return CustomResourceTarget{}, nil
 	}

-	gpuAllocatable, found := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
-	if found && gpuAllocatable.Value() > 0 {
-		return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
+	if gpuAllocatableValue, hasGpuAllocatable := gpu.NodeHasGpuAllocatable(node); hasGpuAllocatable {
+		return CustomResourceTarget{gpuLabel, gpuAllocatableValue}, nil
 	}

 	// A node is supposed to have GPUs (based on label), but they're not available yet
@@ -115,8 +113,10 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(autoscalingCtx *ca_contex
 		klog.Errorf("Failed to build template for getting GPU estimation for node %v: %v", node.Name, err)
 		return CustomResourceTarget{}, errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
-	if gpuCapacity, found := template.Node().Status.Capacity[gpu.ResourceNvidiaGPU]; found {
-		return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
+	for _, gpuVendorResourceName := range gpu.GPUVendorResourceNames {
+		if gpuCapacity, found := template.Node().Status.Capacity[gpuVendorResourceName]; found {
+			return CustomResourceTarget{gpuLabel, gpuCapacity.Value()}, nil
+		}
 	}

 	// if template does not define gpus we assume node will not have any even if ith has gpu label
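
For illustration only (not part of this commit), here is a minimal sketch of how the simplified readiness check above behaves once gpu.NodeHasGpuAllocatable is available. It assumes the merged cluster-autoscaler module is importable, and the label key "example.com/accelerator" is a hypothetical stand-in for the cloud provider's GPULabel(); a node carrying the GPU label but exposing no non-zero vendor resource in status.allocatable is treated as having an unready GPU.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

// hasUnreadyGpu mirrors the condition used in FilterOutNodesWithUnreadyResources:
// the node is labeled as a GPU node, but no vendor resource is allocatable yet.
func hasUnreadyGpu(gpuLabel string, node *apiv1.Node) bool {
	_, hasGpuLabel := node.Labels[gpuLabel]
	_, hasAnyGpuAllocatable := gpu.NodeHasGpuAllocatable(node)
	return hasGpuLabel && !hasAnyGpuAllocatable
}

func main() {
	// "example.com/accelerator" is a hypothetical label, not a real provider's GPULabel().
	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "gaudi-node",
			Labels: map[string]string{"example.com/accelerator": "gaudi2"},
		},
	}
	// The device plugin has not advertised allocatable capacity yet,
	// so the node should be flagged as having an unready GPU.
	fmt.Println(hasUnreadyGpu("example.com/accelerator", node)) // true
}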

cluster-autoscaler/utils/gpu/gpu.go

Lines changed: 14 additions & 4 deletions
@@ -26,6 +26,8 @@ import (
 )

 const (
+	// ResourceIntelGaudi is the name of the Intel Gaudi resource.
+	ResourceIntelGaudi = "habana.ai/gaudi"
 	// ResourceAMDGPU is the name of the AMD GPU resource.
 	ResourceAMDGPU = "amd.com/gpu"
 	// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
@@ -41,6 +43,7 @@ const (
 // Extend this slice if new vendor resource names are added.
 var GPUVendorResourceNames = []apiv1.ResourceName{
 	ResourceNvidiaGPU,
+	ResourceIntelGaudi,
 	ResourceAMDGPU,
 	ResourceDirectX,
 }
@@ -123,13 +126,20 @@ func NodeHasGpu(GPULabel string, node *apiv1.Node) bool {
 		return true
 	}
 	// Check for extended resources as well
+	_, hasGpuAllocatable := NodeHasGpuAllocatable(node)
+	return hasGpuAllocatable
+}
+
+// NodeHasGpuAllocatable returns the GPU allocatable value and whether the node has GPU allocatable resources.
+// It checks all known GPU vendor resource names and returns the first non-zero allocatable GPU value found.
+func NodeHasGpuAllocatable(node *apiv1.Node) (gpuAllocatableValue int64, hasGpuAllocatable bool) {
 	for _, gpuVendorResourceName := range GPUVendorResourceNames {
-		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpuVendorResourceName]
-		if hasGpuAllocatable && !gpuAllocatable.IsZero() {
-			return true
+		gpuAllocatable, found := node.Status.Allocatable[gpuVendorResourceName]
+		if found && !gpuAllocatable.IsZero() {
+			return gpuAllocatable.Value(), true
 		}
 	}
-	return false
+	return 0, false
 }

 // PodRequestsGpu returns true if a given pod has GPU request.
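
As a quick usage sketch (again not part of the diff, and assuming the merged cluster-autoscaler module is importable), the new helper can be exercised against a fake node that advertises a single habana.ai/gaudi device through status.allocatable; it reports the first non-zero vendor resource it finds.

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	// Fake node whose device plugin has advertised one Intel Gaudi accelerator.
	gaudiNode := &apiv1.Node{
		Status: apiv1.NodeStatus{
			Allocatable: apiv1.ResourceList{
				gpu.ResourceIntelGaudi: resource.MustParse("1"),
			},
		},
	}
	count, ok := gpu.NodeHasGpuAllocatable(gaudiNode)
	fmt.Printf("allocatable=%d found=%t\n", count, ok) // expected: allocatable=1 found=true

	// A node with no vendor resources yields (0, false).
	emptyNode := &apiv1.Node{}
	count, ok = gpu.NodeHasGpuAllocatable(emptyNode)
	fmt.Printf("allocatable=%d found=%t\n", count, ok) // expected: allocatable=0 found=false
}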

0 commit comments
