 from tpu_inference.utils import to_jax_dtype, to_torch_dtype

 if TYPE_CHECKING:
-    from vllm.attention.backends.registry import _Backend
+    from vllm.attention.backends.registry import AttentionBackendEnum
     from vllm.config import BlockSize, ModelConfig, VllmConfig
     from vllm.pooling_params import PoolingParams
 else:
     BlockSize = None
     ModelConfig = None
     VllmConfig = None
     PoolingParams = None
-    _Backend = None
+    AttentionBackendEnum = None

 logger = init_logger(__name__)

@@ -48,13 +48,13 @@ class TpuPlatform(Platform):
     ]

     @classmethod
-    def get_attn_backend_cls(cls, selected_backend: "_Backend", head_size: int,
-                             dtype: jnp.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool, use_mla: bool,
-                             has_sink: bool, use_sparse: bool,
-                             attn_type: Any) -> str:
-        from vllm.attention.backends.registry import _Backend
-        if selected_backend != _Backend.PALLAS:
+    def get_attn_backend_cls(cls, selected_backend: "AttentionBackendEnum",
+                             head_size: int, dtype: jnp.dtype,
+                             kv_cache_dtype: Optional[str], block_size: int,
+                             use_v1: bool, use_mla: bool, has_sink: bool,
+                             use_sparse: bool, attn_type: Any) -> str:
+        from vllm.attention.backends.registry import AttentionBackendEnum
+        if selected_backend != AttentionBackendEnum.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)

         if use_v1:
0 commit comments