rm saving behavior

juncgu-google · juncgu-google · commit a5ec87d47647 · 2025-12-06T04:50:30.000Z
Signed-off-by: Juncheng Gu <jcgu@google.com>
diff --git a/tpu_inference/distributed/offload/tpu_offload_connector.py b/tpu_inference/distributed/offload/tpu_offload_connector.py
@@ -128,9 +128,6 @@
 # kv cache layout needed by cpu offloading mechanism
 REQUIRED_KV_CACHE_LAYOUT = "NHD"
 
-# default swap op type
-DEFAULT_HOST_HBM_SWAP_OP_TYPE = "jax"
-
 BLOCK_SIZE_BUCKETS = [1, 2, 4, 8, 16]
 
 # we keep our operations at vllm's block granularity,
@@ -139,9 +136,7 @@
 # 1. [supported] drop: drop the entire partial block
 # 2. pad: pad to a full block
 # 3. dynamic: keep the partial block as is.
-PARTIAL_BLOCK_SAVE_BEHAVIOR = Literal["drop", "pad", "dynamic"]
-
-DEFAULT_TPU_OFFLOAD_CPU_CHUNKS = 1024
+PARTIAL_BLOCK_SAVE_BEHAVIOR = Literal["drop"]
 
 
 @dataclass
@@ -512,24 +507,7 @@ def __init__(self, vllm_config: "VllmConfig"):
         #     real-chunk-size in save and load
         self.cpu_chunk_size = self.block_size
 
-        # TODO(jcgu): rm
-        # define partial_block saving behavior
-        self.partial_block_save_behavior: PARTIAL_BLOCK_SAVE_BEHAVIOR = \
-            os.getenv("TPU_OFFLOAD_PARTIAL_BLOCK_SAVE_BEHAVIOR", "drop")
-        assert self.partial_block_save_behavior in get_args(
-            PARTIAL_BLOCK_SAVE_BEHAVIOR
-        ), f"{self.partial_block_save_behavior} not in {get_args(PARTIAL_BLOCK_SAVE_BEHAVIOR)}"
-        self.partial_block_dynamic_pad_lower_limit = \
-            int(os.getenv("TPU_OFFLOAD_PARTIAL_BLOCK_DYNAMIC_PAD_LOWER_LIMIT", "0"))
-        if self.partial_block_save_behavior == "dynamic":
-            if self.partial_block_dynamic_pad_lower_limit <= 0:
-                self.partial_block_save_behavior == "drop"
-            elif self.partial_block_dynamic_pad_lower_limit >= self.block_size:
-                self.partial_block_save_behavior == "pad"
-        logger.info(
-            f" partial_block_save_behavior is configed to {self.partial_block_save_behavior}, but we only support drop now."
-        )
-        self.partial_block_save_behavior = "drop"
+        self.partial_block_save_behavior: PARTIAL_BLOCK_SAVE_BEHAVIOR = "drop"
 
         # config staging buffer
         # NOTE(jcgu): Need to find a way to grab page_size_bytes in scheduler
@@ -547,7 +525,6 @@ def __init__(self, vllm_config: "VllmConfig"):
             f"model_name={model_name}, "
             f"decode_save={self.decode_save}, "
             f"partial_block_save_behavior={self.partial_block_save_behavior}, "
-            f"partial_block_dynamic_pad_lower_limit={self.partial_block_dynamic_pad_lower_limit}, "
             f"num_staging_blocks={self.num_staging_blocks}.")
 
     def _get_request_block_hashes(self, req: "Request") -> list[BlockHash]:
@@ -668,27 +645,6 @@ def get_num_new_matched_tokens(
         # external_computed_tokens, load_kv_async
         return num_to_load, False
 
-    def _adjust_last_partial_block(self,
-                                   last_partial_block_num_tokens: int) -> bool:
-        """
-        adjust prompt token / len based on pre-configed save behavior
-        when the last block of request's token is partially used.
-        In order to keep all the saved kv be aligned with block_size,
-        we may
-         1. drop the partial block
-         2. pad the partial block to be a full block
-         3. drop or pad based on actual num_tokens in the last partial block
-
-        Input: num of tokens in the last partial block (could be 0)
-        Output: the last partial block should be kept (True) or dropped (False)
-        """
-        if self.partial_block_save_behavior == "pad":
-            return True if last_partial_block_num_tokens > 0 else False
-        elif self.partial_block_save_behavior == "drop":
-            return False
-        elif self.partial_block_save_behavior == "dynamic":
-            return True if last_partial_block_num_tokens >= self.partial_block_dynamic_pad_lower_limit else False
-
     def update_state_after_alloc(self, request: "Request",
                                  blocks: "KVCacheBlocks",
                                  num_external_tokens: int):
diff --git a/tpu_inference/distributed/offload/utils.py b/tpu_inference/distributed/offload/utils.py
@@ -25,8 +25,6 @@
 
 CPU_OFFLOADING_SWAP_OP_TYPE = Literal["jax", "pallas"]
 
-DEFAULT_TPU_OFFLOAD_STAGING_BUFFER_TOKENS = 8192
-
 
 @dataclass(order=True)
 class CacheKey:
@@ -110,10 +108,6 @@ def get_kv_connector_cache_layout():
     return None
 
 
-def get_default_kv_connector_staging_buffer_tokens() -> int:
-    return DEFAULT_TPU_OFFLOAD_STAGING_BUFFER_TOKENS
-
-
 SwapFn = Callable[
     [
         List[jax.Array],  # src_kv_caches
diff --git a/tpu_inference/worker/tpu_worker.py b/tpu_inference/worker/tpu_worker.py
@@ -26,8 +26,6 @@
 
 from tpu_inference import envs, utils
 from tpu_inference.distributed import jax_parallel_state
-from tpu_inference.distributed.offload.utils import \
-    get_default_kv_connector_staging_buffer_tokens
 from tpu_inference.distributed.utils import (get_host_ip, get_kv_transfer_port,
                                              get_node_id)
 from tpu_inference.layers.common.sharding import ShardingConfigManager
@@ -296,11 +294,7 @@ def determine_available_memory(self) -> int:
             kv_transfer_config = self.vllm_config.kv_transfer_config
             if kv_transfer_config.kv_connector == "TPUOffloadConnector" and kv_transfer_config.kv_connector_module_path == "tpu_inference.distributed.offload.tpu_offload_connector":
                 # If kv offloading is enabled, we need to account for the memory used by the KV transfer buffer.
-                _default_staging_buffer_tokens = get_default_kv_connector_staging_buffer_tokens(
-                )
-                staging_buffer_tokens = int(
-                    os.getenv("TPU_OFFLOAD_STAGING_BUFFER_TOKENS",
-                              str(_default_staging_buffer_tokens)))
+                staging_buffer_tokens = envs.TPU_OFFLOAD_STAGING_BUFFER_TOKENS
                 # calculate staging buffer size
                 staging_buffer_pages = staging_buffer_tokens // self.vllm_config.cache_config.block_size