@@ -38,6 +38,11 @@ def create_parser():
     parser.set_defaults(model="meta-llama/Llama-3.1-8B")
     parser.set_defaults(max_model_len=1024)
 
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
     return parser
 
 
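As a quick sanity check, here is a minimal sketch of how the new flags surface in the parsed namespace (the argv values are made up for illustration; unset flags come back as `None`, which is what `setup_llm` below tests for):

```python
# Hypothetical invocation of the parser built above.
parser = create_parser()
args = parser.parse_args(["--max-tokens", "32", "--top-p", "0.95"])
assert args.max_tokens == 32 and args.top_p == 0.95
assert args.top_k is None  # unset flags default to None
```
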
@@ -46,14 +51,24 @@ def setup_llm(llm_args: dict) -> Tuple[LLM, SamplingParams]:
     Initializes a vLLM engine and sampling parameters from the given args.
     """
     args_copy = copy.deepcopy(llm_args)
+    # Pop arguments not used by LLM
+    max_tokens = args_copy.pop("max_tokens")
+    top_p = args_copy.pop("top_p")
+    top_k = args_copy.pop("top_k")
+
     # Create an LLM. The --seed argument is passed in via **args.
     llm = LLM(**args_copy)
 
-    # Create a sampling params
-    sampling_params = SamplingParams(temperature=0,
-                                     max_tokens=20,
-                                     seed=42,
-                                     ignore_eos=True)
+    # Create a sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    sampling_params.temperature = 0
+    sampling_params.ignore_eos = True
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
 
     return llm, sampling_params
 
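For context, a minimal sketch of how the two helpers might be wired together after this change. The driver function `main` and the prompt are hypothetical; `llm.generate` and the `RequestOutput` fields follow vLLM's standard offline-inference API:

```python
# Hypothetical driver, assuming llm_args is the parsed namespace as a dict.
def main():
    args = create_parser().parse_args()
    llm, sampling_params = setup_llm(vars(args))
    # Generate with the CLI-controlled sampling parameters.
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    for output in outputs:
        print(output.outputs[0].text)
```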