fix: add clear_grammar to remove grammar from reused model_request

windreamer · windreamer · commit 0e26d72ed895 · 2025-12-10T21:04:15.000+08:00
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
@@ -814,6 +814,8 @@ async def async_stream_infer(self,
             while not state or state.status == 0:
                 await sem.acquire()
                 state = shared_state.consume()
+
+            self.model_inst.clear_grammar()
             logger.info(f'[async_stream_infer] session {session_id} done')
 
     def _get_error_output(self, status):
diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc
@@ -152,4 +152,9 @@ void ModelRequest::setGrammar(const xgrammar::CompiledGrammar& grammar)
     grammar_ = std::make_shared<xgrammar::CompiledGrammar>(grammar);
 }
 
+void ModelRequest::clearGrammar()
+{
+    grammar_.reset();
+}
+
 }  // namespace turbomind
diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h
@@ -41,6 +41,7 @@ class ModelRequest {
 
     OutputParam Forward(InputParam param, std::function<void()> cb);
     void        setGrammar(const xgrammar::CompiledGrammar& grammar);
+    void        clearGrammar();
 
 protected:
     Gateway* const gateway_;
diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp
@@ -498,7 +498,15 @@ PYBIND11_MODULE(_turbomind, m)
                 model_request->setGrammar(grammar);
             },
             py::call_guard<py::gil_scoped_release>(),
-            "grammar"_a);
+            "grammar"_a)
+        .def(
+            "clear_grammar",
+            [](ModelRequest* model_request) {
+                TM_LOG_INFO("Release grammar for model_request");
+                model_request->clearGrammar();
+            },
+            py::call_guard<py::gil_scoped_release>());
+
 
     // transformer model
     using ft::LlamaTritonModel;

Original file line number	Diff line number	Diff line change
`@@ -152,4 +152,9 @@ void ModelRequest::setGrammar(const xgrammar::CompiledGrammar& grammar)`
`152`	`152`	`grammar_ = std::make_shared<xgrammar::CompiledGrammar>(grammar);`
`153`	`153`	`}`
`154`	`154`
	`155`	`+void ModelRequest::clearGrammar()`
	`156`	`+{`
	`157`	`+ grammar_.reset();`
	`158`	`+}`
	`159`	`+`
`155`	`160`	`} // namespace turbomind`