Commit c9f2005

Support vllm serve (#21)
Signed-off-by: aerdem4 <ahmeterd4@gmail.com>
1 parent df7f158 commit c9f2005

File tree: 7 files changed, +224 -32 lines changed

Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "59f98cf9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/home/aerdem/projects/nvidia/logits-processor-zoo\n"
+     ]
+    }
+   ],
+   "source": [
+    "%cd ../.."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f2a86616",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run vllm serve like this:\n",
+    "# vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype auto --api-key lpz-test --logits-processor-pattern \"logits_processor_zoo.vllm\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "13f407ff",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fried rice chicken is a popular Chinese dish that combines the flavors of fried rice with the tender texture and juicy meat of chicken. Here's a basic recipe to help you make it at home:\n",
+      "\n",
+      "### Ingredients:\n",
+      "- 1 pound boneless skinless chicken breast or thighs (cut into bite-sized pieces)\n",
+      "- 2 tablespoons vegetable oil\n",
+      "- 3 cloves garlic, minced\n",
+      "- 1 tablespoon ginger, grated\n",
+      "- 1/4 cup soy sauce\n",
+      "- 1/4 cup oyster sauce\n",
+      "- 1 teaspoon sugar\n",
+      "- 1/2 teaspoon salt\n",
+      "- 1/4 teaspoon black pepper\n",
+      "- 1 can (8 oz) condensed cream of mushroom soup\n",
+      "- 1 cup frozen mixed vegetables (such as peas, carrots, corn)\n",
+      "- 1/2 cup chopped green onions\n",
+      "- 1/4 cup chopped cilantro\n",
+      "\n",
+      "### Instructions:\n",
+      "\n",
+      "#### Step 1: Prepare the Chicken\n",
+      "1. **Marinate the Chicken:** In a bowl, mix together the chicken, soy sauce, oyster sauce, sugar, salt, and black pepper.\n",
+      "2. **Cook the Chicken:** Heat the vegetable oil in a large skillet over medium-high heat. Add the marinated chicken and cook until browned on all sides, about 5 minutes per side. Remove from the pan and set aside.\n",
+      "\n",
+      "#### Step 2: Cook the Vegetables\n",
+      "1. **Sauté the Vegetables:** In the same skillet, add the remaining 1 tablespoon of oil. Sauté the minced garlic and grated ginger for about 30 seconds until fragrant.\n",
+      "2. **Add the Mixed Vegetables:** Stir in the frozen mixed vegetables and sauté until they start to soften, about 2-3 minutes.\n",
+      "3. **Combine Everything:** Return the cooked chicken to the skillet along with the sautéed vegetables. Pour in the condensed cream of mushroom soup and stir well to combine everything.\n",
+      "\n",
+      "#### Step 3: Finish Cooking\n",
+      "1. **Simmer the Sauce:** Bring the mixture to a simmer over low heat. Let it cook for about 5 minutes, stirring occasionally, until the sauce thickens slightly.\n",
+      "2. **Serve:** Garnish with chopped green onions and cilantro before serving. This dish can be served hot or cold depending on your preference.\n",
+      "\n",
+      "Enjoy your homemade fried rice chicken! Adjust the seasoning according to your taste preferences.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from openai import OpenAI\n",
+    "\n",
+    "model_name = \"Qwen/Qwen2.5-1.5B-Instruct\"\n",
+    "\n",
+    "client = OpenAI(\n",
+    "    base_url=\"http://localhost:8000/v1\",\n",
+    "    api_key=\"lpz-test\",\n",
+    ")\n",
+    "\n",
+    "completion = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"Can you explain how fried rice chicken is cooked?\"}\n",
+    "    ], \n",
+    "    temperature=0,\n",
+    "    top_p=1\n",
+    ")\n",
+    "\n",
+    "print(completion.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6227231c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fried rice chicken is a popular Chinese dish that combines the flavors of fried rice with the tender texture and juicy meat of chicken. Here's a basic recipe to help you make it at home:\n",
+      "\n",
+      "### Ingredients:\n",
+      "- 1 pound boneless skinless chicken breast or thighs (cut into bite-sized pieces)\n",
+      "- 2 tablespoons vegetable oil\n",
+      "- 3 cloves garlic, minced\n",
+      "- 1 tablespoon ginger, grated\n",
+      "- 1/4 cup soy sauce\n",
+      "- 1/4 cup oyster sauce\n",
+      "- 1 teaspoon sugar\n",
+      "- 1/2 teaspoon salt\n",
+      "- 1/4 teaspoon black pepper\n",
+      "- 1 can (8 oz) condensed cream of mushroom soup\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "completion = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"Can you explain how fried rice chicken is cooked?\"}\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    top_p=1,\n",
+    "    extra_body={\n",
+    "        \"logits_processors\": [{\n",
+    "            \"qualname\": \"logits_processor_zoo.vllm.GenLengthLogitsProcessor\",\n",
+    "            \"kwargs\": {\"tokenizer\": model_name, \"boost_factor\": 0.2, \"complete_sentences\": True}\n",
+    "        }]\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "print(completion.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96544ec2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
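
The notebook above captures the intended workflow: start the server with --logits-processor-pattern "logits_processor_zoo.vllm", then pass a qualname and constructor kwargs through extra_body. As a hedged sketch, the same request shape should carry over to the other processors touched by this commit, for example CiteFromPromptLogitsProcessor; the boost_factor value below is illustrative, and the sketch assumes the class is exported from logits_processor_zoo.vllm in the same way as GenLengthLogitsProcessor.

# Hedged sketch: reuses the client and model_name defined in the notebook cells above.
# The kwargs mirror CiteFromPromptLogitsProcessor's constructor; the values are illustrative.
completion = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "user", "content": "Can you explain how fried rice chicken is cooked?"}
    ],
    temperature=0,
    top_p=1,
    extra_body={
        "logits_processors": [{
            "qualname": "logits_processor_zoo.vllm.CiteFromPromptLogitsProcessor",
            "kwargs": {"tokenizer": model_name, "boost_factor": 1.0}
        }]
    }
)
print(completion.choices[0].message.content)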

logits_processor_zoo/vllm/cite_prompt.py

Lines changed: 7 additions & 4 deletions
@@ -15,9 +15,9 @@
 # limitations under the License.
 #
 
-from typing import List
+from typing import List, Union
 import torch
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, AutoTokenizer
 
 
 class CiteFromPromptLogitsProcessor:
@@ -33,11 +33,14 @@ class CiteFromPromptLogitsProcessor:
         boost_eos (bool, optional): If True, boosts EOS token too.
         conditional_boost_factor (float, optional): A factor to boost the likelihood of the tokens based on previous token.
     """
-    def __init__(self, tokenizer: PreTrainedTokenizer, boost_factor: float = 1.0, boost_eos: bool = True,
+    def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], boost_factor: float = 1.0, boost_eos: bool = True,
                  conditional_boost_factor: float = 0.0):
         self.tokenizer = tokenizer
+        if isinstance(self.tokenizer, str):
+            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
+
         self.boost_factor = boost_factor
-        self.eos_token_id = tokenizer.eos_token_id
+        self.eos_token_id = self.tokenizer.eos_token_id
         self.boost_eos = boost_eos
         self.conditional_boost_factor = conditional_boost_factor
 
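
A minimal sketch of what this change enables, assuming CiteFromPromptLogitsProcessor is importable from logits_processor_zoo.vllm as the qualname pattern in the notebook suggests: the tokenizer argument may now be either a loaded PreTrainedTokenizer or a model-name string, which is what the JSON-serializable kwargs path used by vllm serve requires.

# Minimal sketch of the two accepted tokenizer forms after this change.
from transformers import AutoTokenizer
from logits_processor_zoo.vllm import CiteFromPromptLogitsProcessor

# Existing usage: pass an already-loaded tokenizer object.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
lp_from_object = CiteFromPromptLogitsProcessor(tokenizer, boost_factor=1.0)

# New usage: pass the model name; the processor loads the tokenizer itself.
lp_from_string = CiteFromPromptLogitsProcessor("Qwen/Qwen2.5-1.5B-Instruct", boost_factor=1.0)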

logits_processor_zoo/vllm/generation_length.py

Lines changed: 12 additions & 8 deletions
@@ -15,9 +15,9 @@
 # limitations under the License.
 #
 
-from typing import List
+from typing import List, Union
 import torch
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, AutoTokenizer
 from logits_processor_zoo.utils import text_to_token
 
 
@@ -36,18 +36,22 @@ class GenLengthLogitsProcessor:
                                             or a new line. Default is False.
         boost_token_str (str, optional): A string to be tokenized and used instead of EOS. Especially useful for </think>.
     """
-    def __init__(self, tokenizer: PreTrainedTokenizer, boost_factor: float,
+    def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], boost_factor: float,
                  p: int = 2, complete_sentences: bool = False, boost_token_str: str = None):
-        self.boost_token = tokenizer.eos_token_id
+
+        self.tokenizer = tokenizer
+        if isinstance(self.tokenizer, str):
+            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
+
+        self.boost_token = self.tokenizer.eos_token_id
         self.boost_token_str = boost_token_str
         if boost_token_str is not None:
-            self.boost_token = text_to_token(tokenizer, boost_token_str, last=False)
+            self.boost_token = text_to_token(self.tokenizer, boost_token_str, last=False)
         self.boost_factor = boost_factor
         self.p = p
-        self.full_stop_token = text_to_token(tokenizer, "It is a sentence.", last=True)
-        self.new_line_token = text_to_token(tokenizer, "It is a new line\n", last=True)
+        self.full_stop_token = text_to_token(self.tokenizer, "It is a sentence.", last=True)
+        self.new_line_token = text_to_token(self.tokenizer, "It is a new line\n", last=True)
         self.complete_sentences = complete_sentences
-        self.tokenizer = tokenizer
 
     def clone(self):
         return GenLengthLogitsProcessor(self.tokenizer, self.boost_factor, self.p,
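
The same string-or-object handling applies here. A minimal construction sketch that mirrors the kwargs sent over the server API in the notebook; the import path is assumed from the qualname used there.

# Sketch: GenLengthLogitsProcessor built from a model-name string, mirroring the
# {"tokenizer": model_name, "boost_factor": 0.2, "complete_sentences": True}
# kwargs used in the server request above.
from logits_processor_zoo.vllm import GenLengthLogitsProcessor

lp = GenLengthLogitsProcessor(
    "Qwen/Qwen2.5-1.5B-Instruct",  # resolved internally via AutoTokenizer.from_pretrained
    boost_factor=0.2,
    complete_sentences=True,
)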

logits_processor_zoo/vllm/last_phrase.py

Lines changed: 9 additions & 6 deletions
@@ -15,8 +15,8 @@
 # limitations under the License.
 #
 
-from transformers import PreTrainedTokenizer
-from typing import List
+from transformers import PreTrainedTokenizer, AutoTokenizer
+from typing import List, Union
 import torch
 from logits_processor_zoo.utils import enforce_tokens
 
@@ -31,12 +31,15 @@ class ForceLastPhraseLogitsProcessor:
         phrase (str): The phrase to be generated by LLM before the end of its speech.
         tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM.
     """
-    def __init__(self, phrase: str, tokenizer: PreTrainedTokenizer):
-        self.eos_token_id = tokenizer.eos_token_id
-        self.phrase_tokens = tokenizer.encode(phrase, add_special_tokens=False)
+    def __init__(self, phrase: str, tokenizer: Union[PreTrainedTokenizer, str]):
+        self.tokenizer = tokenizer
+        if isinstance(self.tokenizer, str):
+            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
+
+        self.eos_token_id = self.tokenizer.eos_token_id
+        self.phrase_tokens = self.tokenizer.encode(phrase, add_special_tokens=False)
         self._reset()
         self.phrase = phrase
-        self.tokenizer = tokenizer
 
         # LogitsProcessor can contain a clone attribute to deep copy it
         # https://github.com/vllm-project/vllm/blob/19dcc02a72e3ed52e3bf95aae44ea1f40ce42ea0/vllm/sampling_params.py#L537-L550
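
With the tokenizer accepted as a string, this processor can also be configured purely through JSON kwargs over the server API. A hedged sketch, reusing the client and model_name from the notebook and assuming the class is exported from logits_processor_zoo.vllm; the phrase value is illustrative.

# Hedged sketch: force a closing phrase via the serialized kwargs path.
completion = client.chat.completions.create(
    model=model_name,
    messages=[{"role": "user", "content": "Can you explain how fried rice chicken is cooked?"}],
    temperature=0,
    top_p=1,
    extra_body={
        "logits_processors": [{
            "qualname": "logits_processor_zoo.vllm.ForceLastPhraseLogitsProcessor",
            "kwargs": {"phrase": "\n\nBon appetit!\n", "tokenizer": model_name}
        }]
    }
)
print(completion.choices[0].message.content)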

logits_processor_zoo/vllm/multiple_choice.py

Lines changed: 9 additions & 6 deletions
@@ -15,8 +15,8 @@
 # limitations under the License.
 #
 
-from transformers import PreTrainedTokenizer
-from typing import List
+from transformers import PreTrainedTokenizer, AutoTokenizer
+from typing import List, Union
 import torch
 from logits_processor_zoo.utils import text_to_token, get_new_line_tokens, enforce_tokens
 
@@ -41,17 +41,20 @@ class MultipleChoiceLogitsProcessor:
         boost_first_words (float): Nonzero values add choices' first tokens' logits to boost performance.
                                    Especially useful for the models which have difficulty associating the choice with its text.
     """
-    def __init__(self, tokenizer: PreTrainedTokenizer, choices: List[str] = None,
+    def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], choices: List[str] = None,
                  delimiter: str = ".", boost_first_words: float = 0.0):
         self.tokenizer = tokenizer
+        if isinstance(self.tokenizer, str):
+            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
+
         self.choices = choices
         self.delimiter = delimiter
         if choices is None:
             choices = ["1", "2", "3", "4"]
 
-        self.new_line_token = get_new_line_tokens(tokenizer)
-        self.delimiter_token = text_to_token(tokenizer, delimiter, last=False)
-        self.choice_tokens = [text_to_token(tokenizer, choice, last=False) for choice in choices]
+        self.new_line_token = get_new_line_tokens(self.tokenizer)
+        self.delimiter_token = text_to_token(self.tokenizer, delimiter, last=False)
+        self.choice_tokens = [text_to_token(self.tokenizer, choice, last=False) for choice in choices]
         self.boost_first_words = boost_first_words
 
     def clone(self):
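
A construction sketch using the new string form of the tokenizer argument; the choices, delimiter, and boost value are illustrative (the choices match the constructor's documented fallback), and the import path is assumed as above.

# Sketch: MultipleChoiceLogitsProcessor accepting a model-name string.
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor

lp = MultipleChoiceLogitsProcessor(
    "Qwen/Qwen2.5-1.5B-Instruct",  # loaded internally via AutoTokenizer
    choices=["1", "2", "3", "4"],  # same values the constructor falls back to
    delimiter=".",
    boost_first_words=1.0,
)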

logits_processor_zoo/vllm/trigger_phrase.py

Lines changed: 10 additions & 7 deletions
@@ -15,8 +15,8 @@
 # limitations under the License.
 #
 
-from transformers import PreTrainedTokenizer
-from typing import List
+from transformers import PreTrainedTokenizer, AutoTokenizer
+from typing import List, Union
 import torch
 from logits_processor_zoo.utils import text_to_token, enforce_tokens
 
@@ -33,14 +33,17 @@ class TriggerPhraseLogitsProcessor:
         trigger_count (int): How many times the phrase will be triggered.
         trigger_after (bool): Whether the phrase is written after the trigger token or instead of the trigger token.
     """
-    def __init__(self, phrase: str, trigger_token_phrase: str, tokenizer: PreTrainedTokenizer, trigger_count: int = 1,
-                 trigger_after: bool = False):
+    def __init__(self, phrase: str, trigger_token_phrase: str, tokenizer: Union[PreTrainedTokenizer, str],
+                 trigger_count: int = 1, trigger_after: bool = False):
+        self.tokenizer = tokenizer
+        if isinstance(self.tokenizer, str):
+            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
+
         self.phrase = phrase
         self.trigger_token_phrase = trigger_token_phrase
-        self.tokenizer = tokenizer
         self.trigger_count = trigger_count
-        self.trigger_token = text_to_token(tokenizer, trigger_token_phrase, last=False)
-        self.phrase_tokens = tokenizer.encode(phrase, add_special_tokens=False)
+        self.trigger_token = text_to_token(self.tokenizer, trigger_token_phrase, last=False)
+        self.phrase_tokens = self.tokenizer.encode(phrase, add_special_tokens=False)
         self.initial_trigger_count = trigger_count
         self.trigger_after = trigger_after
         self._reset()
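
The same pattern for the trigger-phrase processor; the phrase and trigger_token_phrase values below are illustrative only, and the import path is assumed as above.

# Sketch: TriggerPhraseLogitsProcessor built from a model-name string.
from logits_processor_zoo.vllm import TriggerPhraseLogitsProcessor

lp = TriggerPhraseLogitsProcessor(
    phrase="\nLet me double-check the steps.",
    trigger_token_phrase="\n\n",
    tokenizer="Qwen/Qwen2.5-1.5B-Instruct",  # string form added in this commit
    trigger_count=1,
    trigger_after=True,
)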

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "logits-processor-zoo"
-version = "0.1.8"
+version = "0.1.9"
 description = "A collection of LogitsProcessors to customize and enhance LLM behavior for specific tasks."
 authors = ["Ahmet Erdem", "Ivan Sorokin", "Maximilian Jeblick", "Darragh Hanley", "David Austin"]
 readme = "README.md"
