Skip to content

Commit 508d9b8

Browse files
Fix tokenizer accounting for bos token
Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
1 parent 4e32810 commit 508d9b8

File tree

2 files changed: 9 additions & 3 deletions

logits_processor_zoo/utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,14 @@
 def text_to_token(tokenizer: PreTrainedTokenizer, text: str, last: bool):
     tokens = tokenizer.encode(text, add_special_tokens=False)
 
-    if not last and len(tokens) > 2:
-        # Usually the first token indicates the beginning, and the second token is our main token
+    # We allow 2 tokens to account for the BOS or prefix token
+    max_token_count = 1
+    bos_token_added = getattr(tokenizer, 'bos_token', None) and getattr(tokenizer, 'bos_token_id', None) in tokens
+    prefix_token_added = getattr(tokenizer, 'add_prefix_space', None) is not False
+    if bos_token_added or prefix_token_added:
+        max_token_count = 2
+
+    if not last and len(tokens) > max_token_count:
         raise Exception(f"Can't convert {text} to token. It has {len(tokens)} tokens.")
 
     return tokens[-1]

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "logits-processor-zoo"
-version = "0.1.11"
+version = "0.1.12"
 description = "A collection of LogitsProcessors to customize and enhance LLM behavior for specific tasks."
 authors = ["Ahmet Erdem", "Ivan Sorokin", "Maximilian Jeblick", "Darragh Hanley", "David Austin"]
 readme = "README.md"

0 commit comments

Comments (0)