Skip to content

Commit e937903

Browse files
committed
Add multithreaded tokenizer test
1 parent 007fc76 commit e937903

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

bindings/python/tests/bindings/test_tokenizer.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,44 @@ def test_multiprocessing_with_parallelism(self):
557557
multiprocessing_with_parallelism(tokenizer, False)
558558
multiprocessing_with_parallelism(tokenizer, True)
559559

560+
def test_multithreaded_concurrency(self):
    """Run batch encoding from several threads at once and check every task completes.

    Each worker builds its own ``Tokenizer(BPE())`` and encodes one batch with
    either ``encode_batch`` or ``encode_batch_fast``. Thirty tasks are submitted
    to a pool of four threads; every task must return without raising and every
    result must have length 20, the size of each input batch.
    """

    # Thread worker functions
    def encode_batch(batch):
        tokenizer = Tokenizer(BPE())
        return tokenizer.encode_batch(batch)

    def encode_batch_fast(batch):
        tokenizer = Tokenizer(BPE())
        return tokenizer.encode_batch_fast(batch)

    # Create some significant workload
    batches = [
        ["my name is john " * 50] * 20,
        ["my name is paul " * 50] * 20,
        ["my name is ringo " * 50] * 20,
    ]

    # Many encoding operations to run concurrently
    tasks = [
        (encode_batch, batches[0]),
        (encode_batch_fast, batches[1]),
        (encode_batch, batches[2]),
    ] * 10

    # The context manager shuts the pool down (joining its worker threads)
    # even when a future raises or an assertion fails, so no threads leak
    # into the tests that run afterwards.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(worker, batch) for worker, batch in tasks]

        # All tasks should complete successfully; result() re-raises any
        # exception that occurred inside a worker.
        results = [future.result() for future in futures]

    # Verify results
    assert len(results) == 30
    assert all(len(result) == 20 for result in results)
597+
560598
def test_from_pretrained(self):
561599
tokenizer = Tokenizer.from_pretrained("bert-base-cased")
562600
output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)

0 commit comments

Comments
 (0)