llama31_python.py (forked from ggml-org/llama.cpp)
"""Decode-speed benchmark for Llama 3.1 8B Instruct via the transformers pipeline.

Streams generations with TextIteratorStreamer and reports tokens/s measured
from the first generated token onward, so prefill time is excluded.
"""
import threading
import time

import torch
import transformers
from transformers import TextIteratorStreamer

# Local path to the Llama 3.1 8B Instruct checkpoint.
model_id = "/data1/cl/weights/llama3.1_8b_instruct_st"
DEVICE_MAP = "auto"
DTYPE = torch.bfloat16

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": DTYPE},
    device_map=DEVICE_MAP,
)
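# Alternative loading sketch (assumption, not exercised by this benchmark):
# model_id above is a local checkpoint path; the same pipeline can be built
# from a Hugging Face Hub id such as "meta-llama/Llama-3.1-8B-Instruct"
# (a gated repo: accepting the license and `huggingface-cli login` are
# required first). Omitting tokenizer= lets the pipeline load it from the
# same id.
#
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="meta-llama/Llama-3.1-8B-Instruct",
#     model_kwargs={"torch_dtype": DTYPE},
#     device_map=DEVICE_MAP,
# )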
prompts = [
    "Write a detailed 800-word sci-fi story about AI on Mars.",
    "Create a comprehensive timeline of WWII with 20 events.",
    "Write a Python Snake game using Pygame with comments.",
    "Guide on building a wooden bookshelf from scratch.",
    "Room with 3 killers puzzle. Explain step-by-step.",
    "10 mins to dry 1 shirt, how long for 50? Explain.",
    "Philosophy: Free Will vs Determinism.",
    "Generate a JSON dataset of 10 users.",
    "Summarize the Transformer architecture paper (Attention Is All You Need).",
    "Translate 'The Transformer architecture relies entirely on attention' to Chinese.",
]
def benchmark_decode(prompt, index, is_warmup=False):
    messages = [{"role": "user", "content": prompt}]
    # Generation runs on a worker thread; tokens stream back on this thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        text_inputs=messages,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.6,
    )
    thread = threading.Thread(target=pipeline, kwargs=generation_kwargs)
    torch.cuda.synchronize()
    thread.start()

    first_token_time = None
    generated_text = ""
    for new_text in streamer:
        if first_token_time is None:
            # Timestamp the first streamed chunk; decode speed is measured from here.
            torch.cuda.synchronize()
            first_token_time = time.time()
        generated_text += new_text
    torch.cuda.synchronize()
    end_time = time.time()
    thread.join()

    # Count generated tokens; the first token is excluded so the rate reflects
    # steady-state decoding rather than prefill.
    output_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))
    if output_tokens > 1 and first_token_time is not None:
        duration = end_time - first_token_time
        speed = (output_tokens - 1) / duration
    else:
        speed = 0.0
    if not is_warmup:
        print(f"[{index + 1}] Decode Speed: {speed:.2f} tokens/s")
        return speed
    return 0.0
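# TTFT sketch (assumption, not measured by the original script): the loop in
# benchmark_decode already timestamps the first streamed chunk; recording a
# start time just before thread.start() would additionally yield
# time-to-first-token (prefill latency):
#
#     start_time = time.time()
#     thread.start()
#     ...  # stream as above
#     ttft = first_token_time - start_time  # seconds to first generated chunk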
print("Warmup...")
for i, p in enumerate(prompts[:5]):
benchmark_decode(p, i, is_warmup=True)
print("Warmup done!\n")
print("=" * 50)
print("Test...")
speeds = []
for i, p in enumerate(prompts):
s = benchmark_decode(p, i)
if s > 0: speeds.append(s)
if speeds:
avg_speed = sum(speeds) / len(speeds)
print("=" * 50)
print(f"Decode Speed: {avg_speed:.2f} tokens/s")
print("=" * 50)