llama31_python.py (forked from ggml-org/llama.cpp)
"""Decode-speed benchmark for Llama 3.1 8B Instruct via the transformers pipeline.

Streams generations with TextIteratorStreamer and reports tokens/s measured
from the first generated token onward, so prefill time is excluded.
"""
import threading
import time

import torch
import transformers
from transformers import TextIteratorStreamer

# Local path to the Llama 3.1 8B Instruct checkpoint.
model_id = "/data1/cl/weights/llama3.1_8b_instruct_st"
DEVICE_MAP = "auto"
DTYPE = torch.bfloat16

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": DTYPE},
    device_map=DEVICE_MAP,
)
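# Alternative loading sketch (assumption, not exercised by this benchmark):
# model_id above is a local checkpoint path; the same pipeline can be built
# from a Hugging Face Hub id such as "meta-llama/Llama-3.1-8B-Instruct"
# (a gated repo: accepting the license and `huggingface-cli login` are
# required first). Omitting tokenizer= lets the pipeline load it from the
# same id.
#
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="meta-llama/Llama-3.1-8B-Instruct",
#     model_kwargs={"torch_dtype": DTYPE},
#     device_map=DEVICE_MAP,
# )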
prompts = [
    "Write a detailed 800-word sci-fi story about AI on Mars.",
    "Create a comprehensive timeline of WWII with 20 events.",
    "Write a Python Snake game using Pygame with comments.",
    "Guide on building a wooden bookshelf from scratch.",
    "Room with 3 killers puzzle. Explain step-by-step.",
    "10 mins to dry 1 shirt, how long for 50? Explain.",
    "Philosophy: Free Will vs Determinism.",
    "Generate a JSON dataset of 10 users.",
    "Summarize the Transformer architecture paper (Attention Is All You Need).",
    "Translate 'The Transformer architecture relies entirely on attention' to Chinese.",
]
def benchmark_decode(prompt, index, is_warmup=False):
    messages = [{"role": "user", "content": prompt}]
    # Generation runs on a worker thread; tokens stream back on this thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        text_inputs=messages,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.6,
    )
    thread = threading.Thread(target=pipeline, kwargs=generation_kwargs)
    torch.cuda.synchronize()
    thread.start()

    first_token_time = None
    generated_text = ""
    for new_text in streamer:
        if first_token_time is None:
            # Timestamp the first streamed chunk; decode speed is measured from here.
            torch.cuda.synchronize()
            first_token_time = time.time()
        generated_text += new_text
    torch.cuda.synchronize()
    end_time = time.time()
    thread.join()

    # Count generated tokens; the first token is excluded so the rate reflects
    # steady-state decoding rather than prefill.
    output_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))
    if output_tokens > 1 and first_token_time is not None:
        duration = end_time - first_token_time
        speed = (output_tokens - 1) / duration
    else:
        speed = 0.0
    if not is_warmup:
        print(f"[{index + 1}] Decode Speed: {speed:.2f} tokens/s")
        return speed
    return 0.0
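# TTFT sketch (assumption, not measured by the original script): the loop in
# benchmark_decode already timestamps the first streamed chunk; recording a
# start time just before thread.start() would additionally yield
# time-to-first-token (prefill latency):
#
#     start_time = time.time()
#     thread.start()
#     ...  # stream as above
#     ttft = first_token_time - start_time  # seconds to first generated chunk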
print("Warmup...")
for i, p in enumerate(prompts[:5]):
benchmark_decode(p, i, is_warmup=True)
print("Warmup done!\n")
print("=" * 50)
print("Test...")
speeds = []
for i, p in enumerate(prompts):
s = benchmark_decode(p, i)
if s > 0: speeds.append(s)
if speeds:
avg_speed = sum(speeds) / len(speeds)
print("=" * 50)
print(f"Decode Speed: {avg_speed:.2f} tokens/s")
print("=" * 50)