|
28 | 28 | "name": "stdout", |
29 | 29 | "output_type": "stream", |
30 | 30 | "text": [ |
31 | | - "WARNING 04-30 15:00:30 cuda.py:22] You are using a deprecated `pynvml` package. Please install `nvidia-ml-py` instead. See https://pypi.org/project/pynvml for more information.\n" |
32 | | - ] |
33 | | - }, |
34 | | - { |
35 | | - "name": "stderr", |
36 | | - "output_type": "stream", |
37 | | - "text": [ |
38 | | - "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", |
39 | | - " warnings.warn(\n" |
40 | | - ] |
41 | | - }, |
42 | | - { |
43 | | - "name": "stdout", |
44 | | - "output_type": "stream", |
45 | | - "text": [ |
46 | | - "WARNING 04-30 15:00:33 config.py:1563] Casting torch.bfloat16 to torch.float16.\n", |
47 | | - "INFO 04-30 15:00:33 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, use_v2_block_manager=False, enable_prefix_caching=False)\n" |
48 | | - ] |
49 | | - }, |
50 | | - { |
51 | | - "name": "stderr", |
52 | | - "output_type": "stream", |
53 | | - "text": [ |
54 | | - "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" |
55 | | - ] |
56 | | - }, |
57 | | - { |
58 | | - "name": "stdout", |
59 | | - "output_type": "stream", |
60 | | - "text": [ |
61 | | - "INFO 04-30 15:00:34 model_runner.py:879] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n", |
62 | | - "INFO 04-30 15:00:34 weight_utils.py:236] Using model weights format ['*.safetensors']\n", |
63 | | - "INFO 04-30 15:00:35 weight_utils.py:280] No model.safetensors.index.json found in remote.\n" |
| 31 | + "INFO 05-22 14:12:47 [__init__.py:239] Automatically detected platform cuda.\n", |
| 32 | + "WARNING 05-22 14:12:50 [config.py:2972] Casting torch.bfloat16 to torch.float16.\n", |
| 33 | + "INFO 05-22 14:12:55 [config.py:717] This model supports multiple tasks: {'reward', 'generate', 'classify', 'score', 'embed'}. Defaulting to 'generate'.\n", |
| 34 | + "WARNING 05-22 14:12:55 [cuda.py:93] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used\n", |
| 35 | + "INFO 05-22 14:12:55 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"splitting_ops\":[],\"compile_sizes\":[],\"cudagraph_capture_sizes\":[],\"max_capture_size\":0}, use_cached_outputs=False, \n", |
| 36 | + "INFO 05-22 14:12:56 [cuda.py:292] Using Flash Attention backend.\n", |
| 37 | + "INFO 05-22 14:12:57 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n", |
| 38 | + "INFO 05-22 14:12:57 [model_runner.py:1108] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n", |
| 39 | + "INFO 05-22 14:12:57 [weight_utils.py:265] Using model weights format ['*.safetensors']\n", |
| 40 | + "INFO 05-22 14:12:58 [weight_utils.py:315] No model.safetensors.index.json found in remote.\n" |
64 | 41 | ] |
65 | 42 | }, |
66 | 43 | { |
67 | 44 | "data": { |
68 | 45 | "application/vnd.jupyter.widget-view+json": { |
69 | | - "model_id": "e9c350b056a04694bf4f2eade35244ba", |
| 46 | + "model_id": "d29121d7259a47f5923ef4d1b3fa3138", |
70 | 47 | "version_major": 2, |
71 | 48 | "version_minor": 0 |
72 | 49 | }, |
|
81 | 58 | "name": "stdout", |
82 | 59 | "output_type": "stream", |
83 | 60 | "text": [ |
84 | | - "INFO 04-30 15:00:36 model_runner.py:890] Loading model weights took 2.8875 GB\n", |
85 | | - "INFO 04-30 15:00:38 gpu_executor.py:121] # GPU blocks: 37541, # CPU blocks: 9362\n" |
| 61 | + "INFO 05-22 14:12:58 [loader.py:458] Loading weights took 0.57 seconds\n", |
| 62 | + "INFO 05-22 14:12:58 [model_runner.py:1140] Model loading took 2.8876 GiB and 1.613375 seconds\n", |
| 63 | + "INFO 05-22 14:13:00 [worker.py:287] Memory profiling takes 1.73 seconds\n", |
| 64 | + "INFO 05-22 14:13:00 [worker.py:287] the current vLLM instance can use total_gpu_memory (23.66GiB) x gpu_memory_utilization (0.90) = 21.29GiB\n", |
| 65 | + "INFO 05-22 14:13:00 [worker.py:287] model weights take 2.89GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 16.32GiB.\n", |
| 66 | + "INFO 05-22 14:13:00 [executor_base.py:112] # cuda blocks: 38207, # CPU blocks: 9362\n", |
| 67 | + "INFO 05-22 14:13:00 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 18.66x\n", |
| 68 | + "INFO 05-22 14:13:02 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 3.24 seconds\n" |
86 | 69 | ] |
87 | 70 | } |
88 | 71 | ], |
|
93 | 76 | "\n", |
94 | 77 | "example_prompts =[\n", |
95 | 78 | " \"\"\"\n", |
96 | | - " A user review: very soft, colorful, expensive but deserves its price.\n", |
97 | | - " I would like to wear it in my friend's wedding.\n", |
| 79 | + " A user review: very soft, colorful, expensive but deserves its price, stylish.\n", |
98 | 80 | " \n", |
99 | 81 | " What is the user's opinion about the product's price?\n", |
100 | 82 | " \"\"\",\n", |
|
130 | 112 | "output_type": "stream", |
131 | 113 | "text": [ |
132 | 114 | "Prompt: \n", |
133 | | - " A user review: very soft, colorful, expensive but deserves its price.\n", |
134 | | - " I would like to wear it in my friend's wedding.\n", |
| 115 | + " A user review: very soft, colorful, expensive but deserves its price, stylish.\n", |
135 | 116 | " \n", |
136 | 117 | " What is the user's opinion about the product's price?\n", |
137 | 118 | " \n", |
138 | | - "The user's opinion about the product's price is that it is expensive, but they believe it is worth the price.\n", |
| 119 | + "The user's opinion about the product's price is that it is expensive, but they believe it is worth the price due to its softness, colorfulness, and stylish design.\n", |
139 | 120 | "-----END-----\n", |
140 | 121 | "\n", |
141 | 122 | "Prompt: \n", |
|
175 | 156 | "output_type": "stream", |
176 | 157 | "text": [ |
177 | 158 | "Prompt: \n", |
178 | | - " A user review: very soft, colorful, expensive but deserves its price.\n", |
179 | | - " I would like to wear it in my friend's wedding.\n", |
| 159 | + " A user review: very soft, colorful, expensive but deserves its price, stylish.\n", |
180 | 160 | " \n", |
181 | 161 | " What is the user's opinion about the product's price?\n", |
182 | 162 | " \n", |
183 | | - "The user's opinion about the product's price is that it is expensive, but the user is willing to pay the price to wear it in a friend's wedding.\n", |
| 163 | + "The user's opinion about the product's price is that it is expensive but deserves its price, stylish.\n", |
184 | 164 | "-----END-----\n", |
185 | 165 | "\n", |
186 | 166 | "Prompt: \n", |
|
191 | 171 | " \n", |
192 | 172 | " Can you shortly describe what Pokémon is?\n", |
193 | 173 | " \n", |
194 | | - " Pokémon is a Japanese media franchise consisting of video games, animated series, and films. The franchise takes place in a shared universe in which humans co-exist with Pokémon, a large variety of species endowed with special powers. The franchise's target audience is children aged 5 to 12, but it is known to attract people of all ages.\n", |
| 174 | + "Pokémon is a Japanese media franchise consisting of video games, animated series and films, a trading card game, and other related media. The franchise takes place in a shared universe in which humans co-exist with creatures known as Pokémon, a large variety of species endowed with special powers. The franchise's target audience is children aged 5 to 12, but it is known to attract people of all ages.\n", |
195 | 175 | "-----END-----\n", |
196 | 176 | "\n" |
197 | 177 | ] |
198 | 178 | } |
199 | 179 | ], |
200 | 180 | "source": [ |
201 | 181 | "runner.generate_response(example_prompts,\n", |
202 | | - " [CiteFromPromptLogitsProcessor(runner.tokenizer, boost_factor=5.0, boost_eos=False)])" |
| 182 | + " [CiteFromPromptLogitsProcessor(runner.tokenizer, boost_factor=1.0, boost_eos=False,\n", |
| 183 | + " conditional_boost_factor=3.0)])" |
203 | 184 | ] |
204 | 185 | }, |
205 | 186 | { |
|
221 | 202 | "output_type": "stream", |
222 | 203 | "text": [ |
223 | 204 | "Prompt: \n", |
224 | | - " A user review: very soft, colorful, expensive but deserves its price.\n", |
225 | | - " I would like to wear it in my friend's wedding.\n", |
| 205 | + " A user review: very soft, colorful, expensive but deserves its price, stylish.\n", |
226 | 206 | " \n", |
227 | 207 | " What is the user's opinion about the product's price?\n", |
228 | 208 | " \n", |
229 | | - "The user's opinion about the product's price seems to be mixed. They appreciate that the product is \"very soft\" and \"colorful,\" indicating that these features contribute positively to their satisfaction with the item. However, they also mention that the product is \"expensive,\" which suggests that they feel the price is justified based on the quality they perceive.\n", |
230 | | - "\n", |
231 | | - "The phrase \"deserves its price\" implies that the user believes the cost of the product is appropriate for what they have received. This indicates that they find value in the product and feel that they are getting good value for their money.\n", |
232 | | - "\n", |
233 | | - "In summary, while the user appreciates the product's qualities and finds them worth the price, they also acknowledge that the cost is higher than they might have expected for such features. This suggests that they view the product as a good investment for their needs and preferences.\n", |
| 209 | + "The user's opinion about the product's price is that it is expensive, but they believe it is worth the cost due to its softness, colorfulness, and style.\n", |
234 | 210 | "-----END-----\n", |
235 | 211 | "\n", |
236 | 212 | "Prompt: \n", |
|
241 | 217 | " \n", |
242 | 218 | " Can you shortly describe what Pokémon is?\n", |
243 | 219 | " \n", |
244 | | - "Pokémon is a popular Japanese media franchise that features a world where humans live alongside magical creatures called Pokémon. These Pokémon have unique abilities that allow them to fight alongside humans in various adventures. The franchise includes video games, animated series, films, trading cards, and other forms of media aimed at children aged 5 to 12, though it has also gained popularity among adults.\n", |
| 220 | + "Pokémon is a popular Japanese media franchise that features a world where humans live alongside Pokémon, mythical creatures with unique abilities. It targets children aged 5 to 12 but has broad appeal across all age groups.\n", |
245 | 221 | "-----END-----\n", |
246 | 222 | "\n" |
247 | 223 | ] |
248 | 224 | } |
249 | 225 | ], |
250 | 226 | "source": [ |
251 | 227 | "runner.generate_response(example_prompts,\n", |
252 | | - " [CiteFromPromptLogitsProcessor(runner.tokenizer, boost_factor=-2.0, boost_eos=False)])" |
| 228 | + " [CiteFromPromptLogitsProcessor(runner.tokenizer, boost_factor=-1.0, boost_eos=False,\n", |
| 229 | + " conditional_boost_factor=-1.0)])" |
253 | 230 | ] |
254 | 231 | }, |
255 | 232 | { |
|
0 commit comments