@@ -184,7 +184,7 @@ Now that the model is saved to the disk, we can see that its size is about 34.1G
We can load the quantized model back without the max_memory limit:
``` python
-tokenizer = AutoTokenizer.from_pretrained(save_folder)
+tokenizer_q = AutoTokenizer.from_pretrained(save_folder)
model_quantized = AutoModelForCausalLM.from_pretrained(
    save_folder,
    device_map="auto",
@@ -282,3 +282,34 @@ By checking the device map, we see the entire model is loaded into GPUs:
</details>

## Testing the model
We can use the Hugging Face pipeline to test model inference.

``` python
import time
from transformers import pipeline

def generate(prompt, model, tokenizer, **kwargs):
    """Create a text-generation pipeline, generate a completion, and report the time used for the generation."""
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256, return_full_text=False)

    # Warm up so that one-time setup cost is not included in the timed run.
    generator("How are you?")
    generator("Oracle is a great company.")

    # Time the actual generation.
    time_started = time.time()
    completion = generator(prompt)[0]['generated_text']
    seconds_used = time.time() - time_started
    print(completion)

    # Average time per generated token in the completion.
    per_token = seconds_used / len(generator.tokenizer(completion)["input_ids"])
    print(f"******\nTime used: {seconds_used:.3f} seconds, {per_token:.3f} s/token")
```

Test the full model:
`generate("What's LLM quantization?", model_full, tokenizer)`
Output:
TBA

Test the quantized model:
`generate("What's LLM quantization?", model_quantized, tokenizer_q)`
Output:
TBA
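
Besides generation speed, it is worth confirming how much memory quantization actually saves once both models are loaded. Below is a minimal sketch, assuming `model_full` and `model_quantized` from the earlier steps are still in memory; the helper `print_footprint` is only for illustration, and `get_memory_footprint()` is the standard `transformers` model method.

``` python
def print_footprint(name, model):
    """Print the approximate in-memory size of a loaded model in GB."""
    gb = model.get_memory_footprint() / 1024 ** 3  # bytes -> GB
    print(f"{name}: {gb:.1f} GB")

# Compare the full-precision model with the quantized one (both assumed loaded above).
print_footprint("Full model", model_full)
print_footprint("Quantized model", model_quantized)
```

The quantized model should report a noticeably smaller footprint than the full-precision one, roughly mirroring the reduction in on-disk size.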