|
10 | 10 | "name": "stdout", |
11 | 11 | "output_type": "stream", |
12 | 12 | "text": [ |
13 | | - "/home/aerdem/projects/logits-processor-zoo\n" |
| 13 | + "/home/aerdem/projects/nvidia/logits-processor-zoo\n" |
14 | 14 | ] |
15 | 15 | } |
16 | 16 | ], |
|
25 | 25 | "metadata": {}, |
26 | 26 | "outputs": [ |
27 | 27 | { |
28 | | - "name": "stderr", |
| 28 | + "name": "stdout", |
29 | 29 | "output_type": "stream", |
30 | 30 | "text": [ |
31 | | - "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", |
32 | | - " warnings.warn(\n" |
| 31 | + "WARNING 12-19 10:37:26 config.py:1563] Casting torch.bfloat16 to torch.float16.\n", |
| 32 | + "INFO 12-19 10:37:26 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='google/gemma-1.1-2b-it', speculative_config=None, tokenizer='google/gemma-1.1-2b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=google/gemma-1.1-2b-it, use_v2_block_manager=False, enable_prefix_caching=False)\n", |
| 33 | + "INFO 12-19 10:37:27 model_runner.py:879] Starting to load model google/gemma-1.1-2b-it...\n", |
| 34 | + "INFO 12-19 10:37:28 weight_utils.py:236] Using model weights format ['*.safetensors']\n" |
33 | 35 | ] |
34 | 36 | }, |
| 37 | + { |
| 38 | + "data": { |
| 39 | + "application/vnd.jupyter.widget-view+json": { |
| 40 | + "model_id": "243efc7aaada47fd82cc1043c275f03d", |
| 41 | + "version_major": 2, |
| 42 | + "version_minor": 0 |
| 43 | + }, |
| 44 | + "text/plain": [ |
| 45 | + "Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]\n" |
| 46 | + ] |
| 47 | + }, |
| 48 | + "metadata": {}, |
| 49 | + "output_type": "display_data" |
| 50 | + }, |
35 | 51 | { |
36 | 52 | "name": "stdout", |
37 | 53 | "output_type": "stream", |
38 | 54 | "text": [ |
39 | | - "WARNING 07-23 11:04:22 config.py:1222] Casting torch.bfloat16 to torch.float16.\n", |
40 | | - "INFO 07-23 11:04:22 llm_engine.py:161] Initializing an LLM engine (v0.5.0.post1) with config: model='google/gemma-1.1-2b-it', speculative_config=None, tokenizer='google/gemma-1.1-2b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=google/gemma-1.1-2b-it)\n", |
41 | | - "INFO 07-23 11:04:25 weight_utils.py:218] Using model weights format ['*.safetensors']\n", |
42 | | - "INFO 07-23 11:04:27 model_runner.py:160] Loading model weights took 4.6720 GB\n", |
43 | | - "INFO 07-23 11:04:28 gpu_executor.py:83] # GPU blocks: 52902, # CPU blocks: 14563\n" |
| 55 | + "INFO 12-19 10:37:30 model_runner.py:890] Loading model weights took 4.6720 GB\n", |
| 56 | + "INFO 12-19 10:37:32 gpu_executor.py:121] # GPU blocks: 49691, # CPU blocks: 14563\n" |
44 | 57 | ] |
45 | 58 | } |
46 | 59 | ], |
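
Note: the updated logs correspond to a vLLM engine spin-up. The input cell itself is not part of this diff, but a cell producing output like this would typically look roughly as follows (a sketch assuming vLLM's public LLM API; the exact arguments used in the notebook are not shown here):

    from vllm import LLM

    # Mirrors the config echoed in the INFO lines above: gemma-1.1-2b-it,
    # bfloat16 weights cast down to float16, eager mode, 8192-token context.
    llm = LLM(
        model="google/gemma-1.1-2b-it",
        trust_remote_code=True,
        dtype="half",        # logged as "Casting torch.bfloat16 to torch.float16"
        enforce_eager=True,
        max_model_len=8192,
        seed=0,
    )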
|