Hi,
It seems the package cannot calculate training TFLOPs for a multimodal model like meta-llama/Llama-3.2-11B-Vision-Instruct. Could you please add this functionality?
Here is my code:
# Calculate FLOPs for a multimodal Transformers model (Mllama).
from calflops import calculate_flops
from transformers import AutoTokenizer, MllamaForConditionalGeneration

batch_size, max_seq_length = 2, 4096
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

flops, macs, params = calculate_flops(model=model,
                                      input_shape=(batch_size, max_seq_length),
                                      transformer_tokenizer=tokenizer,
                                      include_backPropagation=True)
print("meta-llama/Llama-3.2-11B-Vision-Instruct FLOPs:%s MACs:%s Params:%s \n" % (flops, macs, params))
Here is the output of the call above that uses input_shape (it seems to have only counted FLOPs for the text part):
Loading checkpoint shards: 100%|██████████| 5/5 [01:07<00:00, 13.52s/it]
/opt/conda/envs/py_3.10/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2716: FutureWarning: The `truncation_strategy` argument is deprecated and will be removed in a future version, use `truncation=True` to truncate examples to a max length. You can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific truncation strategy selected among `truncation='only_first'` (will only truncate the first sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).
warnings.warn(
------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.
Total Training Params: 10.67 B
fwd MACs: 122.96 TMACs
fwd FLOPs: 245.92 TFLOPS
fwd+bwd MACs: 368.87 TMACs
fwd+bwd FLOPs: 737.76 TFLOPS
-------------------------------- Detailed Calculated FLOPs Results --------------------------------
Each module calculated is listed after its name in the following order:
params, percentage of total params, MACs, percentage of total MACs, FLOPS, percentage of total FLOPs
Note: 1. A module can have torch.nn.module or torch.nn.functional to compute logits (e.g. CrossEntropyLoss).
They are not counted as submodules in calflops and not to be printed out. However they make up the difference between a parent's MACs and the sum of its submodules'.
2. Number of floating-point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput.
MllamaForConditionalGeneration(
10.67 B = 100% Params, 122.96 TMACs = 100% MACs, 245.92 TFLOPS = 100% FLOPs
(vision_model): MllamaVisionModel(
863.57 M = 8.09% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(patch_embedding): Conv2d(752.64 K = 0.01% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, 3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
(gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
75.82 M = 0.71% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(tile_embedding): Embedding(73.77 M = 0.69% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, 9, 8197120)
)
(pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
46.08 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(embedding): Embedding(46.08 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, 9, 5120)
)
(post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
46.08 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(embedding): Embedding(46.08 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, 9, 5120)
)
(layernorm_pre): LayerNorm(2.56 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (1280,), eps=1e-05, elementwise_affine=True)
(layernorm_post): LayerNorm(2.56 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (1280,), eps=1e-05, elementwise_affine=True)
(transformer): MllamaVisionEncoder(
629.51 M = 5.9% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(layers): ModuleList(
(0-31): 32 x MllamaVisionEncoderLayer(
19.67 M = 0.18% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(self_attn): MllamaVisionSdpaAttention(
6.55 M = 0.06% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
(k_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
(v_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
(o_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
)
(mlp): MllamaVisionMLP(
13.11 M = 0.12% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(activation_fn): GELUActivation(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
(fc1): Linear(6.56 M = 0.06% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=5120, bias=True)
(fc2): Linear(6.55 M = 0.06% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=5120, out_features=1280, bias=True)
)
(input_layernorm): LayerNorm(2.56 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (1280,), eps=1e-05, elementwise_affine=True)
(post_attention_layernorm): LayerNorm(2.56 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (1280,), eps=1e-05, elementwise_affine=True)
)
)
)
(global_transformer): MllamaVisionEncoder(
157.38 M = 1.47% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(layers): ModuleList(
(0-7): 8 x MllamaVisionEncoderLayer(
19.67 M = 0.18% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(self_attn): MllamaVisionSdpaAttention(
6.55 M = 0.06% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
(k_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
(v_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
(o_proj): Linear(1.64 M = 0.02% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=1280, bias=False)
)
(mlp): MllamaVisionMLP(
13.11 M = 0.12% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(activation_fn): GELUActivation(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
(fc1): Linear(6.56 M = 0.06% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=1280, out_features=5120, bias=True)
(fc2): Linear(6.55 M = 0.06% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=5120, out_features=1280, bias=True)
)
(input_layernorm): LayerNorm(2.56 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (1280,), eps=1e-05, elementwise_affine=True)
(post_attention_layernorm): LayerNorm(2.56 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (1280,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(language_model): MllamaForCausalLM(
9.78 B = 91.61% Params, 122.96 TMACs = 100% MACs, 245.92 TFLOPS = 100% FLOPs
(model): MllamaTextModel(
9.25 B = 86.69% Params, 114.35 TMACs = 93% MACs, 228.71 TFLOPS = 93% FLOPs
(embed_tokens): Embedding(525.37 M = 4.92% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, 128264, 4096, padding_idx=128004)
(layers): ModuleList(
(0-2): 3 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(3): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(4-7): 4 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(8): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(9-12): 4 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(13): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(14-17): 4 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(18): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(19-22): 4 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(23): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(24-27): 4 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(28): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(29-32): 4 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(33): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(34-37): 4 x MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(38): MllamaCrossAttentionDecoderLayer(
218.11 M = 2.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(cross_attn): MllamaTextCrossSdpaAttention(
41.94 M = 0.39% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=4096, bias=False)
(q_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
(k_norm): MllamaTextRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (128,), eps=1e-05)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
(39): MllamaSelfAttentionDecoderLayer(
218.11 M = 2.04% Params, 3.57 TMACs = 2.91% MACs, 7.15 TFLOPS = 2.91% FLOPs
(self_attn): MllamaTextSelfSdpaAttention(
41.94 M = 0.39% Params, 687.19 GMACs = 0.56% MACs, 1.37 TFLOPS = 0.56% FLOPs
(q_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
(k_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(v_proj): Linear(4.19 M = 0.04% Params, 68.72 GMACs = 0.06% MACs, 137.44 GFLOPS = 0.06% FLOPs, in_features=4096, out_features=1024, bias=False)
(o_proj): Linear(16.78 M = 0.16% Params, 274.88 GMACs = 0.22% MACs, 549.76 GFLOPS = 0.22% FLOPs, in_features=4096, out_features=4096, bias=False)
)
(mlp): MllamaTextMLP(
176.16 M = 1.65% Params, 2.89 TMACs = 2.35% MACs, 5.77 TFLOPS = 2.35% FLOPs
(gate_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(up_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=4096, out_features=14336, bias=False)
(down_proj): Linear(58.72 M = 0.55% Params, 962.07 GMACs = 0.78% MACs, 1.92 TFLOPS = 0.78% FLOPs, in_features=14336, out_features=4096, bias=False)
(act_fn): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 234.88 MFLOPS = 0% FLOPs)
)
(input_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(post_attention_layernorm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
)
)
(norm): MllamaTextRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, (4096,), eps=1e-05)
(rotary_emb): MllamaRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs)
)
(lm_head): Linear(525.34 M = 4.92% Params, 8.61 TMACs = 7% MACs, 17.21 TFLOPS = 7% FLOPs, in_features=4096, out_features=128256, bias=False)
)
(multi_modal_projector): Linear(31.46 M = 0.29% Params, 0 MACs = 0% MACs, 0 FLOPS = 0% FLOPs, in_features=7680, out_features=4096, bias=True)
)
---------------------------------------------------------------------------------------------------
meta-llama/Llama-3.2-11B-Vision-Instruct FLOPs:737.76 TFLOPS MACs:368.87 TMACs Params:10.67 B
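As a sanity check on the summary above (using the default assumption calflops prints, that backpropagation costs 2.00 times the forward pass), the reported fwd+bwd total is exactly three times the forward-only number, so it contains no vision-tower contribution at all:

fwd_tflops = 245.92                    # forward-only FLOPs from the summary, in TFLOPs
bwd_factor = 2.0                       # calflops default: backward is ~2x forward
print(fwd_tflops * (1 + bwd_factor))   # 737.76 TFLOPs, matching the fwd+bwd line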