Received error running MobileViTV2ForImageClassification(
(mobilevitv2): MobileViTV2Model(
(conv_stem): MobileViTV2ConvLayer(
(convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(normalization): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(encoder): MobileViTV2Encoder(
(layer): ModuleList(
(0): MobileViTV2MobileNetLayer(
(layer): ModuleList(
(0): MobileViTV2InvertedResidual(
(expand_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_3x3): MobileViTV2ConvLayer(
(convolution): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
(normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(reduce_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
)
(1): MobileViTV2MobileNetLayer(
(layer): ModuleList(
(0): MobileViTV2InvertedResidual(
(expand_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_3x3): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=128, bias=False)
(normalization): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(reduce_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): MobileViTV2InvertedResidual(
(expand_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_3x3): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
(normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(reduce_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
)
(2): MobileViTV2Layer(
(downsampling_layer): MobileViTV2InvertedResidual(
(expand_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_3x3): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256, bias=False)
(normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(reduce_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(conv_kxk): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
(normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
)
(transformer): MobileViTV2Transformer(
(layer): ModuleList(
(0-1): 2 x MobileViTV2TransformerLayer(
(layernorm_before): GroupNorm(1, 128, eps=1e-05, affine=True)
(attention): MobileViTV2LinearSelfAttention(
(qkv_proj): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 257, kernel_size=(1, 1), stride=(1, 1))
)
(attn_dropout): Dropout(p=0.0, inplace=False)
(out_proj): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
)
)
(dropout1): Dropout(p=0.0, inplace=False)
(layernorm_after): GroupNorm(1, 128, eps=1e-05, affine=True)
(ffn): MobileViTV2FFN(
(conv1): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
(activation): SiLU()
)
(dropout1): Dropout(p=0.0, inplace=False)
(conv2): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
)
(dropout2): Dropout(p=0.0, inplace=False)
)
)
)
)
(layernorm): GroupNorm(1, 128, eps=1e-05, affine=True)
(conv_projection): MobileViTV2ConvLayer(
(convolution): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(3): MobileViTV2Layer(
(downsampling_layer): MobileViTV2InvertedResidual(
(expand_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_3x3): MobileViTV2ConvLayer(
(convolution): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=512, bias=False)
(normalization): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(reduce_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(512, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(conv_kxk): MobileViTV2ConvLayer(
(convolution): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(normalization): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
)
(transformer): MobileViTV2Transformer(
(layer): ModuleList(
(0-3): 4 x MobileViTV2TransformerLayer(
(layernorm_before): GroupNorm(1, 192, eps=1e-05, affine=True)
(attention): MobileViTV2LinearSelfAttention(
(qkv_proj): MobileViTV2ConvLayer(
(convolution): Conv2d(192, 385, kernel_size=(1, 1), stride=(1, 1))
)
(attn_dropout): Dropout(p=0.0, inplace=False)
(out_proj): MobileViTV2ConvLayer(
(convolution): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1))
)
)
(dropout1): Dropout(p=0.0, inplace=False)
(layernorm_after): GroupNorm(1, 192, eps=1e-05, affine=True)
(ffn): MobileViTV2FFN(
(conv1): MobileViTV2ConvLayer(
(convolution): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1))
(activation): SiLU()
)
(dropout1): Dropout(p=0.0, inplace=False)
(conv2): MobileViTV2ConvLayer(
(convolution): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1))
)
(dropout2): Dropout(p=0.0, inplace=False)
)
)
)
)
(layernorm): GroupNorm(1, 192, eps=1e-05, affine=True)
(conv_projection): MobileViTV2ConvLayer(
(convolution): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(4): MobileViTV2Layer(
(downsampling_layer): MobileViTV2InvertedResidual(
(expand_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_3x3): MobileViTV2ConvLayer(
(convolution): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
(normalization): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(reduce_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(768, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(conv_kxk): MobileViTV2ConvLayer(
(convolution): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512, bias=False)
(normalization): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activation): SiLU()
)
(conv_1x1): MobileViTV2ConvLayer(
(convolution): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
)
(transformer): MobileViTV2Transformer(
(layer): ModuleList(
(0-2): 3 x MobileViTV2TransformerLayer(
(layernorm_before): GroupNorm(1, 256, eps=1e-05, affine=True)
(attention): MobileViTV2LinearSelfAttention(
(qkv_proj): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 513, kernel_size=(1, 1), stride=(1, 1))
)
(attn_dropout): Dropout(p=0.0, inplace=False)
(out_proj): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
)
)
(dropout1): Dropout(p=0.0, inplace=False)
(layernorm_after): GroupNorm(1, 256, eps=1e-05, affine=True)
(ffn): MobileViTV2FFN(
(conv1): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
(activation): SiLU()
)
(dropout1): Dropout(p=0.0, inplace=False)
(conv2): MobileViTV2ConvLayer(
(convolution): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(dropout2): Dropout(p=0.0, inplace=False)
)
)
)
)
(layernorm): GroupNorm(1, 256, eps=1e-05, affine=True)
(conv_projection): MobileViTV2ConvLayer(
(convolution): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(normalization): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
)
)
(classifier): Linear(in_features=512, out_features=2, bias=True)
): AssertionError:
Describe the bug
Hello,
I have written a simple program to run some vision models from HuggingFace in this gist: https://gist.github.com/oluwatimilehin/db204bcab6eb7531173453a2e4ed7e8e
When I run the script for mobilevit2 with any batch size, for example:
python vision_models.py -m mobilevit2 -b 1
I get an AssertionError with no message or further information (see the output above).
Could you please look into this? Thanks!