Received error running ConvNextV2ForImageClassification(
(convnextv2): ConvNextV2Model(
(embeddings): ConvNextV2Embeddings(
(patch_embeddings): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
(layernorm): ConvNextV2LayerNorm()
)
(encoder): ConvNextV2Encoder(
(stages): ModuleList(
(0): ConvNextV2Stage(
(downsampling_layer): Identity()
(layers): Sequential(
(0): ConvNextV2Layer(
(dwconv): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=96, out_features=384, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=384, out_features=96, bias=True)
(drop_path): Identity()
)
(1): ConvNextV2Layer(
(dwconv): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=96, out_features=384, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=384, out_features=96, bias=True)
(drop_path): Identity()
)
(2): ConvNextV2Layer(
(dwconv): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=96, out_features=384, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=384, out_features=96, bias=True)
(drop_path): Identity()
)
)
)
(1): ConvNextV2Stage(
(downsampling_layer): Sequential(
(0): ConvNextV2LayerNorm()
(1): Conv2d(96, 192, kernel_size=(2, 2), stride=(2, 2))
)
(layers): Sequential(
(0): ConvNextV2Layer(
(dwconv): Conv2d(192, 192, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=192)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=192, out_features=768, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=768, out_features=192, bias=True)
(drop_path): Identity()
)
(1): ConvNextV2Layer(
(dwconv): Conv2d(192, 192, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=192)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=192, out_features=768, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=768, out_features=192, bias=True)
(drop_path): Identity()
)
(2): ConvNextV2Layer(
(dwconv): Conv2d(192, 192, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=192)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=192, out_features=768, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=768, out_features=192, bias=True)
(drop_path): Identity()
)
)
)
(2): ConvNextV2Stage(
(downsampling_layer): Sequential(
(0): ConvNextV2LayerNorm()
(1): Conv2d(192, 384, kernel_size=(2, 2), stride=(2, 2))
)
(layers): Sequential(
(0): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(1): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(2): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(3): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(4): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(5): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(6): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(7): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
(8): ConvNextV2Layer(
(dwconv): Conv2d(384, 384, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=384)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=1536, out_features=384, bias=True)
(drop_path): Identity()
)
)
)
(3): ConvNextV2Stage(
(downsampling_layer): Sequential(
(0): ConvNextV2LayerNorm()
(1): Conv2d(384, 768, kernel_size=(2, 2), stride=(2, 2))
)
(layers): Sequential(
(0): ConvNextV2Layer(
(dwconv): Conv2d(768, 768, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=768)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=3072, out_features=768, bias=True)
(drop_path): Identity()
)
(1): ConvNextV2Layer(
(dwconv): Conv2d(768, 768, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=768)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=3072, out_features=768, bias=True)
(drop_path): Identity()
)
(2): ConvNextV2Layer(
(dwconv): Conv2d(768, 768, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=768)
(layernorm): ConvNextV2LayerNorm()
(pwconv1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELUActivation()
(grn): ConvNextV2GRN()
(pwconv2): Linear(in_features=3072, out_features=768, bias=True)
(drop_path): Identity()
)
)
)
)
)
(layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(classifier): Linear(in_features=768, out_features=1000, bias=True)
): AssertionError: Index names length mismatch: 2 != 3
Describe the bug
Hello,
I have written a simple program to run some vision models from HuggingFace in this gist: https://gist.github.com/oluwatimilehin/db204bcab6eb7531173453a2e4ed7e8e
When I run the script for convnextv2 with a batch size greater than 1, for example:
python vision_models.py -m convnextv2 -b 8
I get the following error message:
AssertionError: Index names length mismatch: 2 != 3
Could you please look into this? Thanks!