Skip to content

[AArch64] vbfdotq_laneq_f32 not generating indexed bfdot #170883

@dsharlet

Description

@dsharlet

Reproducer: https://godbolt.org/z/vdzs9nYWb

It's short enough to reproduce here. Input:

#include <arm_neon.h>

// Failing case of the report: the 128-bit ("laneq") indexed form.
// Starts from a zero accumulator and chains four vbfdotq_laneq_f32 calls on
// the same `a` and `b`, using the constant lane indices 0, 1, 2 and 3 of `b`.
// According to the Output section of this report, each call is compiled to a
// `dup` of the selected 32-bit element of `b` followed by a non-indexed
// `bfdot`, instead of a single indexed `bfdot`.
float32x4_t dot_a_few(bfloat16x8_t a, bfloat16x8_t b) {
    float32x4_t result = vdupq_n_f32(0.0f);  // zero accumulator
    // Each call takes the previous result as its accumulator argument.
    result = vbfdotq_laneq_f32(result, a, b, 0);
    result = vbfdotq_laneq_f32(result, a, b, 1);
    result = vbfdotq_laneq_f32(result, a, b, 2);
    result = vbfdotq_laneq_f32(result, a, b, 3);
    return result;
}

// Working (control) case of the report: the 64-bit ("lane") indexed form.
// Here the indexed operand is the 4-element vector `a`, and the full
// 8-element vector `b` is the non-indexed operand -- note the (b, a) argument
// order in the calls. Only lane indices 0 and 1 are used.
// According to the Output section of this report, these calls do compile to
// indexed `bfdot` instructions (`v0.2h[0]` and `v0.2h[1]`).
float32x4_t dot_a_few(bfloat16x4_t a, bfloat16x8_t b) {
    float32x4_t result = vdupq_n_f32(0.0f);  // zero accumulator
    result = vbfdotq_lane_f32(result, b, a, 0);
    result = vbfdotq_lane_f32(result, b, a, 1);
    return result;
}

Output:

// Compiler output for the vbfdotq_laneq_f32 overload (the problematic case).
// Register roles: v0 = a, v1 = b (first two vector arguments),
// v2 = accumulator, v3 = scratch holding the broadcast element of b.
// Every lane costs a `dup` plus a non-indexed `bfdot`.
dot_a_few(__Bfloat16x8_t, __Bfloat16x8_t):
        movi    v2.2d, #0000000000000000
        // ^ accumulator = 0
        dup     v3.4s, v1.s[0]                  // broadcast 32-bit element 0 of b
        bfdot   v2.4s, v0.8h, v3.8h             // accumulate using lane 0
        dup     v3.4s, v1.s[1]                  // broadcast 32-bit element 1 of b
        bfdot   v2.4s, v0.8h, v3.8h             // accumulate using lane 1
        dup     v3.4s, v1.s[2]                  // broadcast 32-bit element 2 of b
        dup     v1.4s, v1.s[3]                  // broadcast element 3 in place; last read of b
        bfdot   v2.4s, v0.8h, v3.8h             // accumulate using lane 2
        bfdot   v2.4s, v0.8h, v1.8h             // accumulate using lane 3
        mov     v0.16b, v2.16b                  // move result into return register v0
        ret

// Compiler output for the vbfdotq_lane_f32 overload (the working case).
// Register roles: v0 = a (indexed operand), v1 = b, v2 = accumulator.
// Each lane is a single indexed `bfdot`; no `dup` is emitted.
dot_a_few(__Bfloat16x4_t, __Bfloat16x8_t):
        movi    v2.2d, #0000000000000000
        // ^ accumulator = 0
        bfdot   v2.4s, v1.8h, v0.2h[0]          // indexed form, lane 0 of a
        bfdot   v2.4s, v1.8h, v0.2h[1]          // indexed form, lane 1 of a
        mov     v0.16b, v2.16b                  // move result into return register v0
        ret

Note that vbfdotq_lane_f32 works and generates an indexed bfdot, but vbfdotq_laneq_f32 does not: it generates an explicit dup instruction for each lane, followed by a non-indexed bfdot.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions