-
Notifications
You must be signed in to change notification settings - Fork 15.4k
Open
Description
Reproducer: https://godbolt.org/z/vdzs9nYWb
The reproducer is short enough to include here. Input:
#include <arm_neon.h>
float32x4_t dot_a_few(bfloat16x8_t a, bfloat16x8_t b) {
float32x4_t result = vdupq_n_f32(0.0f);
result = vbfdotq_laneq_f32(result, a, b, 0);
result = vbfdotq_laneq_f32(result, a, b, 1);
result = vbfdotq_laneq_f32(result, a, b, 2);
result = vbfdotq_laneq_f32(result, a, b, 3);
return result;
}
float32x4_t dot_a_few(bfloat16x4_t a, bfloat16x8_t b) {
float32x4_t result = vdupq_n_f32(0.0f);
result = vbfdotq_lane_f32(result, b, a, 0);
result = vbfdotq_lane_f32(result, b, a, 1);
return result;
}
Output:
dot_a_few(__Bfloat16x8_t, __Bfloat16x8_t):
movi v2.2d, #0000000000000000
dup v3.4s, v1.s[0]
bfdot v2.4s, v0.8h, v3.8h
dup v3.4s, v1.s[1]
bfdot v2.4s, v0.8h, v3.8h
dup v3.4s, v1.s[2]
dup v1.4s, v1.s[3]
bfdot v2.4s, v0.8h, v3.8h
bfdot v2.4s, v0.8h, v1.8h
mov v0.16b, v2.16b
ret
dot_a_few(__Bfloat16x4_t, __Bfloat16x8_t):
movi v2.2d, #0000000000000000
bfdot v2.4s, v1.8h, v0.2h[0]
bfdot v2.4s, v1.8h, v0.2h[1]
mov v0.16b, v2.16b
ret
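Note that `vbfdotq_lane_f32` works as expected and generates an indexed `bfdot`, but `vbfdotq_laneq_f32` does not: it generates an explicit `dup` for each lane followed by a non-indexed `bfdot`. Since the by-element form of `bfdot` accepts lane indices 0-3 of a 128-bit register, the `laneq` variant should also lower to the indexed form (e.g. `bfdot v2.4s, v0.8h, v1.2h[1]`) with no `dup` instructions.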