Commit a3cacc5
[0.7] Beng arm opt f64 - [MOD-9077] (#647)
* cherry pick for arm opt fp64
* small fix
* remove unwanted changes
* add benchmarks for arm
* format
* remove json files
* pr changes
1 parent 7858fc8 commit a3cacc5

14 files changed: +455 -11 lines changed
src/VecSim/spaces/IP/IP_NEON_FP64.h

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

inline void InnerProductStep(double *&pVect1, double *&pVect2, float64x2_t &sum) {
    float64x2_t v1 = vld1q_f64(pVect1);
    float64x2_t v2 = vld1q_f64(pVect2);
    sum = vmlaq_f64(sum, v1, v2);
    pVect1 += 2;
    pVect2 += 2;
}

template <unsigned char residual> // 0..7
double FP64_InnerProductSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;

    float64x2_t sum0 = vdupq_n_f64(0.0);
    float64x2_t sum1 = vdupq_n_f64(0.0);
    float64x2_t sum2 = vdupq_n_f64(0.0);
    float64x2_t sum3 = vdupq_n_f64(0.0);

    const size_t num_of_chunks = dimension / 8;

    for (size_t i = 0; i < num_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, sum0);
        InnerProductStep(pVect1, pVect2, sum1);
        InnerProductStep(pVect1, pVect2, sum2);
        InnerProductStep(pVect1, pVect2, sum3);
    }

    // Handle the remaining complete 2-double blocks within the residual
    constexpr size_t remaining_chunks = residual / 2;
    // Unrolled handling of the 2-double blocks
    if constexpr (remaining_chunks >= 1) {
        InnerProductStep(pVect1, pVect2, sum0);
    }
    if constexpr (remaining_chunks >= 2) {
        InnerProductStep(pVect1, pVect2, sum1);
    }
    if constexpr (remaining_chunks >= 3) {
        InnerProductStep(pVect1, pVect2, sum2);
    }

    // Handle the final residual element (0-1 elements).
    // This entire block is eliminated at compile time if final_residual is 0.
    constexpr size_t final_residual = residual % 2; // Final 0-1 elements
    if constexpr (final_residual == 1) {
        float64x2_t v1 = vdupq_n_f64(0.0);
        float64x2_t v2 = vdupq_n_f64(0.0);
        v1 = vld1q_lane_f64(pVect1, v1, 0);
        v2 = vld1q_lane_f64(pVect2, v2, 0);

        sum3 = vmlaq_f64(sum3, v1, v2);
    }

    float64x2_t sum_combined = vaddq_f64(vaddq_f64(sum0, sum1), vaddq_f64(sum2, sum3));

    // Horizontal sum of the two lanes of the combined NEON register
    float64x1_t summed = vadd_f64(vget_low_f64(sum_combined), vget_high_f64(sum_combined));
    double sum = vget_lane_f64(summed, 0);

    return 1.0 - sum;
}
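
The residual template parameter bakes the tail handling into each instantiation, so one kernel exists per value of dim % 8. As a rough, self-contained sketch (the stub kernel and the switch-based wiring are assumptions for illustration; the repo's Choose_FP64_IP_implementation_NEON may be generated differently, e.g. via a macro), a chooser can map the runtime residual onto the compile-time parameter like this:

#include <cstddef>

using dist_func_f64 = double (*)(const void *, const void *, size_t);

// Stand-in for the templated NEON kernel above so this sketch compiles
// anywhere; the template parameter plays the role of the compile-time residual.
template <unsigned char residual>
double FP64_InnerProductSIMD8_NEON_stub(const void *, const void *, size_t) {
    return 0.0;
}

// Hypothetical chooser: enumerate all eight residuals so a switch on the
// runtime value dim % 8 selects the matching instantiation.
dist_func_f64 Choose_FP64_IP_implementation_NEON(size_t dim) {
    switch (dim % 8) {
    case 1: return FP64_InnerProductSIMD8_NEON_stub<1>;
    case 2: return FP64_InnerProductSIMD8_NEON_stub<2>;
    case 3: return FP64_InnerProductSIMD8_NEON_stub<3>;
    case 4: return FP64_InnerProductSIMD8_NEON_stub<4>;
    case 5: return FP64_InnerProductSIMD8_NEON_stub<5>;
    case 6: return FP64_InnerProductSIMD8_NEON_stub<6>;
    case 7: return FP64_InnerProductSIMD8_NEON_stub<7>;
    default: return FP64_InnerProductSIMD8_NEON_stub<0>;
    }
}

Because each case returns a distinct instantiation, the residual branches inside the kernel are resolved at compile time and the hot loop stays branch-free.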

src/VecSim/spaces/IP/IP_SVE_FP64.h

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"

#include <arm_sve.h>

inline void InnerProductStep(double *&pVect1, double *&pVect2, size_t &offset, svfloat64_t &sum,
                             const size_t chunk) {
    // Load vectors
    svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset);
    svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset);

    // Multiply-accumulate
    sum = svmla_f64_x(svptrue_b64(), sum, v1, v2);

    // Advance the shared offset by one SVE vector length
    offset += chunk;
}

template <bool partial_chunk, unsigned char additional_steps>
double FP64_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;
    const size_t chunk = svcntd();
    size_t offset = 0;

    // Multiple accumulators to increase instruction-level parallelism
    svfloat64_t sum0 = svdup_f64(0.0);
    svfloat64_t sum1 = svdup_f64(0.0);
    svfloat64_t sum2 = svdup_f64(0.0);
    svfloat64_t sum3 = svdup_f64(0.0);

    auto chunk_size = 4 * chunk;
    size_t number_of_chunks = dimension / chunk_size;
    for (size_t i = 0; i < number_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
        InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
        InnerProductStep(pVect1, pVect2, offset, sum2, chunk);
        InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
    }

    if constexpr (additional_steps >= 1) {
        InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
    }
    if constexpr (additional_steps >= 2) {
        InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
    }
    if constexpr (additional_steps >= 3) {
        InnerProductStep(pVect1, pVect2, offset, sum2, chunk);
    }

    if constexpr (partial_chunk) {
        svbool_t pg =
            svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));
        svfloat64_t v1 = svld1_f64(pg, pVect1 + offset);
        svfloat64_t v2 = svld1_f64(pg, pVect2 + offset);
        sum3 = svmla_f64_m(pg, sum3, v1, v2);
    }

    // Combine the partial sums
    sum0 = svadd_f64_x(svptrue_b64(), sum0, sum1);
    sum2 = svadd_f64_x(svptrue_b64(), sum2, sum3);

    // Perform the vector additions in parallel
    svfloat64_t sum_all = svadd_f64_x(svptrue_b64(), sum0, sum2);
    // Single horizontal reduction at the end
    double result = svaddv_f64(svptrue_b64(), sum_all);
    return 1.0 - result;
}
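
The two template parameters describe how a dimension decomposes against the hardware vector length: dimension = 4 * chunk * number_of_chunks + additional_steps * chunk + a possible predicated tail. A small illustrative helper (the plan struct and function name are hypothetical; the lane count is passed in explicitly so the sketch runs without SVE hardware) makes that decomposition concrete:

#include <cstddef>
#include <cstdio>

// Hypothetical plan struct: how FP64_InnerProductSIMD_SVE's loop structure is
// determined by the dimension and the per-register lane count that svcntd()
// would report at runtime (2 lanes for 128-bit SVE, 8 for 512-bit, etc.).
struct SveLoopPlan {
    std::size_t main_iters;         // iterations of the 4x-unrolled main loop
    unsigned char additional_steps; // leftover full chunks (0..3), template arg
    bool partial_chunk;             // predicated tail needed, template arg
};

SveLoopPlan plan_fp64_sve(std::size_t dimension, std::size_t chunk) {
    return {dimension / (4 * chunk),
            static_cast<unsigned char>((dimension / chunk) % 4),
            (dimension % chunk) != 0};
}

int main() {
    // 128-bit SVE: chunk = 2 doubles. dim = 23 -> 2 main iterations (16
    // elements), 3 additional full chunks (6 elements), 1 predicated element.
    SveLoopPlan p = plan_fp64_sve(23, 2);
    std::printf("%zu %u %d\n", p.main_iters, (unsigned)p.additional_steps,
                (int)p.partial_chunk);
}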

src/VecSim/spaces/IP_space.cpp

Lines changed: 27 additions & 3 deletions
@@ -85,13 +85,14 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
     }
 
     dist_func_t<double> ret_dist_func = FP64_InnerProduct;
-    // Optimizations assume at least 8 doubles. If we have less, we use the naive implementation.
+
     if (dim < 8) {
         return ret_dist_func;
     }
+    switch (arch_opt) {
+
 #ifdef CPU_FEATURES_ARCH_X86_64
 
-    switch (arch_opt) {
     case ARCH_OPT_AVX512_F:
 #ifdef OPT_AVX512F
         ret_dist_func = Choose_FP64_IP_implementation_AVX512(dim);
@@ -113,11 +114,34 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
         *alignment = 2 * sizeof(double); // handles 2 doubles
         break;
 #endif
+#endif // __x86_64__ */
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    case ARCH_OPT_SVE2:
+
+#ifdef OPT_SVE2
+        ret_dist_func = Choose_FP64_IP_implementation_SVE2(dim);
+        break;
+
+#endif
+    case ARCH_OPT_SVE:
+
+#ifdef OPT_SVE
+        ret_dist_func = Choose_FP64_IP_implementation_SVE(dim);
+        break;
+#endif
+    case ARCH_OPT_NEON:
+
+#ifdef OPT_NEON
+        ret_dist_func = Choose_FP64_IP_implementation_NEON(dim);
+        break;
+#endif
+
+#endif // CPU_FEATURES_ARCH_AARCH64
     case ARCH_OPT_NONE:
         break;
     } // switch
 
-#endif // __x86_64__ */
     return ret_dist_func;
 }
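
Note the dispatch pattern in this switch: each case body is guarded by a compile-time OPT_* flag, and when a flag is absent the missing break deliberately lets selection fall through to the next-best tier (SVE2 to SVE to NEON to the scalar default). A self-contained miniature of the same pattern (the simplified enum and the string results are illustrative stand-ins, not repo code):

#include <cstdio>

// Simplified stand-ins for the real enum and implementations.
enum Arch_Optimization { ARCH_OPT_NONE, ARCH_OPT_NEON, ARCH_OPT_SVE, ARCH_OPT_SVE2 };

const char *pick_impl(Arch_Optimization arch_opt) {
    const char *impl = "scalar";
    switch (arch_opt) {
    case ARCH_OPT_SVE2:
#ifdef OPT_SVE2
        impl = "SVE2";
        break;
#endif
        // OPT_SVE2 not compiled in: fall through and try SVE
    case ARCH_OPT_SVE:
#ifdef OPT_SVE
        impl = "SVE";
        break;
#endif
        // fall through again: try NEON
    case ARCH_OPT_NEON:
#ifdef OPT_NEON
        impl = "NEON";
        break;
#endif
    case ARCH_OPT_NONE:
        break;
    }
    return impl;
}

int main() {
    // Without any OPT_* flag defined at build time, even a SVE2-capable CPU
    // ends up on the scalar fallback.
    std::printf("%s\n", pick_impl(ARCH_OPT_SVE2));
}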

src/VecSim/spaces/L2/L2_NEON_FP64.h

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

inline void L2SquareStep(double *&pVect1, double *&pVect2, float64x2_t &sum) {
    float64x2_t v1 = vld1q_f64(pVect1);
    float64x2_t v2 = vld1q_f64(pVect2);

    // Calculate the difference between the vectors
    float64x2_t diff = vsubq_f64(v1, v2);

    // Square and accumulate
    sum = vmlaq_f64(sum, diff, diff);

    pVect1 += 2;
    pVect2 += 2;
}

template <unsigned char residual> // 0..7
double FP64_L2SqrSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;

    float64x2_t sum0 = vdupq_n_f64(0.0);
    float64x2_t sum1 = vdupq_n_f64(0.0);
    float64x2_t sum2 = vdupq_n_f64(0.0);
    float64x2_t sum3 = vdupq_n_f64(0.0);

    // Calculate how many full 8-element blocks to process
    const size_t num_of_chunks = dimension / 8;

    for (size_t i = 0; i < num_of_chunks; i++) {
        L2SquareStep(pVect1, pVect2, sum0);
        L2SquareStep(pVect1, pVect2, sum1);
        L2SquareStep(pVect1, pVect2, sum2);
        L2SquareStep(pVect1, pVect2, sum3);
    }

    // Handle the remaining complete 2-double blocks within the residual.
    // remaining_chunks and final_residual are compile-time constants derived
    // from the template parameter.
    constexpr size_t remaining_chunks = residual / 2;
    // Unrolled handling of the 2-double blocks
    if constexpr (remaining_chunks >= 1) {
        L2SquareStep(pVect1, pVect2, sum0);
    }
    if constexpr (remaining_chunks >= 2) {
        L2SquareStep(pVect1, pVect2, sum1);
    }
    if constexpr (remaining_chunks >= 3) {
        L2SquareStep(pVect1, pVect2, sum2);
    }

    // Handle the final residual element, if any
    constexpr size_t final_residual = residual % 2; // Final 0-1 elements
    if constexpr (final_residual > 0) {
        float64x2_t v1 = vdupq_n_f64(0.0);
        float64x2_t v2 = vdupq_n_f64(0.0);
        v1 = vld1q_lane_f64(pVect1, v1, 0);
        v2 = vld1q_lane_f64(pVect2, v2, 0);

        // Calculate the difference and square it
        float64x2_t diff = vsubq_f64(v1, v2);
        sum3 = vmlaq_f64(sum3, diff, diff);
    }

    float64x2_t sum_combined = vaddq_f64(vaddq_f64(sum0, sum1), vaddq_f64(sum2, sum3));

    // Horizontal sum of the two lanes of the combined NEON register
    float64x1_t sum = vadd_f64(vget_low_f64(sum_combined), vget_high_f64(sum_combined));
    return vget_lane_f64(sum, 0);
}
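
As in the inner-product kernel, the dimension splits as dim = 8 * num_of_chunks + 2 * remaining_chunks + final_residual, with everything after the main loop fixed at compile time by the residual parameter. A worked example (illustrative code, not from the repo):

#include <cstddef>
#include <cstdio>

int main() {
    // Worked decomposition for dim = 29.
    std::size_t dim = 29;
    std::size_t num_of_chunks = dim / 8;         // 3 -> 24 elements in the main loop
    unsigned char residual = dim % 8;            // 5 -> compile-time template arg
    std::size_t remaining_chunks = residual / 2; // 2 -> two extra 2-double steps
    std::size_t final_residual = residual % 2;   // 1 -> one lane-masked element
    std::printf("%zu %u %zu %zu\n", num_of_chunks, (unsigned)residual,
                remaining_chunks, final_residual);
}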

src/VecSim/spaces/L2/L2_SVE_FP64.h

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_sve.h>

inline void L2SquareStep(double *&pVect1, double *&pVect2, size_t &offset, svfloat64_t &sum,
                         const size_t chunk) {
    // Load vectors
    svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset);
    svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset);

    // Calculate the difference between the vectors
    svfloat64_t diff = svsub_f64_x(svptrue_b64(), v1, v2);

    // Square the difference and accumulate: sum += diff * diff
    sum = svmla_f64_x(svptrue_b64(), sum, diff, diff);

    // Advance the offset by one SVE vector length
    offset += chunk;
}

template <bool partial_chunk, unsigned char additional_steps>
double FP64_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;
    const size_t chunk = svcntd();
    size_t offset = 0;

    // Multiple accumulators to increase instruction-level parallelism
    svfloat64_t sum0 = svdup_f64(0.0);
    svfloat64_t sum1 = svdup_f64(0.0);
    svfloat64_t sum2 = svdup_f64(0.0);
    svfloat64_t sum3 = svdup_f64(0.0);

    // Process the vectors in chunks, unrolled for better pipelining
    auto chunk_size = 4 * chunk;
    size_t number_of_chunks = dimension / chunk_size;
    for (size_t i = 0; i < number_of_chunks; ++i) {
        // Process 4 chunks with separate accumulators
        L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
        L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
        L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
        L2SquareStep(pVect1, pVect2, offset, sum3, chunk);
    }

    if constexpr (additional_steps >= 1) {
        L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
    }
    if constexpr (additional_steps >= 2) {
        L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
    }
    if constexpr (additional_steps >= 3) {
        L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
    }

    if constexpr (partial_chunk) {
        svbool_t pg =
            svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));

        // Load the tail with predication
        svfloat64_t v1 = svld1_f64(pg, pVect1 + offset);
        svfloat64_t v2 = svld1_f64(pg, pVect2 + offset);

        // Calculate the difference with predication
        svfloat64_t diff = svsub_f64_x(pg, v1, v2);

        // Square the difference and accumulate with merge predication
        sum3 = svmla_f64_m(pg, sum3, diff, diff);
    }

    // Combine the partial sums
    sum0 = svadd_f64_x(svptrue_b64(), sum0, sum1);
    sum2 = svadd_f64_x(svptrue_b64(), sum2, sum3);
    svfloat64_t sum_all = svadd_f64_x(svptrue_b64(), sum0, sum2);

    // Single horizontal reduction at the end
    double result = svaddv_f64(svptrue_b64(), sum_all);
    return result;
}
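
The tail uses svmla_f64_m (merge predication) rather than the _x form: with _m, lanes where the predicate is false keep the accumulator's previous value, so the masked tail can safely reuse sum3 ahead of the horizontal reduction. A scalar model of that semantics (illustrative only, not repo code):

#include <array>
#include <cstddef>
#include <cstdio>

// Scalar model of a merge-predicated multiply-accumulate: lanes with an
// inactive predicate keep the accumulator's old value.
template <std::size_t N>
std::array<double, N> mla_m(std::array<bool, N> pg, std::array<double, N> acc,
                            std::array<double, N> a, std::array<double, N> b) {
    for (std::size_t i = 0; i < N; ++i)
        if (pg[i])
            acc[i] += a[i] * b[i]; // active lane: accumulate
    // inactive lanes fall through unchanged (the "merge" behavior)
    return acc;
}

int main() {
    // Predicate covers only lane 0: lane 1 of the accumulator is preserved.
    auto r = mla_m<2>({{true, false}}, {{1.0, 2.0}}, {{3.0, 3.0}}, {{4.0, 4.0}});
    std::printf("%f %f\n", r[0], r[1]); // prints 13.000000 2.000000
}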
