Commit 3e58052

[0.6] Beng arm opt f64 - [MOD-9077] (#648)

* Beng arm opt f64 - [MOD-9077] (#618) * Add arm support * Changed the arm cpu info * Add ip test * Add to tests * Added tests and bm * fix tests * Add github benchmarks * Check 1 * only arm * change ami * Try Ireland * Try different image * try image * back to old image * larger image * Add option to change env * back to default region * Created new image * Try to add the x86 to check * Try different machine * added include * Try without opt on arm * Change to c6g * added matrix region * change to west * try the i8 * Try Oregon * Change subnet id * Now subnet * Change subnet * add subnet * Try group id * Change to vpc id * change subnet * Change ami * Try without subnet * add security group again * Change the subnets * Change to ids * Change sg * psubnet * Try different * different * to a file * print * p * leave empty * empty * Try different account * Run 2 arm machines * Move both to us-west-2 * Try workflow * Change name * Changes * Change the secrets * Add supported arch * Add defaults * Support all * Change the jq * Change machine to t4g * Change the name * Change the machine * fix the stop * only benchmark * add the secrets * region secret * benchmark region * Change timeout * Added support for arch name in benchmarks * change the json * changed to v9.0 * Change the check * add v9 * Check alt version of armv9 * added check * add arc_arch * changed to CONCAT_WITH_UNDERSCORE_ARCH * change the check * Add full check * fix the instruct * Added the cmake * fix the support * put it back to cmake * back * change the condition * No armpl for now * clang format * remove the opt * Changed to one machine * Added BENCHMARK_ARCH * fix endif * Remove secrets call * pr changes * Changes * change to compile * add sve * add #endif * add armpl * add to cmake * remove armpl * add install * Add ARCH=$(uname -m) * change the path to armpl * support check for armv7 * change the armpl * Change or OR * add neon supported for spaces * add sve * add support * align * format * change error * change * Removed the ifdef * Add comments * clang * Change names * format * Try fp32 neon simd * add l2 * add cmake * add SVE * fix sve l2 * PR changes * Change to 1 * fix the l2 * fix format * F64 ARMPL optimizations for SVE and NEON * add description for chunk == 1 * Change functions * Add include * Change the cast * add residual * formatting * Move the constexpr * remove template armpl * Back to armpl * back to armpl_neon * include * armpl * add choose * fix the residual div * raise the residuals values * back to char * add ip and l2 intrinsic opt * changes from f32 to f64 * changes to match f32 * small fixes * small fixes * Remove prefetch * Revert implementation chooser * Remove armpl * Revert remove error * Remove comment * Remove empty line * try to fix neon f64 * try to get arm opt to work * try to get arm opt to work * try to get arm opt to work * try to get arm opt to work * format * removed files * remove armpl * Add support macos * add sudo * Add absolute path * find all libs * Change folder * Now set for real * try to get arm opt to work * Remove armpl from pull * try to get arm opt to work * try to get arm opt to work * try to get arm opt to work * try to get arm opt to work * try to test fp64 opt * try to test fp64 opt * fix sve2 implementation * fix fp64 opt test * fix fp64 opt test * change the templates * change chunk size to 1 * Back to 4 * change to comply with fp32 implementation * fix sve2 * fix ip template * small fix * Removed the for * Change to 2 sums * Changed * Add get opt func * Change the var name * format * Pr fixes * change implementation to match fp32 * small fix * small fix * small fix * small fix * changes to test * small fix * format changes * PR * pr changes * pr * try to fix ci tests * pr fix * PR * added conversion * small dim for intel only * Test smallDimChooser only for intel * remove sve2 h files * pr changes * pr changes * pr changes * pr changes * pr changes * pr changes * pr changes * format changes * casting for mac os

---------

Co-authored-by: Dor Forer <dor.forer@redis.com>

* add fp64 stuff * fix fp64 impl * format * revert * fix tests * fix tests * remove dir * format

---------

Co-authored-by: Dor Forer <dor.forer@redis.com>
1 parent 82f51e9 commit 3e58052

File tree: 14 files changed, +491 −11 lines changed
src/VecSim/spaces/IP/IP_NEON_FP64.h

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

inline void InnerProductStep(double *&pVect1, double *&pVect2, float64x2_t &sum) {
    float64x2_t v1 = vld1q_f64(pVect1);
    float64x2_t v2 = vld1q_f64(pVect2);
    sum = vmlaq_f64(sum, v1, v2);
    pVect1 += 2;
    pVect2 += 2;
}

template <unsigned char residual> // 0..7
double FP64_InnerProductSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;

    float64x2_t sum0 = vdupq_n_f64(0.0);
    float64x2_t sum1 = vdupq_n_f64(0.0);
    float64x2_t sum2 = vdupq_n_f64(0.0);
    float64x2_t sum3 = vdupq_n_f64(0.0);

    const size_t num_of_chunks = dimension / 8;

    for (size_t i = 0; i < num_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, sum0);
        InnerProductStep(pVect1, pVect2, sum1);
        InnerProductStep(pVect1, pVect2, sum2);
        InnerProductStep(pVect1, pVect2, sum3);
    }

    // Handle the remaining complete 2-double blocks within the residual
    constexpr size_t remaining_chunks = residual / 2;
    // Unrolled loop for the 2-double blocks
    if constexpr (remaining_chunks >= 1) {
        InnerProductStep(pVect1, pVect2, sum0);
    }
    if constexpr (remaining_chunks >= 2) {
        InnerProductStep(pVect1, pVect2, sum1);
    }
    if constexpr (remaining_chunks >= 3) {
        InnerProductStep(pVect1, pVect2, sum2);
    }

    // Handle the final residual element (0 or 1 elements).
    // This entire block is eliminated at compile time if final_residual is 0.
    constexpr size_t final_residual = residual % 2;
    if constexpr (final_residual == 1) {
        float64x2_t v1 = vdupq_n_f64(0.0);
        float64x2_t v2 = vdupq_n_f64(0.0);
        v1 = vld1q_lane_f64(pVect1, v1, 0);
        v2 = vld1q_lane_f64(pVect2, v2, 0);

        sum3 = vmlaq_f64(sum3, v1, v2);
    }

    float64x2_t sum_combined = vaddq_f64(vaddq_f64(sum0, sum1), vaddq_f64(sum2, sum3));

    // Horizontal sum of the two lanes of the combined NEON register
    float64x1_t summed = vadd_f64(vget_low_f64(sum_combined), vget_high_f64(sum_combined));
    double sum = vget_lane_f64(summed, 0);

    return 1.0 - sum;
}
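
For a dimension split as dimension = 8 * num_of_chunks + residual (residual in 0..7), the kernel above should agree with a plain scalar inner product. A minimal cross-check sketch, assuming the header above is included; the harness names are hypothetical and not part of the commit:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference: the kernel returns an IP *distance*, i.e. 1 - dot(v1, v2).
static double FP64_InnerProduct_Scalar(const double *v1, const double *v2, size_t dim) {
    double dot = 0.0;
    for (size_t i = 0; i < dim; i++) {
        dot += v1[i] * v2[i];
    }
    return 1.0 - dot;
}

// dim = 19 exercises all three paths: two full 8-double chunks, one 2-double
// residual step (19 % 8 = 3, 3 / 2 = 1), and the final single-element lane load.
void check_neon_ip_kernel() {
    const size_t dim = 19;
    std::vector<double> a(dim, 0.5), b(dim, 0.25);
    double expected = FP64_InnerProduct_Scalar(a.data(), b.data(), dim);
    double actual = FP64_InnerProductSIMD8_NEON<19 % 8>(a.data(), b.data(), dim);
    assert(std::fabs(expected - actual) < 1e-9);
}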

src/VecSim/spaces/IP/IP_SVE_FP64.h

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"

#include <arm_sve.h>

inline void InnerProductStep(double *&pVect1, double *&pVect2, size_t &offset, svfloat64_t &sum,
                             const size_t chunk) {
    // Load one full vector from each input
    svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset);
    svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset);

    // Multiply-accumulate: sum += v1 * v2
    sum = svmla_f64_x(svptrue_b64(), sum, v1, v2);

    // Advance the shared offset by one vector length
    offset += chunk;
}

template <bool partial_chunk, unsigned char additional_steps>
double FP64_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;
    const size_t chunk = svcntd(); // number of doubles per SVE register
    size_t offset = 0;

    // Multiple accumulators to increase instruction-level parallelism
    svfloat64_t sum0 = svdup_f64(0.0);
    svfloat64_t sum1 = svdup_f64(0.0);
    svfloat64_t sum2 = svdup_f64(0.0);
    svfloat64_t sum3 = svdup_f64(0.0);

    auto chunk_size = 4 * chunk;
    size_t number_of_chunks = dimension / chunk_size;
    for (size_t i = 0; i < number_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
        InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
        InnerProductStep(pVect1, pVect2, offset, sum2, chunk);
        InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
    }

    if constexpr (additional_steps >= 1) {
        InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
    }
    if constexpr (additional_steps >= 2) {
        InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
    }
    if constexpr (additional_steps >= 3) {
        InnerProductStep(pVect1, pVect2, offset, sum2, chunk);
    }

    if constexpr (partial_chunk) {
        // Predicate covering only the valid trailing elements
        svbool_t pg =
            svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));
        svfloat64_t v1 = svld1_f64(pg, pVect1 + offset);
        svfloat64_t v2 = svld1_f64(pg, pVect2 + offset);
        sum3 = svmla_f64_m(pg, sum3, v1, v2);
    }

    // Combine the partial sums pairwise, then add the pair results in parallel
    sum0 = svadd_f64_x(svptrue_b64(), sum0, sum1);
    sum2 = svadd_f64_x(svptrue_b64(), sum2, sum3);
    svfloat64_t sum_all = svadd_f64_x(svptrue_b64(), sum0, sum2);
    // Single horizontal reduction at the end
    double result = svaddv_f64(svptrue_b64(), sum_all);
    return 1.0 - result;
}
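
Because the SVE vector length is only known at run time (svcntd()), the two template parameters cannot come straight from dim at compile time; a chooser has to probe the vector length and map the residual shape onto the eight instantiations. The commit's actual chooser, Choose_FP64_IP_implementation_SVE, lives elsewhere in the spaces code; the following is only a sketch of that mapping, under the assumption it works this way:

#include <arm_sve.h>
#include <cstddef>

using dist_fn = double (*)(const void *, const void *, size_t);

// Hypothetical dispatch sketch: the main loop consumes 4 * chunk doubles per
// iteration, additional_steps covers the leftover full chunks (0..3), and
// partial_chunk flags a trailing fraction of a chunk for the predicated tail.
dist_fn choose_fp64_ip_sve_sketch(size_t dimension) {
    const size_t chunk = svcntd();                                  // doubles per register
    const bool partial = (dimension % chunk) != 0;                  // predicated tail needed?
    const unsigned char steps = (dimension % (4 * chunk)) / chunk;  // 0..3 full chunks left
    switch ((partial ? 4 : 0) + steps) {
    case 0: return FP64_InnerProductSIMD_SVE<false, 0>;
    case 1: return FP64_InnerProductSIMD_SVE<false, 1>;
    case 2: return FP64_InnerProductSIMD_SVE<false, 2>;
    case 3: return FP64_InnerProductSIMD_SVE<false, 3>;
    case 4: return FP64_InnerProductSIMD_SVE<true, 0>;
    case 5: return FP64_InnerProductSIMD_SVE<true, 1>;
    case 6: return FP64_InnerProductSIMD_SVE<true, 2>;
    default: return FP64_InnerProductSIMD_SVE<true, 3>;
    }
}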

src/VecSim/spaces/IP_space.cpp

Lines changed: 22 additions & 1 deletion
@@ -91,8 +91,10 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
 #ifdef CPU_FEATURES_ARCH_X86_64
 
     CalculationGuideline optimization_type = FP64_GetCalculationGuideline(dim);
+#endif
 
     switch (arch_opt) {
+#ifdef CPU_FEATURES_ARCH_X86_64
     case ARCH_OPT_AVX512_DQ:
 #ifdef OPT_AVX512DQ
     {
@@ -140,10 +142,29 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
         ret_dist_func = dist_funcs[optimization_type];
     } break;
 #endif
+#endif // CPU_FEATURES_ARCH_X86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    case ARCH_OPT_SVE2:
+#ifdef OPT_SVE2
+        ret_dist_func = Choose_FP64_IP_implementation_SVE2(dim);
+        break;
+
+#endif
+    case ARCH_OPT_SVE:
+#ifdef OPT_SVE
+        ret_dist_func = Choose_FP64_IP_implementation_SVE(dim);
+        break;
+
+#endif
+    case ARCH_OPT_NEON:
+#ifdef OPT_NEON
+        ret_dist_func = Choose_FP64_IP_implementation_NEON(dim);
+        break;
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     case ARCH_OPT_NONE:
         break;
     } // switch
-#endif // __x86_64__ */
     return ret_dist_func;
 }
 
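
Note the deliberate fall-through in the added cases: if, say, OPT_SVE2 is not compiled in, case ARCH_OPT_SVE2 has no assignment and no break, so execution drops to the SVE and then NEON cases and the best compiled implementation still wins. Choose_FP64_IP_implementation_NEON itself is defined elsewhere in the commit; a plausible sketch, assuming it keys the template residual off dim % 8 (only the name comes from the diff, the body is illustrative):

using dist_fn = double (*)(const void *, const void *, size_t);

// Map the runtime residual dim % 8 onto the eight instantiations of the NEON kernel.
dist_fn Choose_FP64_IP_implementation_NEON_sketch(size_t dim) {
    switch (dim % 8) {
    case 0: return FP64_InnerProductSIMD8_NEON<0>;
    case 1: return FP64_InnerProductSIMD8_NEON<1>;
    case 2: return FP64_InnerProductSIMD8_NEON<2>;
    case 3: return FP64_InnerProductSIMD8_NEON<3>;
    case 4: return FP64_InnerProductSIMD8_NEON<4>;
    case 5: return FP64_InnerProductSIMD8_NEON<5>;
    case 6: return FP64_InnerProductSIMD8_NEON<6>;
    default: return FP64_InnerProductSIMD8_NEON<7>;
    }
}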

src/VecSim/spaces/L2/L2_NEON_FP64.h

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

inline void L2SquareStep(double *&pVect1, double *&pVect2, float64x2_t &sum) {
    float64x2_t v1 = vld1q_f64(pVect1);
    float64x2_t v2 = vld1q_f64(pVect2);

    // Calculate difference between vectors
    float64x2_t diff = vsubq_f64(v1, v2);

    // Square and accumulate
    sum = vmlaq_f64(sum, diff, diff);

    pVect1 += 2;
    pVect2 += 2;
}

template <unsigned char residual> // 0..7
double FP64_L2SqrSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;

    float64x2_t sum0 = vdupq_n_f64(0.0);
    float64x2_t sum1 = vdupq_n_f64(0.0);
    float64x2_t sum2 = vdupq_n_f64(0.0);
    float64x2_t sum3 = vdupq_n_f64(0.0);

    // Calculate how many full 8-element blocks to process
    const size_t num_of_chunks = dimension / 8;

    for (size_t i = 0; i < num_of_chunks; i++) {
        L2SquareStep(pVect1, pVect2, sum0);
        L2SquareStep(pVect1, pVect2, sum1);
        L2SquareStep(pVect1, pVect2, sum2);
        L2SquareStep(pVect1, pVect2, sum3);
    }

    // The residual counts below are compile-time constants derived from the
    // template parameter.
    // Handle the remaining complete 2-double blocks within the residual
    constexpr size_t remaining_chunks = residual / 2;
    // Unrolled loop for the 2-double blocks
    if constexpr (remaining_chunks >= 1) {
        L2SquareStep(pVect1, pVect2, sum0);
    }
    if constexpr (remaining_chunks >= 2) {
        L2SquareStep(pVect1, pVect2, sum1);
    }
    if constexpr (remaining_chunks >= 3) {
        L2SquareStep(pVect1, pVect2, sum2);
    }

    // Handle the final residual element, if any
    constexpr size_t final_residual = residual % 2;
    if constexpr (final_residual > 0) {
        float64x2_t v1 = vdupq_n_f64(0.0);
        float64x2_t v2 = vdupq_n_f64(0.0);
        v1 = vld1q_lane_f64(pVect1, v1, 0);
        v2 = vld1q_lane_f64(pVect2, v2, 0);

        // Calculate difference and square
        float64x2_t diff = vsubq_f64(v1, v2);
        sum3 = vmlaq_f64(sum3, diff, diff);
    }

    float64x2_t sum_combined = vaddq_f64(vaddq_f64(sum0, sum1), vaddq_f64(sum2, sum3));

    // Horizontal sum of the two lanes of the combined NEON register
    float64x1_t sum = vadd_f64(vget_low_f64(sum_combined), vget_high_f64(sum_combined));
    return vget_lane_f64(sum, 0);
}
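
As with the inner-product kernel, four independent accumulators let consecutive multiply-accumulates overlap instead of serializing on a single register dependency. For sanity-checking, the kernel should match a plain scalar squared-L2 loop; a hypothetical reference, not part of the commit:

#include <cstddef>

// Scalar reference: the kernel returns the *squared* L2 distance, no square root.
static double FP64_L2Sqr_Scalar(const double *v1, const double *v2, size_t dim) {
    double sum = 0.0;
    for (size_t i = 0; i < dim; i++) {
        double diff = v1[i] - v2[i];
        sum += diff * diff;
    }
    return sum;
}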

src/VecSim/spaces/L2/L2_SVE_FP64.h

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_sve.h>

inline void L2SquareStep(double *&pVect1, double *&pVect2, size_t &offset, svfloat64_t &sum,
                         const size_t chunk) {
    // Load vectors
    svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset);
    svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset);

    // Calculate difference between vectors
    svfloat64_t diff = svsub_f64_x(svptrue_b64(), v1, v2);

    // Square the difference and accumulate: sum += diff * diff
    sum = svmla_f64_x(svptrue_b64(), sum, diff, diff);

    // Advance the shared offset by one vector length
    offset += chunk;
}

template <bool partial_chunk, unsigned char additional_steps>
double FP64_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *pVect1 = (double *)pVect1v;
    double *pVect2 = (double *)pVect2v;
    const size_t chunk = svcntd(); // number of doubles per SVE register
    size_t offset = 0;

    // Multiple accumulators to increase instruction-level parallelism
    svfloat64_t sum0 = svdup_f64(0.0);
    svfloat64_t sum1 = svdup_f64(0.0);
    svfloat64_t sum2 = svdup_f64(0.0);
    svfloat64_t sum3 = svdup_f64(0.0);

    // Process vectors in chunks, with unrolling for better pipelining
    auto chunk_size = 4 * chunk;
    size_t number_of_chunks = dimension / chunk_size;
    for (size_t i = 0; i < number_of_chunks; ++i) {
        // Process 4 chunks with separate accumulators
        L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
        L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
        L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
        L2SquareStep(pVect1, pVect2, offset, sum3, chunk);
    }

    if constexpr (additional_steps >= 1) {
        L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
    }
    if constexpr (additional_steps >= 2) {
        L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
    }
    if constexpr (additional_steps >= 3) {
        L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
    }

    if constexpr (partial_chunk) {
        svbool_t pg =
            svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));

        // Load vectors with predication
        svfloat64_t v1 = svld1_f64(pg, pVect1 + offset);
        svfloat64_t v2 = svld1_f64(pg, pVect2 + offset);

        // Calculate difference with predication
        svfloat64_t diff = svsub_f64_x(pg, v1, v2);

        // Square the difference and accumulate with predication
        sum3 = svmla_f64_m(pg, sum3, diff, diff);
    }

    // Combine the partial sums
    sum0 = svadd_f64_x(svptrue_b64(), sum0, sum1);
    sum2 = svadd_f64_x(svptrue_b64(), sum2, sum3);
    svfloat64_t sum_all = svadd_f64_x(svptrue_b64(), sum0, sum2);
    // Single horizontal reduction at the end
    double result = svaddv_f64(svptrue_b64(), sum_all);
    return result;
}
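
The predicated tail is what makes this kernel vector-length agnostic. A worked trace under an assumed 256-bit SVE implementation (svcntd() == 4), for dimension = 19:

#include <arm_sve.h>
#include <cstdint>

// Assumed 256-bit SVE (svcntd() == 4), dimension = 19:
//   main loop: 19 / (4 * 4) = 1 iteration -> doubles 0..15 consumed, offset = 16
//   additional_steps = (19 % 16) / 4 = 0  -> no leftover full chunks
//   partial_chunk = (19 % 4 != 0) = true  -> predicated tail
// svwhilelt_b64(16, 19) activates lanes {1,1,1,0}, so the tail loads touch only
// the three valid trailing doubles; inactive lanes contribute nothing.
svbool_t tail_predicate_example() {
    return svwhilelt_b64(static_cast<uint64_t>(16), static_cast<uint64_t>(19));
}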

src/VecSim/spaces/L2_space.cpp

Lines changed: 21 additions & 3 deletions
@@ -93,10 +93,11 @@ dist_func_t<double> L2_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
 
     dist_func_t<double> ret_dist_func = FP64_L2Sqr;
 #ifdef CPU_FEATURES_ARCH_X86_64
-
     CalculationGuideline optimization_type = FP64_GetCalculationGuideline(dim);
+#endif
 
     switch (arch_opt) {
+#ifdef CPU_FEATURES_ARCH_X86_64
     case ARCH_OPT_AVX512_DQ:
 #ifdef OPT_AVX512DQ
     {
@@ -143,11 +144,28 @@ dist_func_t<double> L2_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
         ret_dist_func = dist_funcs[optimization_type];
     } break;
 #endif
+#endif // CPU_FEATURES_ARCH_X86_64
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    case ARCH_OPT_SVE2:
+#ifdef OPT_SVE2
+        ret_dist_func = Choose_FP64_L2_implementation_SVE2(dim);
+        break;
+#endif
+    case ARCH_OPT_SVE:
+#ifdef OPT_SVE
+        ret_dist_func = Choose_FP64_L2_implementation_SVE(dim);
+        break;
+#endif
+    case ARCH_OPT_NEON:
+#ifdef OPT_NEON
+        ret_dist_func = Choose_FP64_L2_implementation_NEON(dim);
+        break;
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     case ARCH_OPT_NONE:
         break;
     } // switch
-
-#endif // __x86_64__ */
     return ret_dist_func;
 }
 