Commit 82f51e9

dor-forer, lerman25, and meiravgri authored
[0.6] Add support arm opt fp32 intrinsics [MOD-9011] (#636)
* Add support arm opt fp32 intrinsics [MOD-9011] (#617) * Add arm support * Changed the arm cpu info * Add ip test * Add to tests * Added tests andbm * fix tests * Add github benchmakrs * Check 1 * only arm * change ami * Try ireland * Try different image * try image * back to old image * larger image * Add option to change env * back to default region * Created new image * Try to add the x86 to check * Try different machine * added include * Try without opti on arm * Change to c6g * added matrix region * change to west * try the i8 * Try oregon * Change subnet id * Now subnet * Change subnet * add subnet * Try group id * Change to vpc id * change subnet * Change ami * Try without subnet * add security group again * Change the subnets * Change to ids * Change sg * psubnet * Try different * different * to a file * print * p * leave empty * empty * Try different account * Run 2 arm machines * Move both to us-west-2 * Try workflow * Change name * Changes * Change the secrets * Add supprted arch * Add defaults * Support all * Change the jq * Change machine to t4g * Change the name * Change the machine * fix the stop * only benchamrk * add the secrets * region secret * benchmark region * Change timeout * Added support for arch name in benchamrks * change th json * changed to v9.0 * Change the check * add v9 * Check alt version of armv9 * added check * add arc_arch * changed to CONCAT_WITH_UNDERSCORE_ARCH * change the check * Add full check * fix the instruct * Added the cmake * fix the support * put it back to cmake * back * change the condition * No armpl for now * cland format * remove the opt * Changed to one machine * Added BENCHMARK_ARCH * fix endif * Remove secrets call * pr changes * Changes * change to compile * add sve * add #endif * add armpl * add to cmake * remove armpl * add install * Add ARCH=$(uname -m) * change the path to armpl * suuport check for armv7 * change the armpl * Change or OR * add neon supported for spaces * add sve * add support * align * format * change error * change * Removed the ifdef * Add comments * clang * Change names * format * Try fp32 neon simd * add l2 * add cmake * add SVE * fix sve l2 * PR changes * Change to 1 * fix the l2 * fix format * add desciriopn for chunk == 1 * Change functions * Add include * Change the cast * add resudual * formatting * Move th consexpt * remove template armpl * Back to armpl * back to armpl_neon * include * armnpl * add choose * fix the residual div * raise the residuals values * back to char * Remove prefetch * Revert implemetion chooser * Remove armpl * Revert remove error * Remove comment * Remove empty line * format * Add support macos * add sudo * Add absolute path * find all libs * Change folder * Now set for real * Remove armpl from pull * change the templates * change chunk size to 1 * Back to 4 * Removed the for * Change to 2 sums * Changed * Add get opt func * Change the var name * format * Pr fixes * PR * pr * pr fix * PR * added conversion * small dim for intel only * Test smallDimChooser only for intel (cherry picked from commit b996755) * changes * Revert "changes" This reverts commit a0a00c5. * Revert "Add support arm opt fp32 intrinsics [MOD-9011] (#617)" This reverts commit c97d347. 
* Changes * Format * Change tests * fix tests * format * PR changes * new line * unndeed * remove * changes * format * [0.6] [MOD-9303] Update GoogleTest tag to support CMake 4.0 compatibility (#637) [MOD-9303] Update GoogleTest tag to support CMake 4.0 compatibility (#630) * use googletest 1.16.0 remove pin cmake version (not needed as we meet cmake requirments of cmake 4.0) * disbale isntall cmake in codeql * disable temp flow remove install cmake from codeql (cherry picked from commit b983c5b) * Build the test at runtime * Fix length --------- Co-authored-by: Omer <lerman25@gmail.com> Co-authored-by: meiravgri <109056284+meiravgri@users.noreply.github.com>
1 parent 1476b87 commit 82f51e9

25 files changed: +893 -21 lines
cmake/aarch64InstructionFlags.cmake

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
include(CheckCXXCompilerFlag)

message(STATUS "Building for ARM aarch64")

# Check what compiler flags are supported
CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)

# Only use ARMv9 if both compiler and CPU support it
if(CXX_SVE2)
    message(STATUS "Using ARMv9.0-a with SVE2 (supported by CPU)")
    add_compile_definitions(OPT_SVE2)
endif()
if (CXX_ARMV8A OR CXX_ARMV7_NEON)
    add_compile_definitions(OPT_NEON)
endif()
if (CXX_SVE)
    add_compile_definitions(OPT_SVE)
endif()
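
The OPT_SVE2, OPT_SVE, and OPT_NEON definitions created above are consumed as ordinary preprocessor guards by the dispatch code added later in this commit (see the IP_space.cpp hunk below). A minimal sketch, assuming only these definitions, of how a build could report which ARM paths it enabled:

#include <cstdio>

int main() {
    // Sketch only: report which ARM code paths the compile definitions from
    // aarch64InstructionFlags.cmake enabled in this build.
#ifdef OPT_SVE2
    std::puts("OPT_SVE2 enabled (-march=armv9-a+sve2 supported)");
#endif
#ifdef OPT_SVE
    std::puts("OPT_SVE enabled (-march=armv8-a+sve supported)");
#endif
#ifdef OPT_NEON
    std::puts("OPT_NEON enabled (-march=armv8-a or armv7-a+neon supported)");
#endif
    return 0;
}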

src/VecSim/spaces/CMakeLists.txt

Lines changed: 27 additions & 0 deletions
@@ -93,13 +93,40 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 endif()

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")
+set(ARM_OPTIMIZATIONS "")
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
+    include_directories(src)
+    include(${root}/cmake/aarch64InstructionFlags.cmake)
+
+    # Create different optimization implementations for ARM architecture
+    if (CXX_ARMV8A)
+        message("Building with ARMV8A")
+        set_source_files_properties(functions/NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a")
+        list(APPEND ARM_OPTIMIZATIONS functions/NEON.cpp)
+    endif()
+
+    # SVE support
+    if (CXX_SVE)
+        message("Building with SVE")
+        set_source_files_properties(functions/SVE.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a+sve")
+        list(APPEND ARM_OPTIMIZATIONS functions/SVE.cpp)
+    endif()
+
+    # SVE2 support
+    if (CXX_SVE2)
+        message("Building with ARMV9A and SVE2")
+        set_source_files_properties(functions/SVE2.cpp PROPERTIES COMPILE_FLAGS "-march=armv9-a+sve2")
+        list(APPEND ARM_OPTIMIZATIONS functions/SVE2.cpp)
+    endif()
+endif()

 # Here we are compiling the space selectors with the relevant optimization flag.
 add_library(VectorSimilaritySpaces
     space_aux.cpp
     L2_space.cpp
     IP_space.cpp
     spaces.cpp
+    ${ARM_OPTIMIZATIONS}
 )

 target_link_libraries(VectorSimilaritySpaces cpu_features VectorSimilaritySpaces_no_optimization ${OPTIMIZATIONS})
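
The ARM_OPTIMIZATIONS list adds functions/NEON.cpp, functions/SVE.cpp, and functions/SVE2.cpp, each compiled with its own -march flag; those files are not part of this excerpt, but IP_space.cpp below calls the choosers they export (Choose_FP32_IP_implementation_NEON and friends). A plausible sketch of such a chooser, assuming the header path of the new NEON kernel and using a local dist_func alias in place of the library's dist_func_t<float>:

// Sketch only: map the runtime dimension onto one of the 16 instantiations of
// the residual-templated NEON kernel shown later in this commit.
#include <cstddef>
#include <utility>
#include "VecSim/spaces/IP/IP_NEON_FP32.h" // assumed path of the new NEON IP header

namespace spaces {

using dist_func = float (*)(const void *, const void *, std::size_t);

template <std::size_t... Residuals>
static dist_func pick_ip_neon(std::size_t residual, std::index_sequence<Residuals...>) {
    // One table entry per possible value of dim % 16.
    static constexpr dist_func table[] = {FP32_InnerProductSIMD16_NEON<Residuals>...};
    return table[residual];
}

dist_func Choose_FP32_IP_implementation_NEON(std::size_t dim) {
    return pick_ip_neon(dim % 16, std::make_index_sequence<16>{});
}

} // namespace spaces
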
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
/*
 *Copyright Redis Ltd. 2021 - present
 *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 *the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

static inline void InnerProductStep(float *&pVect1, float *&pVect2, float32x4_t &sum) {
    float32x4_t v1 = vld1q_f32(pVect1);
    float32x4_t v2 = vld1q_f32(pVect2);
    sum = vmlaq_f32(sum, v1, v2);
    pVect1 += 4;
    pVect2 += 4;
}

template <unsigned char residual> // 0..15
float FP32_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *pVect1 = (float *)pVect1v;
    float *pVect2 = (float *)pVect2v;

    float32x4_t sum0 = vdupq_n_f32(0.0f);
    float32x4_t sum1 = vdupq_n_f32(0.0f);
    float32x4_t sum2 = vdupq_n_f32(0.0f);
    float32x4_t sum3 = vdupq_n_f32(0.0f);

    const size_t num_of_chunks = dimension / 16;

    for (size_t i = 0; i < num_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, sum0);
        InnerProductStep(pVect1, pVect2, sum1);
        InnerProductStep(pVect1, pVect2, sum2);
        InnerProductStep(pVect1, pVect2, sum3);
    }

    // Handle remaining complete 4-float blocks within residual
    constexpr size_t remaining_chunks = residual / 4;

    // Unrolled loop for the 4-float blocks
    if constexpr (remaining_chunks >= 1) {
        InnerProductStep(pVect1, pVect2, sum0);
    }
    if constexpr (remaining_chunks >= 2) {
        InnerProductStep(pVect1, pVect2, sum1);
    }
    if constexpr (remaining_chunks >= 3) {
        InnerProductStep(pVect1, pVect2, sum2);
    }

    // Handle final residual elements (0-3 elements)
    constexpr size_t final_residual = residual % 4;
    if constexpr (final_residual > 0) {
        float32x4_t v1 = vdupq_n_f32(0.0f);
        float32x4_t v2 = vdupq_n_f32(0.0f);

        if constexpr (final_residual >= 1) {
            v1 = vld1q_lane_f32(pVect1, v1, 0);
            v2 = vld1q_lane_f32(pVect2, v2, 0);
        }
        if constexpr (final_residual >= 2) {
            v1 = vld1q_lane_f32(pVect1 + 1, v1, 1);
            v2 = vld1q_lane_f32(pVect2 + 1, v2, 1);
        }
        if constexpr (final_residual >= 3) {
            v1 = vld1q_lane_f32(pVect1 + 2, v1, 2);
            v2 = vld1q_lane_f32(pVect2 + 2, v2, 2);
        }

        sum3 = vmlaq_f32(sum3, v1, v2);
    }

    // Combine all four sum accumulators
    float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3));

    // Horizontal sum of the 4 elements in the combined NEON register
    float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined));
    float32x2_t summed = vpadd_f32(sum_halves, sum_halves);
    float sum = vget_lane_f32(summed, 0);

    return 1.0f - sum;
}
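
A short worked example of how the residual template parameter above is meant to be used (a sketch, not part of the commit; it assumes the kernel is visible via its header): for dimension 23 the main loop covers one 16-float chunk, residual = 23 % 16 = 7 gives one more 4-float block, and the last 3 floats are loaded lane by lane.

#include <cstddef>
#include <cstdio>
#include <vector>
// Assumes the NEON inner-product kernel above is included here.

int main() {
    constexpr std::size_t dim = 23; // 1 full 16-float chunk + residual of 7
    std::vector<float> a(dim), b(dim);
    for (std::size_t i = 0; i < dim; i++) {
        a[i] = 0.01f * i;
        b[i] = 1.0f;
    }

    // Scalar reference; the library convention is to return 1 - <a, b>.
    float ip = 0.0f;
    for (std::size_t i = 0; i < dim; i++)
        ip += a[i] * b[i];
    float expected = 1.0f - ip;

    // The residual must be a compile-time constant, hence dim % 16 on a constexpr dim.
    float got = FP32_InnerProductSIMD16_NEON<dim % 16>(a.data(), b.data(), dim);
    std::printf("expected %.6f, got %.6f\n", expected, got);
    return 0;
}
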

src/VecSim/spaces/IP/IP_SVE_FP32.h

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
/*
 *Copyright Redis Ltd. 2021 - present
 *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 *the Server Side Public License v1 (SSPLv1).
 */

#include "VecSim/spaces/space_includes.h"

#include <arm_sve.h>

static inline void InnerProductStep(float *&pVect1, float *&pVect2, size_t &offset,
                                    svfloat32_t &sum) {
    svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset);
    svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset);

    sum = svmla_f32_x(svptrue_b32(), sum, v1, v2);

    offset += svcntw();
}

template <bool partial_chunk, unsigned char additional_steps>
float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *pVect1 = (float *)pVect1v;
    float *pVect2 = (float *)pVect2v;
    size_t offset = 0;

    uint64_t sve_word_count = svcntw();

    svfloat32_t sum0 = svdup_f32(0.0f);
    svfloat32_t sum1 = svdup_f32(0.0f);
    svfloat32_t sum2 = svdup_f32(0.0f);
    svfloat32_t sum3 = svdup_f32(0.0f);

    auto chunk_size = 4 * sve_word_count;
    const size_t number_of_chunks = dimension / chunk_size;
    for (size_t i = 0; i < number_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, offset, sum0);
        InnerProductStep(pVect1, pVect2, offset, sum1);
        InnerProductStep(pVect1, pVect2, offset, sum2);
        InnerProductStep(pVect1, pVect2, offset, sum3);
    }

    // Process remaining complete SVE vectors that didn't fit into the main loop
    // These are full vector operations (0-3 elements)
    if constexpr (additional_steps > 0) {
        if constexpr (additional_steps >= 1) {
            InnerProductStep(pVect1, pVect2, offset, sum0);
        }
        if constexpr (additional_steps >= 2) {
            InnerProductStep(pVect1, pVect2, offset, sum1);
        }
        if constexpr (additional_steps >= 3) {
            InnerProductStep(pVect1, pVect2, offset, sum3);
        }
    }

    // Process final tail elements that don't form a complete vector
    // This section handles the case when dimension is not evenly divisible by SVE vector length
    if constexpr (partial_chunk) {
        // Create a predicate mask where each lane is active only for the remaining elements
        svbool_t pg =
            svwhilelt_b32(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));

        // Load vectors with predication
        svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
        svfloat32_t v2 = svld1_f32(pg, pVect2 + offset);
        sum3 = svmla_f32_m(pg, sum3, v1, v2);
    }

    sum0 = svadd_f32_x(svptrue_b32(), sum0, sum1);
    sum2 = svadd_f32_x(svptrue_b32(), sum2, sum3);
    // Perform vector addition in parallel
    svfloat32_t sum_all = svadd_f32_x(svptrue_b32(), sum0, sum2);
    // Single horizontal reduction at the end
    float result = svaddv_f32(svptrue_b32(), sum_all);
    return 1.0f - result;
}
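
Unlike the NEON kernel, the tail shape here depends on the hardware vector length, which is only known at run time via svcntw(). A minimal sketch, assuming the kernel above is visible, of how a chooser could map dim onto the <partial_chunk, additional_steps> parameters (the actual Choose_FP32_IP_implementation_SVE is not shown in this excerpt):

// Sketch only: derive the template parameters of FP32_InnerProductSIMD_SVE from
// the runtime SVE vector length. Compile with SVE enabled (e.g. -march=armv8-a+sve).
#include <arm_sve.h>
#include <cstddef>

using dist_func = float (*)(const void *, const void *, std::size_t);

static dist_func choose_fp32_ip_sve(std::size_t dim) {
    const std::size_t vl = svcntw();        // 32-bit lanes per SVE vector
    const std::size_t rem = dim % (4 * vl); // elements left after the unrolled main loop
    const std::size_t steps = rem / vl;     // full vectors in the tail: 0..3
    const bool partial = (rem % vl) != 0;   // leftover lanes handled with a predicate

    if (partial) {
        switch (steps) {
        case 0: return FP32_InnerProductSIMD_SVE<true, 0>;
        case 1: return FP32_InnerProductSIMD_SVE<true, 1>;
        case 2: return FP32_InnerProductSIMD_SVE<true, 2>;
        default: return FP32_InnerProductSIMD_SVE<true, 3>;
        }
    }
    switch (steps) {
    case 0: return FP32_InnerProductSIMD_SVE<false, 0>;
    case 1: return FP32_InnerProductSIMD_SVE<false, 1>;
    case 2: return FP32_InnerProductSIMD_SVE<false, 2>;
    default: return FP32_InnerProductSIMD_SVE<false, 3>;
    }
}
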

src/VecSim/spaces/IP_space.cpp

Lines changed: 29 additions & 3 deletions
@@ -10,18 +10,27 @@
 #include "VecSim/spaces/IP/IP_AVX512.h"
 #include "VecSim/spaces/IP/IP_AVX.h"
 #include "VecSim/spaces/IP/IP_SSE.h"
+#include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/SVE.h"
+#include "VecSim/spaces/functions/SVE2.h"

 namespace spaces {
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_opt) {

     dist_func_t<float> ret_dist_func = FP32_InnerProduct;
-#ifdef CPU_FEATURES_ARCH_X86_64

+#ifdef CPU_FEATURES_ARCH_X86_64
     CalculationGuideline optimization_type = FP32_GetCalculationGuideline(dim);
-
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#endif
     switch (arch_opt) {
+#ifdef CPU_FEATURES_ARCH_X86_64
     case ARCH_OPT_AVX512_DQ:
     case ARCH_OPT_AVX512_F:
+
 #ifdef OPT_AVX512F
     {
         static dist_func_t<float> dist_funcs[] = {

@@ -52,10 +61,27 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_
         ret_dist_func = dist_funcs[optimization_type];
     } break;
 #endif
+#endif // __x86_64__
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    case ARCH_OPT_SVE2:
+#ifdef OPT_SVE2
+        ret_dist_func = Choose_FP32_IP_implementation_SVE2(dim);
+        break;
+#endif
+    case ARCH_OPT_SVE:
+#ifdef OPT_SVE
+        ret_dist_func = Choose_FP32_IP_implementation_SVE(dim);
+        break;
+#endif
+    case ARCH_OPT_NEON:
+#ifdef OPT_NEON
+        ret_dist_func = Choose_FP32_IP_implementation_NEON(dim);
+        break;
+#endif
+#endif // CPU_FEATURES_ARCH_X86_64
     case ARCH_OPT_NONE:
         break;
     } // switch
-#endif // __x86_64__
     return ret_dist_func;
 }
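
With this change the selector itself can be exercised on AArch64. A minimal usage sketch (the include path and the way arch_opt is normally obtained from runtime CPU detection are assumptions; ARCH_OPT_NEON is one of the enumerators handled in the hunk above):

#include <cstddef>
#include <cstdio>
#include <vector>
#include "VecSim/spaces/IP_space.h" // assumed header declaring IP_FP32_GetDistFunc

int main() {
    using namespace spaces;
    const std::size_t dim = 128;
    std::vector<float> a(dim, 0.5f), b(dim, 0.25f);

    // In the library, arch_opt comes from runtime CPU-feature detection; here we
    // request the NEON path directly for illustration.
    auto ip_dist = IP_FP32_GetDistFunc(dim, ARCH_OPT_NEON);
    float d = ip_dist(a.data(), b.data(), dim);
    std::printf("IP distance: %f\n", d); // 1 - 128 * 0.5 * 0.25 = -15
    return 0;
}
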

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
#include <arm_neon.h>
9+
10+
static inline void L2SquareStep(float *&pVect1, float *&pVect2, float32x4_t &sum) {
11+
float32x4_t v1 = vld1q_f32(pVect1);
12+
float32x4_t v2 = vld1q_f32(pVect2);
13+
14+
float32x4_t diff = vsubq_f32(v1, v2);
15+
16+
sum = vmlaq_f32(sum, diff, diff);
17+
18+
pVect1 += 4;
19+
pVect2 += 4;
20+
}
21+
22+
template <unsigned char residual> // 0..15
23+
float FP32_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
24+
float *pVect1 = (float *)pVect1v;
25+
float *pVect2 = (float *)pVect2v;
26+
27+
float32x4_t sum0 = vdupq_n_f32(0.0f);
28+
float32x4_t sum1 = vdupq_n_f32(0.0f);
29+
float32x4_t sum2 = vdupq_n_f32(0.0f);
30+
float32x4_t sum3 = vdupq_n_f32(0.0f);
31+
32+
const size_t num_of_chunks = dimension / 16;
33+
34+
for (size_t i = 0; i < num_of_chunks; i++) {
35+
L2SquareStep(pVect1, pVect2, sum0);
36+
L2SquareStep(pVect1, pVect2, sum1);
37+
L2SquareStep(pVect1, pVect2, sum2);
38+
L2SquareStep(pVect1, pVect2, sum3);
39+
}
40+
41+
// Handle remaining complete 4-float blocks within residual
42+
constexpr size_t remaining_chunks = residual / 4;
43+
// Unrolled loop for the 4-float blocks
44+
if constexpr (remaining_chunks >= 1) {
45+
L2SquareStep(pVect1, pVect2, sum0);
46+
}
47+
if constexpr (remaining_chunks >= 2) {
48+
L2SquareStep(pVect1, pVect2, sum1);
49+
}
50+
if constexpr (remaining_chunks >= 3) {
51+
L2SquareStep(pVect1, pVect2, sum2);
52+
}
53+
54+
// Handle final residual elements (0-3 elements)
55+
constexpr size_t final_residual = residual % 4;
56+
if constexpr (final_residual > 0) {
57+
float32x4_t v1 = vdupq_n_f32(0.0f);
58+
float32x4_t v2 = vdupq_n_f32(0.0f);
59+
60+
if constexpr (final_residual >= 1) {
61+
v1 = vld1q_lane_f32(pVect1, v1, 0);
62+
v2 = vld1q_lane_f32(pVect2, v2, 0);
63+
}
64+
if constexpr (final_residual >= 2) {
65+
v1 = vld1q_lane_f32(pVect1 + 1, v1, 1);
66+
v2 = vld1q_lane_f32(pVect2 + 1, v2, 1);
67+
}
68+
if constexpr (final_residual >= 3) {
69+
v1 = vld1q_lane_f32(pVect1 + 2, v1, 2);
70+
v2 = vld1q_lane_f32(pVect2 + 2, v2, 2);
71+
}
72+
73+
float32x4_t diff = vsubq_f32(v1, v2);
74+
sum3 = vmlaq_f32(sum3, diff, diff);
75+
}
76+
77+
// Combine all four sum accumulators
78+
float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3));
79+
80+
// Horizontal sum of the 4 elements in the combined NEON register
81+
float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined));
82+
float32x2_t summed = vpadd_f32(sum_halves, sum_halves);
83+
float sum = vget_lane_f32(summed, 0);
84+
85+
return sum;
86+
}
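
As with the inner-product kernel, the L2 version can be sanity-checked against a scalar loop; a minimal sketch, assuming the kernel above is included:

#include <cstddef>
#include <cstdio>
#include <vector>
// Assumes the NEON L2 kernel above is included here.

int main() {
    constexpr std::size_t dim = 37; // 2 full 16-float chunks, residual of 5
    std::vector<float> a(dim), b(dim);
    for (std::size_t i = 0; i < dim; i++) {
        a[i] = (float)i;
        b[i] = (float)i + 1.0f; // every component differs by exactly 1
    }

    // Scalar L2-squared reference: 37 components, each contributing 1.0.
    float ref = 0.0f;
    for (std::size_t i = 0; i < dim; i++) {
        float diff = a[i] - b[i];
        ref += diff * diff;
    }

    float simd = FP32_L2SqrSIMD16_NEON<dim % 16>(a.data(), b.data(), dim);
    std::printf("ref %.6f, simd %.6f\n", ref, simd); // both should print 37.0
    return 0;
}
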
