Skip to content

Commit 7858fc8

Browse files
dor-forerlerman25
and authored
[0.7] Add support arm opt fp32 intrinsics [MOD-9011] (#633)
* Adapt arm to 0.7 * add space aux * Compile changes * format * changes * Changes * format * By arch * format * format * Change the order * Change order * Change the order * remove * change the order --------- Co-authored-by: Omer <lerman25@gmail.com>
1 parent 13fc524 commit 7858fc8

23 files changed

+740
-19
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
include(CheckCXXCompilerFlag)

message(STATUS "Building for ARM aarch64")

# Probe which -march flags the *compiler* accepts. Note: CHECK_CXX_COMPILER_FLAG
# only verifies compiler support; it says nothing about the CPU the binary will
# run on. Runtime CPU-feature dispatch must gate the actual use of each kernel.
CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)

# Expose one OPT_* definition per instruction-set family the compiler can target.
if(CXX_SVE2)
    message(STATUS "Compiler supports ARMv9.0-a with SVE2")
    add_compile_definitions(OPT_SVE2)
endif()
if(CXX_ARMV8A OR CXX_ARMV7_NEON)
    add_compile_definitions(OPT_NEON)
endif()
if(CXX_SVE)
    add_compile_definitions(OPT_SVE)
endif()

src/VecSim/spaces/CMakeLists.txt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,31 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
4444
endif()
4545
endif()
4646

if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
    include(${root}/cmake/aarch64InstructionFlags.cmake)

    # NEON kernels. aarch64InstructionFlags.cmake defines OPT_NEON when EITHER
    # -march=armv8-a or -march=armv7-a+neon is accepted, so NEON.cpp must be
    # compiled in both cases; previously only CXX_ARMV8A built it, which left
    # OPT_NEON defined with no implementation to link on ARMv7-only toolchains.
    if(CXX_ARMV8A)
        message("Building with ARMV8A")
        set_source_files_properties(functions/NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a")
        list(APPEND OPTIMIZATIONS functions/NEON.cpp)
    elseif(CXX_ARMV7_NEON)
        message("Building with ARMv7-A NEON")
        set_source_files_properties(functions/NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv7-a+neon")
        list(APPEND OPTIMIZATIONS functions/NEON.cpp)
    endif()

    # SVE support
    if(CXX_SVE)
        message("Building with SVE")
        set_source_files_properties(functions/SVE.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a+sve")
        list(APPEND OPTIMIZATIONS functions/SVE.cpp)
    endif()

    # SVE2 support
    if(CXX_SVE2)
        message("Building with ARMV9A and SVE2")
        set_source_files_properties(functions/SVE2.cpp PROPERTIES COMPILE_FLAGS "-march=armv9-a+sve2")
        list(APPEND OPTIMIZATIONS functions/SVE2.cpp)
    endif()
endif()
71+
4772
# Here we are compiling the space selectors with the relevant optimization flag.
4873
add_library(VectorSimilaritySpaces
4974
space_aux.cpp
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
#include <arm_neon.h>
9+
10+
// Multiply-accumulate one 4-float group: sum += pVect1[0..3] * pVect2[0..3]
// (lane-wise), advancing both cursors by 4 floats.
static inline void InnerProductStep(float *&pVect1, float *&pVect2, float32x4_t &sum) {
    const float32x4_t lhs = vld1q_f32(pVect1);
    const float32x4_t rhs = vld1q_f32(pVect2);
    pVect1 += 4;
    pVect2 += 4;
    sum = vmlaq_f32(sum, lhs, rhs);
}
17+
18+
/* Inner-product distance (1 - dot product) for fp32 vectors using NEON.
 * Template parameter `residual` is dimension % 16 (0..15); the caller selects
 * the matching instantiation so the tail handling is fully unrolled. */
template <unsigned char residual> // 0..15
float FP32_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *cur1 = (float *)pVect1v;
    float *cur2 = (float *)pVect2v;

    // Four independent accumulators keep the multiply-accumulate pipeline busy.
    float32x4_t acc0 = vdupq_n_f32(0.0f);
    float32x4_t acc1 = vdupq_n_f32(0.0f);
    float32x4_t acc2 = vdupq_n_f32(0.0f);
    float32x4_t acc3 = vdupq_n_f32(0.0f);

    // Main loop: 16 floats (four NEON vectors) per iteration.
    for (size_t chunk = dimension / 16; chunk != 0; --chunk) {
        InnerProductStep(cur1, cur2, acc0);
        InnerProductStep(cur1, cur2, acc1);
        InnerProductStep(cur1, cur2, acc2);
        InnerProductStep(cur1, cur2, acc3);
    }

    // Whole 4-float groups left inside the residual (0..3 of them), unrolled.
    if constexpr (residual / 4 >= 1) {
        InnerProductStep(cur1, cur2, acc0);
    }
    if constexpr (residual / 4 >= 2) {
        InnerProductStep(cur1, cur2, acc1);
    }
    if constexpr (residual / 4 >= 3) {
        InnerProductStep(cur1, cur2, acc2);
    }

    // Final 0..3 scalars: load lane by lane into zeroed registers so the
    // unused lanes contribute nothing to the product.
    if constexpr (residual % 4 != 0) {
        float32x4_t tail1 = vdupq_n_f32(0.0f);
        float32x4_t tail2 = vdupq_n_f32(0.0f);

        if constexpr (residual % 4 >= 1) {
            tail1 = vld1q_lane_f32(cur1, tail1, 0);
            tail2 = vld1q_lane_f32(cur2, tail2, 0);
        }
        if constexpr (residual % 4 >= 2) {
            tail1 = vld1q_lane_f32(cur1 + 1, tail1, 1);
            tail2 = vld1q_lane_f32(cur2 + 1, tail2, 1);
        }
        if constexpr (residual % 4 >= 3) {
            tail1 = vld1q_lane_f32(cur1 + 2, tail1, 2);
            tail2 = vld1q_lane_f32(cur2 + 2, tail2, 2);
        }

        acc3 = vmlaq_f32(acc3, tail1, tail2);
    }

    // Pairwise-combine the four accumulators, then reduce horizontally:
    // fold 4 lanes -> 2 -> 1.
    float32x4_t combined = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
    float32x2_t halves = vadd_f32(vget_low_f32(combined), vget_high_f32(combined));
    float32x2_t reduced = vpadd_f32(halves, halves);

    return 1.0f - vget_lane_f32(reduced, 0);
}

src/VecSim/spaces/IP/IP_SVE_FP32.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
9+
#include <arm_sve.h>
10+
11+
// One full-width (unpredicated) step: sum += v1 * v2 for svcntw() floats read
// at pVect{1,2} + offset, then advance offset by one SVE vector of 32-bit lanes.
static inline void InnerProductStep(float *&pVect1, float *&pVect2, size_t &offset,
                                    svfloat32_t &sum) {
    const svbool_t all_lanes = svptrue_b32();
    svfloat32_t lhs = svld1_f32(all_lanes, pVect1 + offset);
    svfloat32_t rhs = svld1_f32(all_lanes, pVect2 + offset);

    sum = svmla_f32_x(all_lanes, sum, lhs, rhs);

    offset += svcntw();
}
20+
21+
/* Inner-product distance (1 - dot product) for fp32 vectors using SVE
 * (vector-length agnostic).
 * Template parameters:
 *   partial_chunk    - true when dimension is not a multiple of the SVE
 *                      vector length, so a predicated tail load is needed.
 *   additional_steps - number of full SVE vectors (0..3) remaining after the
 *                      4-vector main loop.
 */
template <bool partial_chunk, unsigned char additional_steps>
float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *pVect1 = (float *)pVect1v;
    float *pVect2 = (float *)pVect2v;
    size_t offset = 0;

    // Number of 32-bit lanes in one SVE vector on this CPU.
    uint64_t sve_word_count = svcntw();

    // Four independent accumulators hide the multiply-accumulate latency.
    svfloat32_t sum0 = svdup_f32(0.0f);
    svfloat32_t sum1 = svdup_f32(0.0f);
    svfloat32_t sum2 = svdup_f32(0.0f);
    svfloat32_t sum3 = svdup_f32(0.0f);

    auto chunk_size = 4 * sve_word_count;
    const size_t number_of_chunks = dimension / chunk_size;
    for (size_t i = 0; i < number_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, offset, sum0);
        InnerProductStep(pVect1, pVect2, offset, sum1);
        InnerProductStep(pVect1, pVect2, offset, sum2);
        InnerProductStep(pVect1, pVect2, offset, sum3);
    }

    // Process remaining complete SVE vectors (0..3) that didn't fit into the
    // main loop. Fix: the third step now feeds sum2 (it previously reused
    // sum3), keeping the accumulators balanced and matching both the loop
    // above and the NEON variant's pattern.
    if constexpr (additional_steps >= 1) {
        InnerProductStep(pVect1, pVect2, offset, sum0);
    }
    if constexpr (additional_steps >= 2) {
        InnerProductStep(pVect1, pVect2, offset, sum1);
    }
    if constexpr (additional_steps >= 3) {
        InnerProductStep(pVect1, pVect2, offset, sum2);
    }

    // Predicated tail for the last dimension % svcntw() elements.
    if constexpr (partial_chunk) {
        // Predicate mask active only for lanes [offset, dimension).
        svbool_t pg =
            svwhilelt_b32(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));

        svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
        svfloat32_t v2 = svld1_f32(pg, pVect2 + offset);
        // _m form: lanes inactive in pg keep sum3's previous value.
        sum3 = svmla_f32_m(pg, sum3, v1, v2);
    }

    // Pairwise tree reduction of the accumulators, then one horizontal add.
    sum0 = svadd_f32_x(svptrue_b32(), sum0, sum1);
    sum2 = svadd_f32_x(svptrue_b32(), sum2, sum3);
    svfloat32_t sum_all = svadd_f32_x(svptrue_b32(), sum0, sum2);
    float result = svaddv_f32(svptrue_b32(), sum_all);
    return 1.0f - result;
}

src/VecSim/spaces/IP_space.cpp

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
#include "VecSim/spaces/functions/AVX512.h"
1010
#include "VecSim/spaces/functions/AVX.h"
1111
#include "VecSim/spaces/functions/SSE.h"
12+
#include "VecSim/spaces/functions/NEON.h"
13+
#include "VecSim/spaces/functions/SVE.h"
14+
#include "VecSim/spaces/functions/SVE2.h"
1215

1316
namespace spaces {
1417
dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_opt,
@@ -19,13 +22,14 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_
1922
}
2023

2124
dist_func_t<float> ret_dist_func = FP32_InnerProduct;
25+
2226
// Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
2327
if (dim < 16) {
2428
return ret_dist_func;
2529
}
30+
switch (arch_opt) {
2631
#ifdef CPU_FEATURES_ARCH_X86_64
2732

28-
switch (arch_opt) {
2933
case ARCH_OPT_AVX512_F:
3034
#ifdef OPT_AVX512F
3135
ret_dist_func = Choose_FP32_IP_implementation_AVX512(dim);
@@ -47,11 +51,29 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_
4751
*alignment = 4 * sizeof(float); // handles 4 floats
4852
break;
4953
#endif
54+
#endif // __x86_64__
55+
#ifdef CPU_FEATURES_ARCH_AARCH64
56+
case ARCH_OPT_SVE2:
57+
#ifdef OPT_SVE2
58+
ret_dist_func = Choose_FP32_IP_implementation_SVE2(dim);
59+
break;
60+
61+
#endif
62+
case ARCH_OPT_SVE:
63+
#ifdef OPT_SVE
64+
ret_dist_func = Choose_FP32_IP_implementation_SVE(dim);
65+
break;
66+
67+
#endif
68+
case ARCH_OPT_NEON:
69+
#ifdef OPT_NEON
70+
ret_dist_func = Choose_FP32_IP_implementation_NEON(dim);
71+
break;
72+
#endif
73+
#endif // CPU_FEATURES_ARCH_AARCH64
5074
case ARCH_OPT_NONE:
5175
break;
5276
} // switch
53-
54-
#endif // __x86_64__
5577
return ret_dist_func;
5678
}
5779

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
#include <arm_neon.h>
9+
10+
// Accumulate squared differences for one 4-float group:
// sum += (pVect1[i] - pVect2[i])^2 lane-wise; advances both cursors by 4.
static inline void L2SquareStep(float *&pVect1, float *&pVect2, float32x4_t &sum) {
    const float32x4_t lhs = vld1q_f32(pVect1);
    const float32x4_t rhs = vld1q_f32(pVect2);
    const float32x4_t delta = vsubq_f32(lhs, rhs);
    sum = vmlaq_f32(sum, delta, delta);
    pVect1 += 4;
    pVect2 += 4;
}
21+
22+
/* Squared L2 (Euclidean) distance for fp32 vectors using NEON.
 * Template parameter `residual` is dimension % 16 (0..15); the caller selects
 * the matching instantiation so the tail handling is fully unrolled. */
template <unsigned char residual> // 0..15
float FP32_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *cur1 = (float *)pVect1v;
    float *cur2 = (float *)pVect2v;

    // Four independent accumulators keep the multiply-accumulate pipeline busy.
    float32x4_t acc0 = vdupq_n_f32(0.0f);
    float32x4_t acc1 = vdupq_n_f32(0.0f);
    float32x4_t acc2 = vdupq_n_f32(0.0f);
    float32x4_t acc3 = vdupq_n_f32(0.0f);

    // Main loop: 16 floats (four NEON vectors) per iteration.
    for (size_t chunk = dimension / 16; chunk != 0; --chunk) {
        L2SquareStep(cur1, cur2, acc0);
        L2SquareStep(cur1, cur2, acc1);
        L2SquareStep(cur1, cur2, acc2);
        L2SquareStep(cur1, cur2, acc3);
    }

    // Whole 4-float groups left inside the residual (0..3 of them), unrolled.
    if constexpr (residual / 4 >= 1) {
        L2SquareStep(cur1, cur2, acc0);
    }
    if constexpr (residual / 4 >= 2) {
        L2SquareStep(cur1, cur2, acc1);
    }
    if constexpr (residual / 4 >= 3) {
        L2SquareStep(cur1, cur2, acc2);
    }

    // Final 0..3 scalars: load lane by lane into zeroed registers so unused
    // lanes have zero difference and contribute nothing to the sum.
    if constexpr (residual % 4 != 0) {
        float32x4_t tail1 = vdupq_n_f32(0.0f);
        float32x4_t tail2 = vdupq_n_f32(0.0f);

        if constexpr (residual % 4 >= 1) {
            tail1 = vld1q_lane_f32(cur1, tail1, 0);
            tail2 = vld1q_lane_f32(cur2, tail2, 0);
        }
        if constexpr (residual % 4 >= 2) {
            tail1 = vld1q_lane_f32(cur1 + 1, tail1, 1);
            tail2 = vld1q_lane_f32(cur2 + 1, tail2, 1);
        }
        if constexpr (residual % 4 >= 3) {
            tail1 = vld1q_lane_f32(cur1 + 2, tail1, 2);
            tail2 = vld1q_lane_f32(cur2 + 2, tail2, 2);
        }

        float32x4_t delta = vsubq_f32(tail1, tail2);
        acc3 = vmlaq_f32(acc3, delta, delta);
    }

    // Pairwise-combine the four accumulators, then reduce horizontally:
    // fold 4 lanes -> 2 -> 1.
    float32x4_t combined = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
    float32x2_t halves = vadd_f32(vget_low_f32(combined), vget_high_f32(combined));
    float32x2_t reduced = vpadd_f32(halves, halves);

    return vget_lane_f32(reduced, 0);
}

0 commit comments

Comments
 (0)