Skip to content

Commit 7858fc8

Browse files
dor-forerlerman25
and authored
[0.7] Add support arm opt fp32 intrinsics [MOD-9011] (#633)
* Adapt arm to 0.7 * add space aux * Compile changes * format * changes * Changes * format * By arch * format * format * Change the order * Change order * Change the order * remove * change the order --------- Co-authored-by: Omer <lerman25@gmail.com>
1 parent 13fc524 commit 7858fc8

23 files changed

+740
-19
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
include(CheckCXXCompilerFlag)

message(STATUS "Building for ARM aarch64")

# Probe which -march flags the *compiler* accepts. Note: CHECK_CXX_COMPILER_FLAG
# only verifies compiler support; it says nothing about the CPU the binary will
# run on. Runtime CPU-feature dispatch must gate the actual use of each kernel.
CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)

# Expose one OPT_* definition per instruction-set family the compiler can target.
if(CXX_SVE2)
    message(STATUS "Compiler supports ARMv9.0-a with SVE2")
    add_compile_definitions(OPT_SVE2)
endif()
if(CXX_ARMV8A OR CXX_ARMV7_NEON)
    add_compile_definitions(OPT_NEON)
endif()
if(CXX_SVE)
    add_compile_definitions(OPT_SVE)
endif()

src/VecSim/spaces/CMakeLists.txt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,31 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
4444
endif()
4545
endif()
4646

if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
    include(${root}/cmake/aarch64InstructionFlags.cmake)

    # NEON kernels. aarch64InstructionFlags.cmake defines OPT_NEON when EITHER
    # -march=armv8-a or -march=armv7-a+neon is accepted, so NEON.cpp must be
    # compiled in both cases; previously only CXX_ARMV8A built it, which left
    # OPT_NEON defined with no implementation to link on ARMv7-only toolchains.
    if(CXX_ARMV8A)
        message("Building with ARMV8A")
        set_source_files_properties(functions/NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a")
        list(APPEND OPTIMIZATIONS functions/NEON.cpp)
    elseif(CXX_ARMV7_NEON)
        message("Building with ARMv7-A NEON")
        set_source_files_properties(functions/NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv7-a+neon")
        list(APPEND OPTIMIZATIONS functions/NEON.cpp)
    endif()

    # SVE support
    if(CXX_SVE)
        message("Building with SVE")
        set_source_files_properties(functions/SVE.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a+sve")
        list(APPEND OPTIMIZATIONS functions/SVE.cpp)
    endif()

    # SVE2 support
    if(CXX_SVE2)
        message("Building with ARMV9A and SVE2")
        set_source_files_properties(functions/SVE2.cpp PROPERTIES COMPILE_FLAGS "-march=armv9-a+sve2")
        list(APPEND OPTIMIZATIONS functions/SVE2.cpp)
    endif()
endif()
71+
4772
# Here we are compiling the space selectors with the relevant optimization flag.
4873
add_library(VectorSimilaritySpaces
4974
space_aux.cpp
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
#include <arm_neon.h>
9+
10+
// Multiply-accumulate one 4-float group: sum += pVect1[0..3] * pVect2[0..3]
// (lane-wise), advancing both cursors by 4 floats.
static inline void InnerProductStep(float *&pVect1, float *&pVect2, float32x4_t &sum) {
    const float32x4_t lhs = vld1q_f32(pVect1);
    const float32x4_t rhs = vld1q_f32(pVect2);
    pVect1 += 4;
    pVect2 += 4;
    sum = vmlaq_f32(sum, lhs, rhs);
}
17+
18+
/* Inner-product distance (1 - dot product) for fp32 vectors using NEON.
 * Template parameter `residual` is dimension % 16 (0..15); the caller selects
 * the matching instantiation so the tail handling is fully unrolled. */
template <unsigned char residual> // 0..15
float FP32_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *cur1 = (float *)pVect1v;
    float *cur2 = (float *)pVect2v;

    // Four independent accumulators keep the multiply-accumulate pipeline busy.
    float32x4_t acc0 = vdupq_n_f32(0.0f);
    float32x4_t acc1 = vdupq_n_f32(0.0f);
    float32x4_t acc2 = vdupq_n_f32(0.0f);
    float32x4_t acc3 = vdupq_n_f32(0.0f);

    // Main loop: 16 floats (four NEON vectors) per iteration.
    for (size_t chunk = dimension / 16; chunk != 0; --chunk) {
        InnerProductStep(cur1, cur2, acc0);
        InnerProductStep(cur1, cur2, acc1);
        InnerProductStep(cur1, cur2, acc2);
        InnerProductStep(cur1, cur2, acc3);
    }

    // Whole 4-float groups left inside the residual (0..3 of them), unrolled.
    if constexpr (residual / 4 >= 1) {
        InnerProductStep(cur1, cur2, acc0);
    }
    if constexpr (residual / 4 >= 2) {
        InnerProductStep(cur1, cur2, acc1);
    }
    if constexpr (residual / 4 >= 3) {
        InnerProductStep(cur1, cur2, acc2);
    }

    // Final 0..3 scalars: load lane by lane into zeroed registers so the
    // unused lanes contribute nothing to the product.
    if constexpr (residual % 4 != 0) {
        float32x4_t tail1 = vdupq_n_f32(0.0f);
        float32x4_t tail2 = vdupq_n_f32(0.0f);

        if constexpr (residual % 4 >= 1) {
            tail1 = vld1q_lane_f32(cur1, tail1, 0);
            tail2 = vld1q_lane_f32(cur2, tail2, 0);
        }
        if constexpr (residual % 4 >= 2) {
            tail1 = vld1q_lane_f32(cur1 + 1, tail1, 1);
            tail2 = vld1q_lane_f32(cur2 + 1, tail2, 1);
        }
        if constexpr (residual % 4 >= 3) {
            tail1 = vld1q_lane_f32(cur1 + 2, tail1, 2);
            tail2 = vld1q_lane_f32(cur2 + 2, tail2, 2);
        }

        acc3 = vmlaq_f32(acc3, tail1, tail2);
    }

    // Pairwise-combine the four accumulators, then reduce horizontally:
    // fold 4 lanes -> 2 -> 1.
    float32x4_t combined = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
    float32x2_t halves = vadd_f32(vget_low_f32(combined), vget_high_f32(combined));
    float32x2_t reduced = vpadd_f32(halves, halves);

    return 1.0f - vget_lane_f32(reduced, 0);
}

src/VecSim/spaces/IP/IP_SVE_FP32.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
9+
#include <arm_sve.h>
10+
11+
// One full-width (unpredicated) step: sum += v1 * v2 for svcntw() floats read
// at pVect{1,2} + offset, then advance offset by one SVE vector of 32-bit lanes.
static inline void InnerProductStep(float *&pVect1, float *&pVect2, size_t &offset,
                                    svfloat32_t &sum) {
    const svbool_t all_lanes = svptrue_b32();
    svfloat32_t lhs = svld1_f32(all_lanes, pVect1 + offset);
    svfloat32_t rhs = svld1_f32(all_lanes, pVect2 + offset);

    sum = svmla_f32_x(all_lanes, sum, lhs, rhs);

    offset += svcntw();
}
20+
21+
/* Inner-product distance (1 - dot product) for fp32 vectors using SVE
 * (vector-length agnostic).
 * Template parameters:
 *   partial_chunk    - true when dimension is not a multiple of the SVE
 *                      vector length, so a predicated tail load is needed.
 *   additional_steps - number of full SVE vectors (0..3) remaining after the
 *                      4-vector main loop.
 */
template <bool partial_chunk, unsigned char additional_steps>
float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *pVect1 = (float *)pVect1v;
    float *pVect2 = (float *)pVect2v;
    size_t offset = 0;

    // Number of 32-bit lanes in one SVE vector on this CPU.
    uint64_t sve_word_count = svcntw();

    // Four independent accumulators hide the multiply-accumulate latency.
    svfloat32_t sum0 = svdup_f32(0.0f);
    svfloat32_t sum1 = svdup_f32(0.0f);
    svfloat32_t sum2 = svdup_f32(0.0f);
    svfloat32_t sum3 = svdup_f32(0.0f);

    auto chunk_size = 4 * sve_word_count;
    const size_t number_of_chunks = dimension / chunk_size;
    for (size_t i = 0; i < number_of_chunks; i++) {
        InnerProductStep(pVect1, pVect2, offset, sum0);
        InnerProductStep(pVect1, pVect2, offset, sum1);
        InnerProductStep(pVect1, pVect2, offset, sum2);
        InnerProductStep(pVect1, pVect2, offset, sum3);
    }

    // Process remaining complete SVE vectors (0..3) that didn't fit into the
    // main loop. Fix: the third step now feeds sum2 (it previously reused
    // sum3), keeping the accumulators balanced and matching both the loop
    // above and the NEON variant's pattern.
    if constexpr (additional_steps >= 1) {
        InnerProductStep(pVect1, pVect2, offset, sum0);
    }
    if constexpr (additional_steps >= 2) {
        InnerProductStep(pVect1, pVect2, offset, sum1);
    }
    if constexpr (additional_steps >= 3) {
        InnerProductStep(pVect1, pVect2, offset, sum2);
    }

    // Predicated tail for the last dimension % svcntw() elements.
    if constexpr (partial_chunk) {
        // Predicate mask active only for lanes [offset, dimension).
        svbool_t pg =
            svwhilelt_b32(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));

        svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
        svfloat32_t v2 = svld1_f32(pg, pVect2 + offset);
        // _m form: lanes inactive in pg keep sum3's previous value.
        sum3 = svmla_f32_m(pg, sum3, v1, v2);
    }

    // Pairwise tree reduction of the accumulators, then one horizontal add.
    sum0 = svadd_f32_x(svptrue_b32(), sum0, sum1);
    sum2 = svadd_f32_x(svptrue_b32(), sum2, sum3);
    svfloat32_t sum_all = svadd_f32_x(svptrue_b32(), sum0, sum2);
    float result = svaddv_f32(svptrue_b32(), sum_all);
    return 1.0f - result;
}

src/VecSim/spaces/IP_space.cpp

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
#include "VecSim/spaces/functions/AVX512.h"
1010
#include "VecSim/spaces/functions/AVX.h"
1111
#include "VecSim/spaces/functions/SSE.h"
12+
#include "VecSim/spaces/functions/NEON.h"
13+
#include "VecSim/spaces/functions/SVE.h"
14+
#include "VecSim/spaces/functions/SVE2.h"
1215

1316
namespace spaces {
1417
dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_opt,
@@ -19,13 +22,14 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_
1922
}
2023

2124
dist_func_t<float> ret_dist_func = FP32_InnerProduct;
25+
2226
// Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
2327
if (dim < 16) {
2428
return ret_dist_func;
2529
}
30+
switch (arch_opt) {
2631
#ifdef CPU_FEATURES_ARCH_X86_64
2732

28-
switch (arch_opt) {
2933
case ARCH_OPT_AVX512_F:
3034
#ifdef OPT_AVX512F
3135
ret_dist_func = Choose_FP32_IP_implementation_AVX512(dim);
@@ -47,11 +51,29 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, const Arch_Optimization arch_
4751
*alignment = 4 * sizeof(float); // handles 4 floats
4852
break;
4953
#endif
54+
#endif // __x86_64__
55+
#ifdef CPU_FEATURES_ARCH_AARCH64
56+
case ARCH_OPT_SVE2:
57+
#ifdef OPT_SVE2
58+
ret_dist_func = Choose_FP32_IP_implementation_SVE2(dim);
59+
break;
60+
61+
#endif
62+
case ARCH_OPT_SVE:
63+
#ifdef OPT_SVE
64+
ret_dist_func = Choose_FP32_IP_implementation_SVE(dim);
65+
break;
66+
67+
#endif
68+
case ARCH_OPT_NEON:
69+
#ifdef OPT_NEON
70+
ret_dist_func = Choose_FP32_IP_implementation_NEON(dim);
71+
break;
72+
#endif
73+
#endif // CPU_FEATURES_ARCH_AARCH64
5074
case ARCH_OPT_NONE:
5175
break;
5276
} // switch
53-
54-
#endif // __x86_64__
5577
return ret_dist_func;
5678
}
5779

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
#include <arm_neon.h>
9+
10+
// Accumulate squared differences for one 4-float group:
// sum += (pVect1[i] - pVect2[i])^2 lane-wise; advances both cursors by 4.
static inline void L2SquareStep(float *&pVect1, float *&pVect2, float32x4_t &sum) {
    const float32x4_t lhs = vld1q_f32(pVect1);
    const float32x4_t rhs = vld1q_f32(pVect2);
    const float32x4_t delta = vsubq_f32(lhs, rhs);
    sum = vmlaq_f32(sum, delta, delta);
    pVect1 += 4;
    pVect2 += 4;
}
21+
22+
/* Squared L2 (Euclidean) distance for fp32 vectors using NEON.
 * Template parameter `residual` is dimension % 16 (0..15); the caller selects
 * the matching instantiation so the tail handling is fully unrolled. */
template <unsigned char residual> // 0..15
float FP32_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    float *cur1 = (float *)pVect1v;
    float *cur2 = (float *)pVect2v;

    // Four independent accumulators keep the multiply-accumulate pipeline busy.
    float32x4_t acc0 = vdupq_n_f32(0.0f);
    float32x4_t acc1 = vdupq_n_f32(0.0f);
    float32x4_t acc2 = vdupq_n_f32(0.0f);
    float32x4_t acc3 = vdupq_n_f32(0.0f);

    // Main loop: 16 floats (four NEON vectors) per iteration.
    for (size_t chunk = dimension / 16; chunk != 0; --chunk) {
        L2SquareStep(cur1, cur2, acc0);
        L2SquareStep(cur1, cur2, acc1);
        L2SquareStep(cur1, cur2, acc2);
        L2SquareStep(cur1, cur2, acc3);
    }

    // Whole 4-float groups left inside the residual (0..3 of them), unrolled.
    if constexpr (residual / 4 >= 1) {
        L2SquareStep(cur1, cur2, acc0);
    }
    if constexpr (residual / 4 >= 2) {
        L2SquareStep(cur1, cur2, acc1);
    }
    if constexpr (residual / 4 >= 3) {
        L2SquareStep(cur1, cur2, acc2);
    }

    // Final 0..3 scalars: load lane by lane into zeroed registers so unused
    // lanes have zero difference and contribute nothing to the sum.
    if constexpr (residual % 4 != 0) {
        float32x4_t tail1 = vdupq_n_f32(0.0f);
        float32x4_t tail2 = vdupq_n_f32(0.0f);

        if constexpr (residual % 4 >= 1) {
            tail1 = vld1q_lane_f32(cur1, tail1, 0);
            tail2 = vld1q_lane_f32(cur2, tail2, 0);
        }
        if constexpr (residual % 4 >= 2) {
            tail1 = vld1q_lane_f32(cur1 + 1, tail1, 1);
            tail2 = vld1q_lane_f32(cur2 + 1, tail2, 1);
        }
        if constexpr (residual % 4 >= 3) {
            tail1 = vld1q_lane_f32(cur1 + 2, tail1, 2);
            tail2 = vld1q_lane_f32(cur2 + 2, tail2, 2);
        }

        float32x4_t delta = vsubq_f32(tail1, tail2);
        acc3 = vmlaq_f32(acc3, delta, delta);
    }

    // Pairwise-combine the four accumulators, then reduce horizontally:
    // fold 4 lanes -> 2 -> 1.
    float32x4_t combined = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
    float32x2_t halves = vadd_f32(vget_low_f32(combined), vget_high_f32(combined));
    float32x2_t reduced = vpadd_f32(halves, halves);

    return vget_lane_f32(reduced, 0);
}

0 commit comments

Comments
 (0)