RedisAI
diff --git a/‎.github/workflows/flow-temp.yml‎
Lines changed: 15 additions & 2 deletions b/‎.github/workflows/flow-temp.yml‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎.install/rocky_linux_8.sh‎
Lines changed: 3 additions & 3 deletions b/‎.install/rocky_linux_8.sh‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.install/rocky_linux_9.sh‎
Lines changed: 4 additions & 1 deletion b/‎.install/rocky_linux_9.sh‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎.install/ubuntu_20.04.sh‎
Lines changed: 3 additions & 2 deletions b/‎.install/ubuntu_20.04.sh‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎.install/ubuntu_22.04.sh‎
Lines changed: 4 additions & 1 deletion b/‎.install/ubuntu_22.04.sh‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎cmake/x86_64InstructionFlags.cmake‎
Lines changed: 5 additions & 0 deletions b/‎cmake/x86_64InstructionFlags.cmake‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/VecSim/spaces/CMakeLists.txt‎
Lines changed: 9 additions & 3 deletions b/‎src/VecSim/spaces/CMakeLists.txt‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎src/VecSim/spaces/IP/IP_AVX512FP16_FP16.h‎
Lines changed: 49 additions & 0 deletions b/‎src/VecSim/spaces/IP/IP_AVX512FP16_FP16.h‎
Lines changed: 49 additions & 0 deletions
diff --git a/src/VecSim/spaces/IP/IP_AVX512_FP16.h b/src/VecSim/spaces/IP/IP_AVX512F_FP16.h
rename from src/VecSim/spaces/IP/IP_AVX512_FP16.h
rename to src/VecSim/spaces/IP/IP_AVX512F_FP16.h
diff --git a/src/VecSim/spaces/IP/IP_AVX512_FP32.h b/src/VecSim/spaces/IP/IP_AVX512F_FP32.h
rename from src/VecSim/spaces/IP/IP_AVX512_FP32.h
rename to src/VecSim/spaces/IP/IP_AVX512F_FP32.h
@@ -11,8 +11,21 @@ on:
   push:
     branches-ignore: ['**'] # ignore all branches. Comment this line to run your workflow below on every push.
 jobs:
-  bionic:
+  rocky8:
     uses: ./.github/workflows/task-unit-test.yml
     with:
-      container: ubuntu:bionic
+      container: rockylinux:8
       run-valgrind: false
+      run-codecov: false
+  rocky9:
+    uses: ./.github/workflows/task-unit-test.yml
+    with:
+      container: rockylinux:9
+      run-valgrind: false
+      run-codecov: false
+  focal:
+    uses: ./.github/workflows/task-unit-test.yml
+    with:
+      container: ubuntu:focal
+      run-valgrind: false
+      run-codecov: true
@@ -11,11 +11,11 @@ $MODE dnf groupinstall "Development Tools" -yqq
 # powertools is needed to install epel
 $MODE dnf config-manager --set-enabled powertools
 
-# get epel to install gcc11
+# get epel to install gcc13
 $MODE dnf install epel-release -yqq
 
-$MODE dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++ gcc-toolset-11-libatomic-devel  make valgrind wget git
+$MODE dnf install -y gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ gcc-toolset-13-libatomic-devel  make valgrind wget git
 
-cp /opt/rh/gcc-toolset-11/enable /etc/profile.d/gcc-toolset-11.sh
+cp /opt/rh/gcc-toolset-13/enable /etc/profile.d/gcc-toolset-13.sh
 
 source install_cmake.sh $MODE
@@ -3,5 +3,8 @@ MODE=$1 # whether to install using sudo or not
 set -e
 export DEBIAN_FRONTEND=noninteractive
 $MODE dnf update -y
-$MODE dnf install -y gcc gcc-c++ make wget git valgrind
+$MODE dnf install -y gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ make wget git valgrind
+
+cp /opt/rh/gcc-toolset-13/enable /etc/profile.d/gcc-toolset-13.sh
+
 source install_cmake.sh $MODE
@@ -7,7 +7,8 @@ $MODE apt-get update -qq
 $MODE apt install -yqq software-properties-common
 $MODE add-apt-repository ppa:ubuntu-toolchain-r/test -y
 $MODE apt update
-$MODE apt-get install -yqq wget gcc-11 g++-11 make clang-format gcc valgrind python3-pip lcov git
+$MODE apt-get install -yqq wget gcc-11 g++-11 make clang-format valgrind python3-pip lcov git
 $MODE update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11
-
+# align gcov version with gcc version
+update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 60
 source install_cmake.sh $MODE
@@ -4,5 +4,8 @@ export DEBIAN_FRONTEND=noninteractive
 MODE=$1 # whether to install using sudo or not
 
 $MODE apt-get update -qq || true
-$MODE apt-get install -yqq git wget build-essential valgrind lcov
+$MODE apt-get install -yqq gcc-12 g++-12 git wget build-essential valgrind lcov
+$MODE update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 60 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+# align gcov version with gcc version
+update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-12 60
 source install_cmake.sh $MODE
@@ -11,6 +11,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 	CHECK_CXX_COMPILER_FLAG(-mavx512bf16 CXX_AVX512BF16)
 	CHECK_CXX_COMPILER_FLAG(-mavx512bw CXX_AVX512BW)
 	CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2)
+	CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16)
 	CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F)
 	CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
 	CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
@@ -35,6 +36,10 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		add_compile_definitions(OPT_AVX512_BF16_VL)
 	endif()
 
+	if(CXX_AVX512FP16)
+		add_compile_definitions(OPT_AVX512_FP16)
+	endif()
+
 	if(CXX_AVX512F)
 		add_compile_definitions(OPT_AVX512F)
 	endif()
 
@@ -26,16 +26,22 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		list(APPEND OPTIMIZATIONS functions/AVX512BF16_VL.cpp)
 	endif()
 
+	if(CXX_AVX512FP16)
+		message("Building with AVX512FP16")
+		set_source_files_properties(functions/AVX512FP16.cpp PROPERTIES COMPILE_FLAGS "-mavx512fp16")
+		list(APPEND OPTIMIZATIONS functions/AVX512FP16.cpp)
+	endif()
+
 	if(CXX_AVX512BW AND CXX_AVX512VBMI2)
 		message("Building with AVX512BW and AVX512VBMI2")
 		set_source_files_properties(functions/AVX512BW_VBMI2.cpp PROPERTIES COMPILE_FLAGS "-mavx512bw  -mavx512vbmi2")
 		list(APPEND OPTIMIZATIONS functions/AVX512BW_VBMI2.cpp)
 	endif()
 
 	if(CXX_AVX512F)
-		message("Building with AVX512")
-		set_source_files_properties(functions/AVX512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f")
-		list(APPEND OPTIMIZATIONS functions/AVX512.cpp)
+		message("Building with AVX512F")
+		set_source_files_properties(functions/AVX512F.cpp PROPERTIES COMPILE_FLAGS "-mavx512f")
+		list(APPEND OPTIMIZATIONS functions/AVX512F.cpp)
 	endif()
 
 	if(CXX_AVX2)
 
@@ -0,0 +1,49 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <cstdint>
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/float16.h"
+#include <cstring>
+
+using float16 = vecsim_types::float16;
+
+// Processes one full 512-bit chunk (32 half-precision lanes) of both vectors:
+// adds the lane-wise products v1[i] * v2[i] into `sum` and advances both
+// pointers past the chunk that was consumed.
+static void InnerProductStep(float16 *&pVect1, float16 *&pVect2, __m512h &sum) {
+    // Number of 16-bit floats that fit in one 512-bit register.
+    constexpr int lanes_per_register = 32;
+
+    // Unaligned loads feed a single fused multiply-add: sum += v1 * v2.
+    sum = _mm512_fmadd_ph(_mm512_loadu_ph(pVect1), _mm512_loadu_ph(pVect2), sum);
+
+    pVect1 += lanes_per_register;
+    pVect2 += lanes_per_register;
+}
+
+/**
+ * Inner-product distance (1 - <v1, v2>) between two float16 vectors, accumulated with
+ * native half-precision AVX512-FP16 instructions.
+ *
+ * @tparam residual  Number of leading elements (0..31) handled before the 32-lane main
+ *                   loop; presumably `dimension % 32` - confirm against the dispatcher.
+ * @param pVect1v    Pointer to the first vector, interpreted as an array of float16.
+ * @param pVect2v    Pointer to the second vector, interpreted as an array of float16.
+ * @param dimension  Number of float16 elements in each vector.
+ * @return 1 minus the accumulated dot product, as a float.
+ *
+ * NOTE(review): the residual step performs full (unmasked) 32-element loads and the main
+ * loop always runs at least once, so at least 32 elements are read from each vector.
+ * This looks like it expects dimension >= 32 - confirm against the caller.
+ */
+template <unsigned short residual> // 0..31
+float FP16_InnerProductSIMD32_AVX512FP16(const void *pVect1v, const void *pVect2v,
+                                         size_t dimension) {
+    auto *pVect1 = (float16 *)pVect1v;
+    auto *pVect2 = (float16 *)pVect2v;
+
+    const float16 *pEnd1 = pVect1 + dimension;
+
+    __m512h sum = _mm512_setzero_ph();
+
+    if constexpr (residual) {
+        // Keep only the lowest `residual` lanes of the product; the remaining lanes are
+        // zeroed so they do not contribute to the sum.
+        constexpr __mmask32 mask = (1LU << residual) - 1;
+        __m512h v1 = _mm512_loadu_ph(pVect1);
+        pVect1 += residual;
+        __m512h v2 = _mm512_loadu_ph(pVect2);
+        pVect2 += residual;
+        sum = _mm512_maskz_mul_ph(mask, v1, v2);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 32 16-bit floats.
+    do {
+        InnerProductStep(pVect1, pVect2, sum);
+    } while (pVect1 < pEnd1);
+
+    _Float16 res = _mm512_reduce_add_ph(sum);
+    // Widen to float before subtracting: doing `1 - res` in half precision would round the
+    // returned distance a second time to ~2^-11 resolution near 1, although the function
+    // returns a float.
+    return 1.0f - static_cast<float>(res);
+}