Skip to content

Commit c7f7089

Browse files
[0.8] [MOD - 6778] AVX512_FP16 distance functions (#489)
[MOD - 6778] AVX512_FP16 distance functions (#477) * change avx512 to avx512f to set the ground for avx512_fp16 (which is already used for avx512f for fp16) * format * more fixes * add OPT_AVX512_FP16 macro if supported IP & L2 : choose OPT_AVX512_FP16 if supported add unit tests for fp16 with advanced flag (seperate test as the baseline calculation is different) fp16 Benchmark: all of these changes will be moved to a new PR Duplicated bm class and made it templated only for fp16 so we can control the conversion function in SetUp also added high dimension bm to fp16 * format * install gcc12 ubuntu22 * set gcc12 as default * merge with the bm spaces small refactor * choose OPT_AVX512_FP16 depending on the dim IP: 500 L2: 440 * Upgrade gcc in rocky 8 (12) + 9 (13) * enable flow temp * rocky8 and ubuntu22: install gcc 13 * fix temp flow * jammy: revedrt to gcc12 * align gcov version with gcc version temp flow runcodcov on focal (before updating gcov version) * ubuntu20 align gcov version with gcc version * fix cast to avoid warning * add test for fp16 native type comparing to vecsim allow 1% error in fp16 advanced tests add explicit to float16 constructor to protected from implicit casts. * fix comment disable flow temp * use explicit ctor for bfloat to avoid implicit casts * Apply suggestions from code review Co-authored-by: GuyAv46 <47632673+GuyAv46@users.noreply.github.com> Co-authored-by: DvirDukhan <dvir@redis.com> * use explicit ctor for bfloat to avoid implicit casts * IP: mask mul instead of mask load as it is faster --------- Co-authored-by: GuyAv46 <47632673+GuyAv46@users.noreply.github.com> Co-authored-by: DvirDukhan <dvir@redis.com> (cherry picked from commit 9c9b7ac) Co-authored-by: meiravgri <109056284+meiravgri@users.noreply.github.com>
1 parent 45eb411 commit c7f7089

36 files changed

+441
-91
lines changed

.github/workflows/flow-temp.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,21 @@ on:
1111
push:
1212
branches-ignore: ['**'] # ignore all branches. Comment this line to run your workflow below on every push.
1313
jobs:
14-
bionic:
14+
rocky8:
1515
uses: ./.github/workflows/task-unit-test.yml
1616
with:
17-
container: ubuntu:bionic
17+
container: rockylinux:8
1818
run-valgrind: false
19+
run-codecov: false
20+
rocky9:
21+
uses: ./.github/workflows/task-unit-test.yml
22+
with:
23+
container: rockylinux:9
24+
run-valgrind: false
25+
run-codecov: false
26+
focal:
27+
uses: ./.github/workflows/task-unit-test.yml
28+
with:
29+
container: ubuntu:focal
30+
run-valgrind: false
31+
run-codecov: true

.install/rocky_linux_8.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ $MODE dnf groupinstall "Development Tools" -yqq
1111
# powertools is needed to install epel
1212
$MODE dnf config-manager --set-enabled powertools
1313

14-
# get epel to install gcc11
14+
# get epel to install gcc13
1515
$MODE dnf install epel-release -yqq
1616

17-
$MODE dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++ gcc-toolset-11-libatomic-devel make valgrind wget git
17+
$MODE dnf install -y gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ gcc-toolset-13-libatomic-devel make valgrind wget git
1818

19-
cp /opt/rh/gcc-toolset-11/enable /etc/profile.d/gcc-toolset-11.sh
19+
cp /opt/rh/gcc-toolset-13/enable /etc/profile.d/gcc-toolset-13.sh
2020

2121
source install_cmake.sh $MODE

.install/rocky_linux_9.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,8 @@ MODE=$1 # whether to install using sudo or not
33
set -e
44
export DEBIAN_FRONTEND=noninteractive
55
$MODE dnf update -y
6-
$MODE dnf install -y gcc gcc-c++ make wget git valgrind
6+
$MODE dnf install -y gcc-toolset-13-gcc gcc-toolset-13-gcc-c++ make wget git valgrind
7+
8+
cp /opt/rh/gcc-toolset-13/enable /etc/profile.d/gcc-toolset-13.sh
9+
710
source install_cmake.sh $MODE

.install/ubuntu_20.04.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ $MODE apt-get update -qq
77
$MODE apt install -yqq software-properties-common
88
$MODE add-apt-repository ppa:ubuntu-toolchain-r/test -y
99
$MODE apt update
10-
$MODE apt-get install -yqq wget gcc-11 g++-11 make clang-format gcc valgrind python3-pip lcov git
10+
$MODE apt-get install -yqq wget gcc-11 g++-11 make clang-format valgrind python3-pip lcov git
1111
$MODE update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11
12-
12+
# align gcov version with gcc version
13+
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 60
1314
source install_cmake.sh $MODE

.install/ubuntu_22.04.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,8 @@ export DEBIAN_FRONTEND=noninteractive
44
MODE=$1 # whether to install using sudo or not
55

66
$MODE apt-get update -qq || true
7-
$MODE apt-get install -yqq git wget build-essential valgrind lcov
7+
$MODE apt-get install -yqq gcc-12 g++-12 git wget build-essential valgrind lcov
8+
$MODE update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 60 --slave /usr/bin/g++ g++ /usr/bin/g++-12
9+
# align gcov version with gcc version
10+
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-12 60
811
source install_cmake.sh $MODE

cmake/x86_64InstructionFlags.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
1111
CHECK_CXX_COMPILER_FLAG(-mavx512bf16 CXX_AVX512BF16)
1212
CHECK_CXX_COMPILER_FLAG(-mavx512bw CXX_AVX512BW)
1313
CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2)
14+
CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16)
1415
CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F)
1516
CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
1617
CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
@@ -35,6 +36,10 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
3536
add_compile_definitions(OPT_AVX512_BF16_VL)
3637
endif()
3738

39+
if(CXX_AVX512FP16)
40+
add_compile_definitions(OPT_AVX512_FP16)
41+
endif()
42+
3843
if(CXX_AVX512F)
3944
add_compile_definitions(OPT_AVX512F)
4045
endif()

src/VecSim/spaces/CMakeLists.txt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,22 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
2626
list(APPEND OPTIMIZATIONS functions/AVX512BF16_VL.cpp)
2727
endif()
2828

29+
if(CXX_AVX512FP16)
30+
message("Building with AVX512FP16")
31+
set_source_files_properties(functions/AVX512FP16.cpp PROPERTIES COMPILE_FLAGS "-mavx512fp16")
32+
list(APPEND OPTIMIZATIONS functions/AVX512FP16.cpp)
33+
endif()
34+
2935
if(CXX_AVX512BW AND CXX_AVX512VBMI2)
3036
message("Building with AVX512BW and AVX512VBMI2")
3137
set_source_files_properties(functions/AVX512BW_VBMI2.cpp PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512vbmi2")
3238
list(APPEND OPTIMIZATIONS functions/AVX512BW_VBMI2.cpp)
3339
endif()
3440

3541
if(CXX_AVX512F)
36-
message("Building with AVX512")
37-
set_source_files_properties(functions/AVX512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f")
38-
list(APPEND OPTIMIZATIONS functions/AVX512.cpp)
42+
message("Building with AVX512F")
43+
set_source_files_properties(functions/AVX512F.cpp PROPERTIES COMPILE_FLAGS "-mavx512f")
44+
list(APPEND OPTIMIZATIONS functions/AVX512F.cpp)
3945
endif()
4046

4147
if(CXX_AVX2)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include <cstdint>
8+
#include "VecSim/spaces/space_includes.h"
9+
#include "VecSim/types/float16.h"
10+
#include <cstring>
11+
12+
using float16 = vecsim_types::float16;
13+
14+
static void InnerProductStep(float16 *&pVect1, float16 *&pVect2, __m512h &sum) {
15+
__m512h v1 = _mm512_loadu_ph(pVect1);
16+
__m512h v2 = _mm512_loadu_ph(pVect2);
17+
18+
sum = _mm512_fmadd_ph(v1, v2, sum);
19+
pVect1 += 32;
20+
pVect2 += 32;
21+
}
22+
23+
template <unsigned short residual> // 0..31
24+
float FP16_InnerProductSIMD32_AVX512FP16(const void *pVect1v, const void *pVect2v,
25+
size_t dimension) {
26+
auto *pVect1 = (float16 *)pVect1v;
27+
auto *pVect2 = (float16 *)pVect2v;
28+
29+
const float16 *pEnd1 = pVect1 + dimension;
30+
31+
__m512h sum = _mm512_setzero_ph();
32+
33+
if constexpr (residual) {
34+
constexpr __mmask32 mask = (1LU << residual) - 1;
35+
__m512h v1 = _mm512_loadu_ph(pVect1);
36+
pVect1 += residual;
37+
__m512h v2 = _mm512_loadu_ph(pVect2);
38+
pVect2 += residual;
39+
sum = _mm512_maskz_mul_ph(mask, v1, v2);
40+
}
41+
42+
// We dealt with the residual part. We are left with some multiple of 32 16-bit floats.
43+
do {
44+
InnerProductStep(pVect1, pVect2, sum);
45+
} while (pVect1 < pEnd1);
46+
47+
_Float16 res = _mm512_reduce_add_ph(sum);
48+
return _Float16(1) - res;
49+
}

0 commit comments

Comments
 (0)