add constexpr to conditions where possible (ifs that test the residual which is the template parameter) (#484)

github-actions[bot] · meiravgri · web-flow · commit 9b57318bc854 · 2024-07-04T14:18:22.000+03:00
use fmadd in avx512 if possible (in avx2 it requires another flag) (cherry picked from commit 60b290a) Co-authored-by: meiravgri <meirav.grimberg@redis.com>
diff --git a/src/VecSim/spaces/IP/IP_AVX2_BF16.h b/src/VecSim/spaces/IP/IP_AVX2_BF16.h
@@ -108,7 +108,7 @@ float BF16_InnerProductSIMD32_AVX2(const void *pVect1v, const void *pVect2v, siz
     }
 
     // Do a single step if residual >=16
-    if (residual >= 16) {
+    if constexpr (residual >= 16) {
         InnerProductStep(pVect1, pVect2, sum_prod);
     }
 
diff --git a/src/VecSim/spaces/IP/IP_AVX512_FP16.h b/src/VecSim/spaces/IP/IP_AVX512_FP16.h
@@ -31,7 +31,7 @@ float FP16_InnerProductSIMD32_AVX512(const void *pVect1v, const void *pVect2v, s
 
     auto sum = _mm512_setzero_ps();
 
-    if (residual % 16) {
+    if constexpr (residual % 16) {
         // Deal with remainder first. `dim` is more than 32, so we have at least one block of 32
         // 16-bit float so mask loading is guaranteed to be safe.
         __mmask16 constexpr residuals_mask = (1 << (residual % 16)) - 1;
@@ -46,7 +46,7 @@ float FP16_InnerProductSIMD32_AVX512(const void *pVect1v, const void *pVect2v, s
         pVect1 += residual % 16;
         pVect2 += residual % 16;
     }
-    if (residual >= 16) {
+    if constexpr (residual >= 16) {
         InnerProductStep(pVect1, pVect2, sum);
     }
 
diff --git a/src/VecSim/spaces/IP/IP_AVX512_FP32.h b/src/VecSim/spaces/IP/IP_AVX512_FP32.h
@@ -11,7 +11,7 @@ static inline void InnerProductStep(float *&pVect1, float *&pVect2, __m512 &sum5
     pVect1 += 16;
     __m512 v2 = _mm512_loadu_ps(pVect2);
     pVect2 += 16;
-    sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
+    sum512 = _mm512_fmadd_ps(v1, v2, sum512);
 }
 
 template <unsigned char residual> // 0..15
@@ -25,7 +25,7 @@ float FP32_InnerProductSIMD16_AVX512(const void *pVect1v, const void *pVect2v, s
 
     // Deal with remainder first. `dim` is more than 16, so we have at least one 16-float block,
     // so mask loading is guaranteed to be safe
-    if (residual) {
+    if constexpr (residual) {
         __mmask16 constexpr mask = (1 << residual) - 1;
         __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1);
         pVect1 += residual;
diff --git a/src/VecSim/spaces/IP/IP_AVX512_FP64.h b/src/VecSim/spaces/IP/IP_AVX512_FP64.h
@@ -11,7 +11,7 @@ static inline void InnerProductStep(double *&pVect1, double *&pVect2, __m512d &s
     pVect1 += 8;
     __m512d v2 = _mm512_loadu_pd(pVect2);
     pVect2 += 8;
-    sum512 = _mm512_add_pd(sum512, _mm512_mul_pd(v1, v2));
+    sum512 = _mm512_fmadd_pd(v1, v2, sum512);
 }
 
 template <unsigned char residual> // 0..7
@@ -25,7 +25,7 @@ double FP64_InnerProductSIMD8_AVX512(const void *pVect1v, const void *pVect2v, s
 
     // Deal with remainder first. `dim` is more than 8, so we have at least one 8-double block,
     // so mask loading is guaranteed to be safe
-    if (residual) {
+    if constexpr (residual) {
         __mmask8 constexpr mask = (1 << residual) - 1;
         __m512d v1 = _mm512_maskz_loadu_pd(mask, pVect1);
         pVect1 += residual;
diff --git a/src/VecSim/spaces/IP/IP_AVX_FP32.h b/src/VecSim/spaces/IP/IP_AVX_FP32.h
@@ -26,7 +26,7 @@ float FP32_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size
 
     // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
     // 16-float block, so mask loading is guaranteed to be safe.
-    if (residual % 8) {
+    if constexpr (residual % 8) {
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
@@ -36,7 +36,7 @@ float FP32_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size
     }
 
     // If the reminder is >=8, have another step of 8 floats
-    if (residual >= 8) {
+    if constexpr (residual >= 8) {
         InnerProductStep(pVect1, pVect2, sum256);
     }
 
diff --git a/src/VecSim/spaces/IP/IP_AVX_FP64.h b/src/VecSim/spaces/IP/IP_AVX_FP64.h
@@ -26,7 +26,7 @@ double FP64_InnerProductSIMD8_AVX(const void *pVect1v, const void *pVect2v, size
 
     // Deal with 1-3 doubles with mask loading, if needed. `dim` is >8, so we have at least one
     // 8-double block, so mask loading is guaranteed to be safe.
-    if (residual % 4) {
+    if constexpr (residual % 4) {
         // _mm256_maskz_loadu_pd is not available in AVX
         __mmask8 constexpr mask = (1 << (residual % 4)) - 1;
         __m256d v1 = my_mm256_maskz_loadu_pd<mask>(pVect1);
@@ -37,7 +37,7 @@ double FP64_InnerProductSIMD8_AVX(const void *pVect1v, const void *pVect2v, size
     }
 
     // If the reminder is >=4, have another step of 4 doubles
-    if (residual >= 4) {
+    if constexpr (residual >= 4) {
         InnerProductStep(pVect1, pVect2, sum256);
     }
 
diff --git a/src/VecSim/spaces/IP/IP_F16C_FP16.h b/src/VecSim/spaces/IP/IP_F16C_FP16.h
@@ -31,7 +31,7 @@ float FP16_InnerProductSIMD32_F16C(const void *pVect1v, const void *pVect2v, siz
 
     auto sum = _mm256_setzero_ps();
 
-    if (residual % 8) {
+    if constexpr (residual % 8) {
         // Deal with remainder first. `dim` is more than 32, so we have at least one block of 32
         // 16-bit float so mask loading is guaranteed to be safe.
         __mmask16 constexpr residuals_mask = (1 << (residual % 8)) - 1;
@@ -47,12 +47,12 @@ float FP16_InnerProductSIMD32_F16C(const void *pVect1v, const void *pVect2v, siz
         pVect1 += residual % 8;
         pVect2 += residual % 8;
     }
-    if (residual >= 8 && residual < 16) {
+    if constexpr (residual >= 8 && residual < 16) {
         InnerProductStep(pVect1, pVect2, sum);
-    } else if (residual >= 16 && residual < 24) {
+    } else if constexpr (residual >= 16 && residual < 24) {
         InnerProductStep(pVect1, pVect2, sum);
         InnerProductStep(pVect1, pVect2, sum);
-    } else if (residual >= 24) {
+    } else if constexpr (residual >= 24) {
         InnerProductStep(pVect1, pVect2, sum);
         InnerProductStep(pVect1, pVect2, sum);
         InnerProductStep(pVect1, pVect2, sum);
diff --git a/src/VecSim/spaces/IP/IP_SSE_FP32.h b/src/VecSim/spaces/IP/IP_SSE_FP32.h
@@ -25,19 +25,19 @@ float FP32_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size
 
     // Deal with %4 remainder first. `dim` is >16, so we have at least one 16-float block,
     // so loading 4 floats and then masking them is safe.
-    if (residual % 4) {
+    if constexpr (residual % 4) {
         __m128 v1, v2;
-        if (residual % 4 == 3) {
+        if constexpr (residual % 4 == 3) {
             // Load 3 floats and set the last one to 0
             v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0
             v2 = _mm_load_ss(pVect2);
             v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1));
             v2 = _mm_loadh_pi(v2, (__m64 *)(pVect2 + 1));
-        } else if (residual % 4 == 2) {
+        } else if constexpr (residual % 4 == 2) {
             // Load 2 floats and set the last two to 0
             v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
             v2 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect2);
-        } else if (residual % 4 == 1) {
+        } else if constexpr (residual % 4 == 1) {
             // Load 1 float and set the last three to 0
             v1 = _mm_load_ss(pVect1);
             v2 = _mm_load_ss(pVect2);
@@ -48,11 +48,11 @@ float FP32_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size
     }
 
     // have another 1, 2 or 3 4-float steps according to residual
-    if (residual >= 12)
+    if constexpr (residual >= 12)
         InnerProductStep(pVect1, pVect2, sum_prod);
-    if (residual >= 8)
+    if constexpr (residual >= 8)
         InnerProductStep(pVect1, pVect2, sum_prod);
-    if (residual >= 4)
+    if constexpr (residual >= 4)
         InnerProductStep(pVect1, pVect2, sum_prod);
 
     // We dealt with the residual part. We are left with some multiple of 16 floats.
diff --git a/src/VecSim/spaces/IP/IP_SSE_FP64.h b/src/VecSim/spaces/IP/IP_SSE_FP64.h
@@ -25,7 +25,7 @@ double FP64_InnerProductSIMD8_SSE(const void *pVect1v, const void *pVect2v, size
     __m128d sum_prod = _mm_setzero_pd();
 
     // If residual is odd, we load 1 double and set the last one to 0
-    if (residual % 2 == 1) {
+    if constexpr (residual % 2 == 1) {
         __m128d v1 = _mm_load_sd(pVect1);
         pVect1++;
         __m128d v2 = _mm_load_sd(pVect2);
@@ -34,11 +34,11 @@ double FP64_InnerProductSIMD8_SSE(const void *pVect1v, const void *pVect2v, size
     }
 
     // have another 1, 2 or 3 2-double steps according to residual
-    if (residual >= 6)
+    if constexpr (residual >= 6)
         InnerProductStep(pVect1, pVect2, sum_prod);
-    if (residual >= 4)
+    if constexpr (residual >= 4)
         InnerProductStep(pVect1, pVect2, sum_prod);
-    if (residual >= 2)
+    if constexpr (residual >= 2)
         InnerProductStep(pVect1, pVect2, sum_prod);
 
     // We dealt with the residual part. We are left with some multiple of 8 doubles.
diff --git a/src/VecSim/spaces/L2/L2_AVX2_BF16.h b/src/VecSim/spaces/L2/L2_AVX2_BF16.h
@@ -106,7 +106,7 @@ float BF16_L2SqrSIMD32_AVX2(const void *pVect1v, const void *pVect2v, size_t dim
     }
 
     // Do a single step if residual >=16
-    if (residual >= 16) {
+    if constexpr (residual >= 16) {
         L2SqrStep(pVect1, pVect2, sum);
     }
 
diff --git a/src/VecSim/spaces/L2/L2_AVX512_FP16.h b/src/VecSim/spaces/L2/L2_AVX512_FP16.h
@@ -32,7 +32,7 @@ float FP16_L2SqrSIMD32_AVX512(const void *pVect1v, const void *pVect2v, size_t d
 
     auto sum = _mm512_setzero_ps();
 
-    if (residual % 16) {
+    if constexpr (residual % 16) {
         // Deal with remainder first. `dim` is more than 32, so we have at least one block of 32
         // 16-bit float so mask loading is guaranteed to be safe.
         __mmask16 constexpr residuals_mask = (1 << (residual % 16)) - 1;
@@ -48,7 +48,7 @@ float FP16_L2SqrSIMD32_AVX512(const void *pVect1v, const void *pVect2v, size_t d
         pVect1 += residual % 16;
         pVect2 += residual % 16;
     }
-    if (residual >= 16) {
+    if constexpr (residual >= 16) {
         L2SqrStep(pVect1, pVect2, sum);
     }
 
diff --git a/src/VecSim/spaces/L2/L2_AVX512_FP32.h b/src/VecSim/spaces/L2/L2_AVX512_FP32.h
@@ -12,8 +12,8 @@ static inline void L2SqrStep(float *&pVect1, float *&pVect2, __m512 &sum) {
     __m512 v2 = _mm512_loadu_ps(pVect2);
     pVect2 += 16;
     __m512 diff = _mm512_sub_ps(v1, v2);
-    // sum = _mm512_fmadd_ps(diff, diff, sum);
-    sum = _mm512_add_ps(sum, _mm512_mul_ps(diff, diff));
+
+    sum = _mm512_fmadd_ps(diff, diff, sum);
 }
 
 template <unsigned char residual> // 0..15
@@ -27,7 +27,7 @@ float FP32_L2SqrSIMD16_AVX512(const void *pVect1v, const void *pVect2v, size_t d
 
     // Deal with remainder first. `dim` is more than 16, so we have at least one 16-float block,
     // so mask loading is guaranteed to be safe
-    if (residual) {
+    if constexpr (residual) {
         __mmask16 constexpr mask = (1 << residual) - 1;
         __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1);
         pVect1 += residual;
diff --git a/src/VecSim/spaces/L2/L2_AVX512_FP64.h b/src/VecSim/spaces/L2/L2_AVX512_FP64.h
@@ -12,8 +12,8 @@ static inline void L2SqrStep(double *&pVect1, double *&pVect2, __m512d &sum) {
     __m512d v2 = _mm512_loadu_pd(pVect2);
     pVect2 += 8;
     __m512d diff = _mm512_sub_pd(v1, v2);
-    // sum = _mm512_fmadd_pd(diff, diff, sum);
-    sum = _mm512_add_pd(sum, _mm512_mul_pd(diff, diff));
+
+    sum = _mm512_fmadd_pd(diff, diff, sum);
 }
 
 template <unsigned char residual> // 0..7
@@ -27,7 +27,7 @@ double FP64_L2SqrSIMD8_AVX512(const void *pVect1v, const void *pVect2v, size_t d
 
     // Deal with remainder first. `dim` is more than 8, so we have at least one 8-double block,
     // so mask loading is guaranteed to be safe
-    if (residual) {
+    if constexpr (residual) {
         __mmask8 constexpr mask = (1 << residual) - 1;
         __m512d v1 = _mm512_maskz_loadu_pd(mask, pVect1);
         pVect1 += residual;
diff --git a/src/VecSim/spaces/L2/L2_AVX_FP32.h b/src/VecSim/spaces/L2/L2_AVX_FP32.h
@@ -27,7 +27,7 @@ float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dime
     __m256 sum = _mm256_setzero_ps();
 
     // Deal with 1-7 floats with mask loading, if needed
-    if (residual % 8) {
+    if constexpr (residual % 8) {
         __mmask8 constexpr mask8 = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask8>(pVect1);
         pVect1 += residual % 8;
@@ -38,7 +38,7 @@ float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dime
     }
 
     // If the reminder is >=8, have another step of 8 floats
-    if (residual >= 8) {
+    if constexpr (residual >= 8) {
         L2SqrStep(pVect1, pVect2, sum);
     }
 
diff --git a/src/VecSim/spaces/L2/L2_AVX_FP64.h b/src/VecSim/spaces/L2/L2_AVX_FP64.h
@@ -27,7 +27,7 @@ double FP64_L2SqrSIMD8_AVX(const void *pVect1v, const void *pVect2v, size_t dime
     __m256d sum = _mm256_setzero_pd();
 
     // Deal with 1-3 doubles with mask loading, if needed
-    if (residual % 4) {
+    if constexpr (residual % 4) {
         // _mm256_maskz_loadu_pd is not available in AVX
         __mmask8 constexpr mask4 = (1 << (residual % 4)) - 1;
         __m256d v1 = my_mm256_maskz_loadu_pd<mask4>(pVect1);
@@ -39,7 +39,7 @@ double FP64_L2SqrSIMD8_AVX(const void *pVect1v, const void *pVect2v, size_t dime
     }
 
     // If the reminder is >=4, have another step of 4 doubles
-    if (residual >= 4) {
+    if constexpr (residual >= 4) {
         L2SqrStep(pVect1, pVect2, sum);
     }
 
diff --git a/src/VecSim/spaces/L2/L2_F16C_FP16.h b/src/VecSim/spaces/L2/L2_F16C_FP16.h
@@ -32,7 +32,7 @@ float FP16_L2SqrSIMD32_F16C(const void *pVect1v, const void *pVect2v, size_t dim
 
     auto sum = _mm256_setzero_ps();
 
-    if (residual % 8) {
+    if constexpr (residual % 8) {
         // Deal with remainder first. `dim` is more than 32, so we have at least one block of 32
         // 16-bit float so mask loading is guaranteed to be safe.
         __mmask16 constexpr residuals_mask = (1 << (residual % 8)) - 1;
@@ -50,12 +50,12 @@ float FP16_L2SqrSIMD32_F16C(const void *pVect1v, const void *pVect2v, size_t dim
         pVect1 += residual % 8;
         pVect2 += residual % 8;
     }
-    if (residual >= 8 && residual < 16) {
+    if constexpr (residual >= 8 && residual < 16) {
         L2SqrStep(pVect1, pVect2, sum);
-    } else if (residual >= 16 && residual < 24) {
+    } else if constexpr (residual >= 16 && residual < 24) {
         L2SqrStep(pVect1, pVect2, sum);
         L2SqrStep(pVect1, pVect2, sum);
-    } else if (residual >= 24) {
+    } else if constexpr (residual >= 24) {
         L2SqrStep(pVect1, pVect2, sum);
         L2SqrStep(pVect1, pVect2, sum);
         L2SqrStep(pVect1, pVect2, sum);
diff --git a/src/VecSim/spaces/L2/L2_SSE_FP64.h b/src/VecSim/spaces/L2/L2_SSE_FP64.h
@@ -25,7 +25,7 @@ double FP64_L2SqrSIMD8_SSE(const void *pVect1v, const void *pVect2v, size_t dime
     __m128d sum = _mm_setzero_pd();
 
     // If residual is odd, we load 1 double and set the last one to 0
-    if (residual % 2 == 1) {
+    if constexpr (residual % 2 == 1) {
         __m128d v1 = _mm_load_sd(pVect1);
         pVect1++;
         __m128d v2 = _mm_load_sd(pVect2);
@@ -35,11 +35,11 @@ double FP64_L2SqrSIMD8_SSE(const void *pVect1v, const void *pVect2v, size_t dime
     }
 
     // have another 1, 2 or 3 2-double steps according to residual
-    if (residual >= 6)
+    if constexpr (residual >= 6)
         L2SqrStep(pVect1, pVect2, sum);
-    if (residual >= 4)
+    if constexpr (residual >= 4)
         L2SqrStep(pVect1, pVect2, sum);
-    if (residual >= 2)
+    if constexpr (residual >= 2)
         L2SqrStep(pVect1, pVect2, sum);
 
     // We dealt with the residual part. We are left with some multiple of 8 doubles.

Original file line number	Diff line number	Diff line change
`@@ -108,7 +108,7 @@ float BF16_InnerProductSIMD32_AVX2(const void pVect1v, const void pVect2v, siz`
`108`	`108`	`}`
`109`	`109`
`110`	`110`	`// Do a single step if residual >=16`
`111`		`- if (residual >= 16) {`
	`111`	`+ if constexpr (residual >= 16) {`
`112`	`112`	`InnerProductStep(pVect1, pVect2, sum_prod);`
`113`	`113`	`}`
`114`	`114`
Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ float FP16_InnerProductSIMD32_AVX512(const void pVect1v, const void pVect2v, s`
`31`	`31`
`32`	`32`	`auto sum = _mm512_setzero_ps();`
`33`	`33`
`34`		`- if (residual % 16) {`
	`34`	`+ if constexpr (residual % 16) {`
`35`	`35`	// Deal with remainder first. `dim` is more than 32, so we have at least one block of 32
`36`	`36`	`// 16-bit float so mask loading is guaranteed to be safe.`
`37`	`37`	`__mmask16 constexpr residuals_mask = (1 << (residual % 16)) - 1;`
`@@ -46,7 +46,7 @@ float FP16_InnerProductSIMD32_AVX512(const void pVect1v, const void pVect2v, s`
`46`	`46`	`pVect1 += residual % 16;`
`47`	`47`	`pVect2 += residual % 16;`
`48`	`48`	`}`
`49`		`- if (residual >= 16) {`
	`49`	`+ if constexpr (residual >= 16) {`
`50`	`50`	`InnerProductStep(pVect1, pVect2, sum);`
`51`	`51`	`}`
`52`	`52`
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ float FP32_InnerProductSIMD16_AVX(const void pVect1v, const void pVect2v, size`
`26`	`26`
`27`	`27`	// Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
`28`	`28`	`// 16-float block, so mask loading is guaranteed to be safe.`
`29`		`- if (residual % 8) {`
	`29`	`+ if constexpr (residual % 8) {`
`30`	`30`	`__mmask8 constexpr mask = (1 << (residual % 8)) - 1;`
`31`	`31`	`__m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);`
`32`	`32`	`pVect1 += residual % 8;`
`@@ -36,7 +36,7 @@ float FP32_InnerProductSIMD16_AVX(const void pVect1v, const void pVect2v, size`
`36`	`36`	`}`
`37`	`37`
`38`	`38`	`// If the reminder is >=8, have another step of 8 floats`
`39`		`- if (residual >= 8) {`
	`39`	`+ if constexpr (residual >= 8) {`
`40`	`40`	`InnerProductStep(pVect1, pVect2, sum256);`
`41`	`41`	`}`
`42`	`42`
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ double FP64_InnerProductSIMD8_AVX(const void pVect1v, const void pVect2v, size`
`26`	`26`
`27`	`27`	// Deal with 1-3 doubles with mask loading, if needed. `dim` is >8, so we have at least one
`28`	`28`	`// 8-double block, so mask loading is guaranteed to be safe.`
`29`		`- if (residual % 4) {`
	`29`	`+ if constexpr (residual % 4) {`
`30`	`30`	`// _mm256_maskz_loadu_pd is not available in AVX`
`31`	`31`	`__mmask8 constexpr mask = (1 << (residual % 4)) - 1;`
`32`	`32`	`__m256d v1 = my_mm256_maskz_loadu_pd<mask>(pVect1);`
`@@ -37,7 +37,7 @@ double FP64_InnerProductSIMD8_AVX(const void pVect1v, const void pVect2v, size`
`37`	`37`	`}`
`38`	`38`
`39`	`39`	`// If the reminder is >=4, have another step of 4 doubles`
`40`		`- if (residual >= 4) {`
	`40`	`+ if constexpr (residual >= 4) {`
`41`	`41`	`InnerProductStep(pVect1, pVect2, sum256);`
`42`	`42`	`}`
`43`	`43`
Original file line number	Diff line number	Diff line change
`@@ -106,7 +106,7 @@ float BF16_L2SqrSIMD32_AVX2(const void pVect1v, const void pVect2v, size_t dim`
`106`	`106`	`}`
`107`	`107`
`108`	`108`	`// Do a single step if residual >=16`
`109`		`- if (residual >= 16) {`
	`109`	`+ if constexpr (residual >= 16) {`
`110`	`110`	`L2SqrStep(pVect1, pVect2, sum);`
`111`	`111`	`}`
`112`	`112`