Skip to content

Commit 9291020

Browse files
committed
InnerProductSIMD16ExtAVX512: efficient AVX512 implementation that also handles a vector whose size is not divisible by 4
1 parent 74f14d2 commit 9291020

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

hnswlib/space_ip.h

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -158,7 +158,8 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void
158158
__m512 sum512 = _mm512_set1_ps(0);
159159

160160
size_t loop = qty16 / 4;
161-
for( int i = 0; i < loop; i++) {
161+
162+
while (loop--) {
162163
__m512 v1 = _mm512_loadu_ps(pVect1);
163164
__m512 v2 = _mm512_loadu_ps(pVect2);
164165
pVect1 += 16;
@@ -185,6 +186,14 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void
185186
sum512 = _mm512_fmadd_ps(v7, v8, sum512);
186187
}
187188

189+
while (pVect1 < pEnd1) {
190+
__m512 v1 = _mm512_loadu_ps(pVect1);
191+
__m512 v2 = _mm512_loadu_ps(pVect2);
192+
pVect1 += 16;
193+
pVect2 += 16;
194+
sum512 = _mm512_fmadd_ps(v1, v2, sum512);
195+
}
196+
188197
float sum = _mm512_reduce_add_ps(sum512);
189198
return sum;
190199
}

0 commit comments

Comments
 (0)