Skip to content

Commit f30b6e1

Browse files
authored
Merge pull request #475 from aurora327/efficient_avx512_instruction
Efficient AVX512 implementation in 'InnerProductSIMD16ExtAVX512' Function
2 parents 0df757e + 9291020 commit f30b6e1

File tree

1 file changed

+32
-7
lines changed

1 file changed

+32
-7
lines changed

hnswlib/space_ip.h

Lines changed: 32 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -157,19 +157,44 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void
157157

158158
__m512 sum512 = _mm512_set1_ps(0);
159159

160-
while (pVect1 < pEnd1) {
161-
//_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
162-
160+
size_t loop = qty16 / 4;
161+
162+
while (loop--) {
163163
__m512 v1 = _mm512_loadu_ps(pVect1);
164-
pVect1 += 16;
165164
__m512 v2 = _mm512_loadu_ps(pVect2);
165+
pVect1 += 16;
166+
pVect2 += 16;
167+
168+
__m512 v3 = _mm512_loadu_ps(pVect1);
169+
__m512 v4 = _mm512_loadu_ps(pVect2);
170+
pVect1 += 16;
171+
pVect2 += 16;
172+
173+
__m512 v5 = _mm512_loadu_ps(pVect1);
174+
__m512 v6 = _mm512_loadu_ps(pVect2);
175+
pVect1 += 16;
166176
pVect2 += 16;
167-
sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
177+
178+
__m512 v7 = _mm512_loadu_ps(pVect1);
179+
__m512 v8 = _mm512_loadu_ps(pVect2);
180+
pVect1 += 16;
181+
pVect2 += 16;
182+
183+
sum512 = _mm512_fmadd_ps(v1, v2, sum512);
184+
sum512 = _mm512_fmadd_ps(v3, v4, sum512);
185+
sum512 = _mm512_fmadd_ps(v5, v6, sum512);
186+
sum512 = _mm512_fmadd_ps(v7, v8, sum512);
168187
}
169188

170-
_mm512_store_ps(TmpRes, sum512);
171-
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
189+
while (pVect1 < pEnd1) {
190+
__m512 v1 = _mm512_loadu_ps(pVect1);
191+
__m512 v2 = _mm512_loadu_ps(pVect2);
192+
pVect1 += 16;
193+
pVect2 += 16;
194+
sum512 = _mm512_fmadd_ps(v1, v2, sum512);
195+
}
172196

197+
float sum = _mm512_reduce_add_ps(sum512);
173198
return sum;
174199
}
175200

0 commit comments

Comments
 (0)