@@ -157,19 +157,35 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void
157157
158158 __m512 sum512 = _mm512_set1_ps (0 );
159159
160- while (pVect1 < pEnd1) {
161- // _mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
162-
160+ size_t loop = qty16 / 4 ;
161+ for ( int i = 0 ; i < loop; i++) {
163162 __m512 v1 = _mm512_loadu_ps (pVect1);
164- pVect1 += 16 ;
165163 __m512 v2 = _mm512_loadu_ps (pVect2);
164+ pVect1 += 16 ;
165+ pVect2 += 16 ;
166+
167+ __m512 v3 = _mm512_loadu_ps (pVect1);
168+ __m512 v4 = _mm512_loadu_ps (pVect2);
169+ pVect1 += 16 ;
170+ pVect2 += 16 ;
171+
172+ __m512 v5 = _mm512_loadu_ps (pVect1);
173+ __m512 v6 = _mm512_loadu_ps (pVect2);
174+ pVect1 += 16 ;
166175 pVect2 += 16 ;
167- sum512 = _mm512_add_ps (sum512, _mm512_mul_ps (v1, v2));
168- }
169176
170- _mm512_store_ps (TmpRes, sum512);
171- float sum = TmpRes[0 ] + TmpRes[1 ] + TmpRes[2 ] + TmpRes[3 ] + TmpRes[4 ] + TmpRes[5 ] + TmpRes[6 ] + TmpRes[7 ] + TmpRes[8 ] + TmpRes[9 ] + TmpRes[10 ] + TmpRes[11 ] + TmpRes[12 ] + TmpRes[13 ] + TmpRes[14 ] + TmpRes[15 ];
177+ __m512 v7 = _mm512_loadu_ps (pVect1);
178+ __m512 v8 = _mm512_loadu_ps (pVect2);
179+ pVect1 += 16 ;
180+ pVect2 += 16 ;
181+
182+ sum512 = _mm512_fmadd_ps (v1, v2, sum512);
183+ sum512 = _mm512_fmadd_ps (v3, v4, sum512);
184+ sum512 = _mm512_fmadd_ps (v5, v6, sum512);
185+ sum512 = _mm512_fmadd_ps (v7, v8, sum512);
186+ }
172187
188+ float sum = _mm512_reduce_add_ps (sum512);
173189 return sum;
174190}
175191
0 commit comments