@@ -157,19 +157,44 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void
157157
158158 __m512 sum512 = _mm512_set1_ps (0 );
159159
160- while (pVect1 < pEnd1) {
161- // _mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
162-
160+ size_t loop = qty16 / 4 ;
161+
162+ while (loop--) {
163163 __m512 v1 = _mm512_loadu_ps (pVect1);
164- pVect1 += 16 ;
165164 __m512 v2 = _mm512_loadu_ps (pVect2);
165+ pVect1 += 16 ;
166+ pVect2 += 16 ;
167+
168+ __m512 v3 = _mm512_loadu_ps (pVect1);
169+ __m512 v4 = _mm512_loadu_ps (pVect2);
170+ pVect1 += 16 ;
171+ pVect2 += 16 ;
172+
173+ __m512 v5 = _mm512_loadu_ps (pVect1);
174+ __m512 v6 = _mm512_loadu_ps (pVect2);
175+ pVect1 += 16 ;
166176 pVect2 += 16 ;
167- sum512 = _mm512_add_ps (sum512, _mm512_mul_ps (v1, v2));
177+
178+ __m512 v7 = _mm512_loadu_ps (pVect1);
179+ __m512 v8 = _mm512_loadu_ps (pVect2);
180+ pVect1 += 16 ;
181+ pVect2 += 16 ;
182+
183+ sum512 = _mm512_fmadd_ps (v1, v2, sum512);
184+ sum512 = _mm512_fmadd_ps (v3, v4, sum512);
185+ sum512 = _mm512_fmadd_ps (v5, v6, sum512);
186+ sum512 = _mm512_fmadd_ps (v7, v8, sum512);
168187 }
169188
170- _mm512_store_ps (TmpRes, sum512);
171- float sum = TmpRes[0 ] + TmpRes[1 ] + TmpRes[2 ] + TmpRes[3 ] + TmpRes[4 ] + TmpRes[5 ] + TmpRes[6 ] + TmpRes[7 ] + TmpRes[8 ] + TmpRes[9 ] + TmpRes[10 ] + TmpRes[11 ] + TmpRes[12 ] + TmpRes[13 ] + TmpRes[14 ] + TmpRes[15 ];
189+ while (pVect1 < pEnd1) {
190+ __m512 v1 = _mm512_loadu_ps (pVect1);
191+ __m512 v2 = _mm512_loadu_ps (pVect2);
192+ pVect1 += 16 ;
193+ pVect2 += 16 ;
194+ sum512 = _mm512_fmadd_ps (v1, v2, sum512);
195+ }
172196
197+ float sum = _mm512_reduce_add_ps (sum512);
173198 return sum;
174199}
175200
0 commit comments