@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHAI $f1
 #define X      $r7
 #define INCX   $r8
+#define DUMMY2 $r9
 
 #define I      $r12
 #define TEMP   $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     bge $r0, N, .L999
     bge $r0, INCX, .L999
+    ld.d DUMMY2, $sp, 0    // load the DUMMY2 flag from the stack
     li.d TEMP, 1
     movgr2fr.d a1, $r0
     FFINT a1, a1
@@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     bne INCX, TEMP, .L22
 
+/////// INCX == 1 ////////
 .L11:
-    bge $r0, I, .L997
     CMPEQ $fcc0, ALPHAR, a1
     CMPEQ $fcc1, ALPHAI, a1
-    bceqz $fcc0, .L13
-    b .L14
-    .align 3
+    bge $r0, I, .L19
+    /////// INCX == 1 && N >= 4 ////////
+    bnez DUMMY2, .L17    // if DUMMY2 == 1, called from c/zscal.
 
-.L13:
-    bceqz $fcc1, .L114    //alpha_r != 0.0 && alpha_i != 0.0
-    b .L113    //alpha_r != 0.0 && alpha_i == 0.0
+    bceqz $fcc0, .L17
 
-.L14:
-    bceqz $fcc1, .L114    //alpha_r == 0.0 && alpha_i != 0.0
-    b .L111    //alpha_r == 0.0 && alpha_i == 0.0
-    .align 3
+    bceqz $fcc1, .L17
 
-.L111:    //alpha_r == 0.0 && alpha_i == 0.0
+.L15:    //alpha_r == 0.0 && alpha_i == 0.0
     xvst VXZ, X, 0 * SIZE
 #ifdef DOUBLE
     xvst VXZ, X, 4 * SIZE
@@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d X, X, 16 * SIZE
 #endif
     addi.d I, I, -1
-    blt $r0, I, .L111
-    b .L997
+    blt $r0, I, .L15
+    b .L19
     .align 3
 
-.L113:    //alpha_r != 0.0 && alpha_i == 0.0
-    xvld VX0, X, 0 * SIZE
-#ifdef DOUBLE
-    xvld VX1, X, 4 * SIZE
-    xvpickev.d x1, VX1, VX0
-    xvpickod.d x2, VX1, VX0
-    xvfmul.d x3, VXAR, x1
-    xvfmul.d x4, VXAR, x2
-    xvilvl.d VX2, x4, x3
-    xvilvh.d VX3, x4, x3
-    xvst VX2, X, 0 * SIZE
-    xvst VX3, X, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-#else
-    xvld VX1, X, 8 * SIZE
-    xvpickev.w x1, VX1, VX0
-    xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, VXAR, x1
-    xvfmul.s x4, VXAR, x2
-    xvilvl.w VX2, x4, x3
-    xvilvh.w VX3, x4, x3
-    xvst VX2, X, 0 * SIZE
-    xvst VX3, X, 8 * SIZE
-    addi.d X, X, 16 * SIZE
-#endif
-    addi.d I, I, -1
-    blt $r0, I, .L113
-    b .L997
-    .align 3
-
-.L114:    //alpha_r != 0.0 && alpha_i != 0.0
+.L17:
     xvld VX0, X, 0 * SIZE
 #ifdef DOUBLE
     xvld VX1, X, 4 * SIZE
@@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d X, X, 16 * SIZE
 #endif
     addi.d I, I, -1
-    blt $r0, I, .L114
-    b .L997
+    blt $r0, I, .L17
+    b .L19
+    .align 3
+
+/////// INCX == 1 && N < 8 ///////
+.L19:
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    beqz I, .L999
+    bnez DUMMY2, .L998    // if DUMMY2 == 1, called from c/zscal.
+
+    bceqz $fcc0, .L998
+
+    bceqz $fcc1, .L998
+
+    b .L995    // alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
+/////// INCX != 1 ////////
 .L22:
-    bge $r0, I, .L997
-    move XX, X
     CMPEQ $fcc0, ALPHAR, a1
     CMPEQ $fcc1, ALPHAI, a1
-    bceqz $fcc0, .L23
-    b .L24
-    .align 3
-
-.L23:
-    bceqz $fcc1, .L224    //alpha_r != 0.0 && alpha_i != 0.0
-    b .L223    //alpha_r != 0.0 && alpha_i == 0.0
+    move XX, X
+    bge $r0, I, .L29
+    bnez DUMMY2, .L25    // if DUMMY2 == 1, called from c/zscal.
+    bceqz $fcc0, .L25
 
-.L24:
-    bceqz $fcc1, .L224    //alpha_r == 0.0 && alpha_i != 0.0
-    b .L221    //alpha_r == 0.0 && alpha_i == 0.0
-    .align 3
+    bceqz $fcc1, .L25
 
-.L221:    //alpha_r == 0.0 && alpha_i == 0.0
+.L27:    //alpha_r == 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
     xvstelm.d VXZ, X, 0, 0
     xvstelm.d VXZ, X, 1 * SIZE, 0
@@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     add.d X, X, INCX
     addi.d I, I, -1
-    blt $r0, I, .L221
-    b .L997
+    blt $r0, I, .L27
+    b .L29
     .align 3
 
-.L223:    //alpha_r != 0.0 && alpha_i == 0.0
-#ifdef DOUBLE
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d x1, t1, 0
-    xvinsgr2vr.d x2, t2, 0
-    xvinsgr2vr.d x1, t3, 1
-    xvinsgr2vr.d x2, t4, 1
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    xvinsgr2vr.d x1, t1, 2
-    xvinsgr2vr.d x2, t2, 2
-    xvinsgr2vr.d x1, t3, 3
-    xvinsgr2vr.d x2, t4, 3
-    add.d X, X, INCX
-
-    xvfmul.d x3, VXAR, x1
-    xvfmul.d x4, VXAR, x2
-    addi.d I, I, -1
-    xvstelm.d x3, XX, 0 * SIZE, 0
-    xvstelm.d x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    xvstelm.d x3, XX, 0 * SIZE, 1
-    xvstelm.d x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-    xvstelm.d x3, XX, 0 * SIZE, 2
-    xvstelm.d x4, XX, 1 * SIZE, 2
-    add.d XX, XX, INCX
-    xvstelm.d x3, XX, 0 * SIZE, 3
-    xvstelm.d x4, XX, 1 * SIZE, 3
-#else
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w x1, t1, 0
-    xvinsgr2vr.w x2, t2, 0
-    xvinsgr2vr.w x1, t3, 1
-    xvinsgr2vr.w x2, t4, 1
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    xvinsgr2vr.w x1, t1, 2
-    xvinsgr2vr.w x2, t2, 2
-    xvinsgr2vr.w x1, t3, 3
-    xvinsgr2vr.w x2, t4, 3
-    add.d X, X, INCX
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w x1, t1, 4
-    xvinsgr2vr.w x2, t2, 4
-    xvinsgr2vr.w x1, t3, 5
-    xvinsgr2vr.w x2, t4, 5
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    xvinsgr2vr.w x1, t1, 6
-    xvinsgr2vr.w x2, t2, 6
-    xvinsgr2vr.w x1, t3, 7
-    xvinsgr2vr.w x2, t4, 7
-    add.d X, X, INCX
-
-    xvfmul.s x3, VXAR, x1
-    xvfmul.s x4, VXAR, x2
-    addi.d I, I, -1
-    xvstelm.w x3, XX, 0 * SIZE, 0
-    xvstelm.w x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 1
-    xvstelm.w x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 2
-    xvstelm.w x4, XX, 1 * SIZE, 2
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 3
-    xvstelm.w x4, XX, 1 * SIZE, 3
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 4
-    xvstelm.w x4, XX, 1 * SIZE, 4
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 5
-    xvstelm.w x4, XX, 1 * SIZE, 5
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 6
-    xvstelm.w x4, XX, 1 * SIZE, 6
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 7
-    xvstelm.w x4, XX, 1 * SIZE, 7
-#endif
-    add.d XX, XX, INCX
-    blt $r0, I, .L223
-    b .L997
-    .align 3
-
-.L224:    //alpha_r != 0.0 && alpha_i != 0.0
+.L25:
 #ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     ld.d t2, X, 1 * SIZE
@@ -465,19 +331,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvstelm.w x4, XX, 1 * SIZE, 7
 #endif
     add.d XX, XX, INCX
-    blt $r0, I, .L224
-    b .L997
+    blt $r0, I, .L25
+    b .L29
     .align 3
 
-.L997:
+/////// INCX != 1 && N < 8 ///////
+.L29:
 #ifdef DOUBLE
-    andi I, N, 3
+    andi I, N, 3
 #else
-    andi I, N, 7
+    andi I, N, 7
 #endif
-    bge $r0, I, .L999
-    .align 3
+    beqz I, .L999
+    bnez DUMMY2, .L998    // if DUMMY2 == 1, called from c/zscal.
 
+    bceqz $fcc0, .L998
+
+    bceqz $fcc1, .L998
+
+.L995:    // alpha_r == 0.0 && alpha_i == 0.0
+    ST a1, X, 0 * SIZE
+    ST a1, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L995
+    b .L999
 .L998:
     LD a1, X, 0 * SIZE
     LD a2, X, 1 * SIZE
@@ -490,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ST s2, X, 1 * SIZE
     add.d X, X, INCX
     blt $r0, I, .L998
-    .align 3
+    b .L999
 
 .L999:
     move $r4, $r12
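
For reference, a minimal C sketch of the dispatch the reworked kernel performs. The function name, prototype, and `dummy2` parameter are illustrative assumptions, not the kernel's real C interface; `dummy2` stands in for the DUMMY2 flag loaded from the stack, which the assembly uses to skip the zero-fill shortcut when the kernel is entered via c/zscal.

```c
#include <stdio.h>
#include <stddef.h>

/* Illustrative scalar model only; FLOAT and the argument list are assumptions. */
typedef double FLOAT;

static void zscal_sketch(size_t n, FLOAT alpha_r, FLOAT alpha_i,
                         FLOAT *x, size_t inc_x, int dummy2)
{
    if (n == 0 || inc_x == 0)
        return;

    /* .L15 / .L27 / .L995 paths: plain zero fill, taken only when the kernel
     * was NOT entered through c/zscal (dummy2 == 0) and both alpha parts are 0. */
    if (!dummy2 && alpha_r == 0.0 && alpha_i == 0.0) {
        for (size_t i = 0; i < n; i++, x += 2 * inc_x)
            x[0] = x[1] = 0.0;
        return;
    }

    /* .L17 / .L25 / .L998 paths: full complex multiply, also used when
     * dummy2 != 0, presumably so special values in x still propagate. */
    for (size_t i = 0; i < n; i++, x += 2 * inc_x) {
        FLOAT re = x[0], im = x[1];
        x[0] = alpha_r * re - alpha_i * im;
        x[1] = alpha_r * im + alpha_i * re;
    }
}

int main(void)
{
    FLOAT x[4] = { 1.0, 2.0, 3.0, 4.0 };              /* two complex elements */
    zscal_sketch(2, 0.0, 1.0, x, 1, 0);               /* multiply by i */
    printf("%g %g %g %g\n", x[0], x[1], x[2], x[3]);  /* -2 1 -4 3 */
    return 0;
}
```

The sketch ignores the vector blocking: the assembly's main xvld/xvst loops process 4 (DOUBLE) or 8 (single-precision) complex elements per iteration, with the .L19/.L29 blocks handling the remainder selected by `andi I, N, 3` / `andi I, N, 7`.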