@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3333#define ALPHAI $f1
3434#define X $r7
3535#define INCX $r8
36+ #define DUMMY2 $r9
3637
3738#define I $r12
3839#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6566
6667 bge $r0, N, .L999
6768 bge $r0, INCX, .L999
69+ ld.d DUMMY2, $sp, 0
6870 li.d TEMP, 1
6971 movgr2fr.d a1, $r0
7072 FFINT a1, a1
@@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8486 srai.d I, N, 2
8587 bne INCX, TEMP, .L22
8688
89+ /////// INCX == 1 ////////
8790.L11:
88- bge $r0, I, .L997
8991 CMPEQ $fcc0, ALPHAR, a1
9092 CMPEQ $fcc1, ALPHAI, a1
91- bceqz $fcc0, .L13
92- b .L14
93- .align 3
93+ bge $r0, I, .L19
9494
95- .L13:
96- bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
97- b .L113 //alpha_r != 0.0 && alpha_i == 0.0
95+ /////// INCX == 1 && N >= 4 ////////
96+ bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
9897
99- .L14:
100- bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
101- b .L111 //alpha_r == 0.0 && alpha_i == 0.0
102- .align 3
98+ bceqz $fcc0, .L17
10399
104- .L111: //alpha_r == 0.0 && alpha_i == 0.0
100+ bceqz $fcc1, .L17
101+
102+ .L15: //alpha_r == 0.0 && alpha_i == 0.0
105103 vst VXZ, X, 0 * SIZE
106104#ifdef DOUBLE
107105 vst VXZ, X, 2 * SIZE
@@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
112110#endif
113111 addi.d X, X, 8 * SIZE
114112 addi.d I, I, -1
115- blt $r0, I, .L111
116- b .L997
117- .align 3
118-
119- .L113: //alpha_r != 0.0 && alpha_i == 0.0
120- vld VX0, X, 0 * SIZE
121- #ifdef DOUBLE
122- vld VX1, X, 2 * SIZE
123- vpickev.d x1, VX1, VX0
124- vpickod.d x2, VX1, VX0
125- vfmul.d x3, VXAR, x1
126- vfmul.d x4, VXAR, x2
127- vilvl.d VX2, x4 ,x3
128- vilvh.d VX3, x4, x3
129- vst VX2, X, 0 * SIZE
130- vst VX3, X, 2 * SIZE
131- vld VX0, X, 4 * SIZE
132- vld VX1, X, 6 * SIZE
133- vpickev.d x1, VX1, VX0
134- vpickod.d x2, VX1, VX0
135- vfmul.d x3, VXAR, x1
136- vfmul.d x4, VXAR, x2
137- vilvl.d VX2, x4 ,x3
138- vilvh.d VX3, x4, x3
139- vst VX2, X, 4 * SIZE
140- vst VX3, X, 6 * SIZE
141- #else
142- vld VX1, X, 4 * SIZE
143- vpickev.w x1, VX1, VX0
144- vpickod.w x2, VX1, VX0
145- vfmul.s x3, VXAR, x1
146- vfmul.s x4, VXAR, x2
147- vilvl.w VX2, x4 ,x3
148- vilvh.w VX3, x4, x3
149- vst VX2, X, 0 * SIZE
150- vst VX3, X, 4 * SIZE
151- #endif
152- addi.d X, X, 8 * SIZE
153- addi.d I, I, -1
154- blt $r0, I, .L113
155- b .L997
113+ blt $r0, I, .L15
114+ b .L19
156115 .align 3
157116
158- .L114: //alpha_r != 0.0 && alpha_i != 0.0
117+ .L17:
159118 vld VX0, X, 0 * SIZE
160119#ifdef DOUBLE
161120 vld VX1, X, 2 * SIZE
@@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
196155#endif
197156 addi.d X, X, 8 * SIZE
198157 addi.d I, I, -1
199- blt $r0, I, .L114
200- b .L997
158+ blt $r0, I, .L17
159+ b .L19
201160 .align 3
202161
162+ /////// INCX == 1 && N < 4 ///////
163+ .L19:
164+ andi I, N, 3
165+ beqz I, .L999
166+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
167+
168+ bceqz $fcc0, .L998
169+
170+ bceqz $fcc1, .L998
171+
172+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
173+
174+ /////// INCX != 1 ////////
203175.L22:
204- bge $r0, I, .L997
205- move XX, X
206176 CMPEQ $fcc0, ALPHAR, a1
207177 CMPEQ $fcc1, ALPHAI, a1
208- bceqz $fcc0, .L23
209- b .L24
210- .align 3
178+ move XX, X
179+ bge $r0, I, .L29
180+ bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
211181
212- .L23:
213- bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
214- b .L223 //alpha_r != 0.0 && alpha_i == 0.0
182+ bceqz $fcc0, .L25
215183
216- .L24:
217- bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
218- b .L221 //alpha_r == 0.0 && alpha_i == 0.0
219- .align 3
184+ bceqz $fcc1, .L25
220185
221- .L221 : //alpha_r == 0.0 && alpha_i == 0.0
186+ .L27 : //alpha_r == 0.0 && alpha_i == 0.0
222187#ifdef DOUBLE
223188 vstelm.d VXZ, X, 0 , 0
224189 vstelm.d VXZ, X, 1 * SIZE, 0
@@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
246211#endif
247212 add .d X, X, INCX
248213 addi.d I, I, -1
249- blt $r0, I, .L221
250- b .L997
214+ blt $r0, I, .L27
215+ b .L29
251216 .align 3
252217
253- .L223: //alpha_r != 0.0 && alpha_i == 0.0
254- #ifdef DOUBLE
255- ld.d t1, X, 0 * SIZE
256- ld.d t2, X, 1 * SIZE
257- add .d X, X, INCX
258- ld.d t3, X, 0 * SIZE
259- ld.d t4, X, 1 * SIZE
260- add .d X, X, INCX
261- vinsgr2vr.d x1, t1, 0
262- vinsgr2vr.d x2, t2, 0
263- vinsgr2vr.d x1, t3, 1
264- vinsgr2vr.d x2, t4, 1
265- vfmul.d x3, VXAR, x1
266- vfmul.d x4, VXAR, x2
267- vstelm.d x3, XX, 0 * SIZE, 0
268- vstelm.d x4, XX, 1 * SIZE, 0
269- add .d XX, XX, INCX
270- vstelm.d x3, XX, 0 * SIZE, 1
271- vstelm.d x4, XX, 1 * SIZE, 1
272- add .d XX, XX, INCX
273-
274- ld.d t1, X, 0 * SIZE
275- ld.d t2, X, 1 * SIZE
276- add .d X, X, INCX
277- ld.d t3, X, 0 * SIZE
278- ld.d t4, X, 1 * SIZE
279- vinsgr2vr.d x1, t1, 0
280- vinsgr2vr.d x2, t2, 0
281- vinsgr2vr.d x1, t3, 1
282- vinsgr2vr.d x2, t4, 1
283- add .d X, X, INCX
284- vfmul.d x3, VXAR, x1
285- vfmul.d x4, VXAR, x2
286- addi.d I, I, -1
287- vstelm.d x3, XX, 0 * SIZE, 0
288- vstelm.d x4, XX, 1 * SIZE, 0
289- add .d XX, XX, INCX
290- vstelm.d x3, XX, 0 * SIZE, 1
291- vstelm.d x4, XX, 1 * SIZE, 1
292- #else
293- ld.w t1, X, 0 * SIZE
294- ld.w t2, X, 1 * SIZE
295- add .d X, X, INCX
296- ld.w t3, X, 0 * SIZE
297- ld.w t4, X, 1 * SIZE
298- add .d X, X, INCX
299- vinsgr2vr.w x1, t1, 0
300- vinsgr2vr.w x2, t2, 0
301- vinsgr2vr.w x1, t3, 1
302- vinsgr2vr.w x2, t4, 1
303- ld.w t1, X, 0 * SIZE
304- ld.w t2, X, 1 * SIZE
305- add .d X, X, INCX
306- ld.w t3, X, 0 * SIZE
307- ld.w t4, X, 1 * SIZE
308- vinsgr2vr.w x1, t1, 2
309- vinsgr2vr.w x2, t2, 2
310- vinsgr2vr.w x1, t3, 3
311- vinsgr2vr.w x2, t4, 3
312- add .d X, X, INCX
313-
314- vfmul.s x3, VXAR, x1
315- vfmul.s x4, VXAR, x2
316- addi.d I, I, -1
317- vstelm.w x3, XX, 0 * SIZE, 0
318- vstelm.w x4, XX, 1 * SIZE, 0
319- add .d XX, XX, INCX
320- vstelm.w x3, XX, 0 * SIZE, 1
321- vstelm.w x4, XX, 1 * SIZE, 1
322- add .d XX, XX, INCX
323- vstelm.w x3, XX, 0 * SIZE, 2
324- vstelm.w x4, XX, 1 * SIZE, 2
325- add .d XX, XX, INCX
326- vstelm.w x3, XX, 0 * SIZE, 3
327- vstelm.w x4, XX, 1 * SIZE, 3
328- #endif
329- add .d XX, XX, INCX
330- blt $r0, I, .L223
331- b .L997
332- .align 3
333-
334- .L224: //alpha_r != 0.0 && alpha_i != 0.0
218+ .L25:
335219#ifdef DOUBLE
336220 ld.d t1, X, 0 * SIZE
337221 ld.d t2, X, 1 * SIZE
@@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
414298 vstelm.w x4, XX, 1 * SIZE, 3
415299#endif
416300 add .d XX, XX, INCX
417- blt $r0, I, .L224
418- b .L997
301+ blt $r0, I, .L25
302+ b .L29
419303 .align 3
420304
421- .L997:
422- andi I, N, 3
423- bge $r0, I, .L999
424- .align 3
305+ /////// INCX != 1 && N < 4 ///////
306+ .L29:
307+ andi I, N, 3
308+ beqz I, .L999
309+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
310+
311+ bceqz $fcc0, .L998
312+
313+ bceqz $fcc1, .L998
425314
315+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
316+
317+ .L995: // alpha_r == 0.0 && alpha_i == 0.0
318+ ST a1, X, 0 * SIZE
319+ ST a1, X, 1 * SIZE
320+ addi.d I, I, -1
321+ add.d X, X, INCX
322+ blt $r0, I, .L995
323+ b .L999
426324.L998:
427325 LD a1, X, 0 * SIZE
428326 LD a2, X, 1 * SIZE
@@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
435333 ST s2, X, 1 * SIZE
436334 add .d X, X, INCX
437335 blt $r0, I, .L998
438- .align 3
336+ b .L999
439337
440338.L999:
441339 move $r4, $r12
0 commit comments