@@ -52,17 +52,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
5252 BLASLONG lda , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y ,
5353 FLOAT * buffer )
5454{
55- BLASLONG i ;
56- BLASLONG ix , iy ;
57- BLASLONG j ;
58- FLOAT * a_ptr ;
55+ BLASLONG i , j ;
56+ BLASLONG ix = 0 ;
57+ BLASLONG iy ;
58+ FLOAT * a_ptr = a ;
5959 FLOAT temp ;
6060
61- ix = 0 ;
62- a_ptr = a ;
63-
6461 if (inc_y == 1 ) {
65- BLASLONG width = (n + 3 - 1 ) / 3 ;
62+ BLASLONG width = n / 3 ; // Only process full 3-column blocks
63+ BLASLONG sve_size = SV_COUNT ();
64+ svbool_t pg_full = SV_TRUE ();
65+ svbool_t pg_tail = SV_WHILE (0 , m % sve_size );
6666
6767 FLOAT * a0_ptr = a_ptr + lda * width * 0 ;
6868 FLOAT * a1_ptr = a_ptr + lda * width * 1 ;
@@ -73,57 +73,75 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
7373 FLOAT * x2_ptr = x + inc_x * width * 2 ;
7474
7575 for (j = 0 ; j < width ; j ++ ) {
76- svbool_t pg00 = (( j + width * 0 ) < n ) ? SV_TRUE () : svpfalse ( );
77- svbool_t pg01 = (( j + width * 1 ) < n ) ? SV_TRUE () : svpfalse ( );
78- svbool_t pg02 = (( j + width * 2 ) < n ) ? SV_TRUE () : svpfalse ( );
76+ SV_TYPE temp0_vec = SV_DUP ( alpha * x0_ptr [ ix ] );
77+ SV_TYPE temp1_vec = SV_DUP ( alpha * x1_ptr [ ix ] );
78+ SV_TYPE temp2_vec = SV_DUP ( alpha * x2_ptr [ ix ] );
7979
80- SV_TYPE temp0_vec = ((j + width * 0 ) < n ) ? SV_DUP (alpha * x0_ptr [ix ]) : SV_DUP (0.0 );
81- SV_TYPE temp1_vec = ((j + width * 1 ) < n ) ? SV_DUP (alpha * x1_ptr [ix ]) : SV_DUP (0.0 );
82- SV_TYPE temp2_vec = ((j + width * 2 ) < n ) ? SV_DUP (alpha * x2_ptr [ix ]) : SV_DUP (0.0 );
8380 i = 0 ;
84- BLASLONG sve_size = SV_COUNT ();
85- while ((i + sve_size * 1 - 1 ) < m ) {
86- SV_TYPE y0_vec = svld1_vnum (SV_TRUE (), y + i , 0 );
81+ while ((i + sve_size - 1 ) < m ) {
82+ SV_TYPE y0_vec = svld1 (pg_full , y + i );
8783
88- SV_TYPE a00_vec = svld1_vnum ( pg00 , a0_ptr + i , 0 );
89- SV_TYPE a01_vec = svld1_vnum ( pg01 , a1_ptr + i , 0 );
90- SV_TYPE a02_vec = svld1_vnum ( pg02 , a2_ptr + i , 0 );
84+ SV_TYPE a00_vec = svld1 ( pg_full , a0_ptr + i );
85+ SV_TYPE a01_vec = svld1 ( pg_full , a1_ptr + i );
86+ SV_TYPE a02_vec = svld1 ( pg_full , a2_ptr + i );
9187
92- y0_vec = svmla_m ( pg00 , y0_vec , temp0_vec , a00_vec );
93- y0_vec = svmla_m ( pg01 , y0_vec , temp1_vec , a01_vec );
94- y0_vec = svmla_m ( pg02 , y0_vec , temp2_vec , a02_vec );
88+ y0_vec = svmla_x ( pg_full , y0_vec , temp0_vec , a00_vec );
89+ y0_vec = svmla_x ( pg_full , y0_vec , temp1_vec , a01_vec );
90+ y0_vec = svmla_x ( pg_full , y0_vec , temp2_vec , a02_vec );
9591
96- svst1_vnum ( SV_TRUE () , y + i , 0 , y0_vec );
97- i += sve_size * 1 ;
92+ svst1 ( pg_full , y + i , y0_vec );
93+ i += sve_size ;
9894 }
9995
10096 if (i < m ) {
101- svbool_t pg0 = SV_WHILE (i + sve_size * 0 , m );
102-
103- pg00 = svand_z (SV_TRUE (), pg0 , pg00 );
104- pg01 = svand_z (SV_TRUE (), pg0 , pg01 );
105- pg02 = svand_z (SV_TRUE (), pg0 , pg02 );
97+ SV_TYPE y0_vec = svld1 (pg_tail , y + i );
10698
107- SV_TYPE y0_vec = svld1_vnum (pg0 , y + i , 0 );
99+ SV_TYPE a00_vec = svld1 (pg_tail , a0_ptr + i );
100+ SV_TYPE a01_vec = svld1 (pg_tail , a1_ptr + i );
101+ SV_TYPE a02_vec = svld1 (pg_tail , a2_ptr + i );
108102
109- SV_TYPE a00_vec = svld1_vnum ( pg00 , a0_ptr + i , 0 );
110- SV_TYPE a01_vec = svld1_vnum ( pg01 , a1_ptr + i , 0 );
111- SV_TYPE a02_vec = svld1_vnum ( pg02 , a2_ptr + i , 0 );
103+ y0_vec = svmla_m ( pg_tail , y0_vec , temp0_vec , a00_vec );
104+ y0_vec = svmla_m ( pg_tail , y0_vec , temp1_vec , a01_vec );
105+ y0_vec = svmla_m ( pg_tail , y0_vec , temp2_vec , a02_vec );
112106
113- y0_vec = svmla_m (pg00 , y0_vec , temp0_vec , a00_vec );
114- y0_vec = svmla_m (pg01 , y0_vec , temp1_vec , a01_vec );
115- y0_vec = svmla_m (pg02 , y0_vec , temp2_vec , a02_vec );
116-
117- svst1_vnum (pg0 , y + i , 0 , y0_vec );
107+ svst1 (pg_tail , y + i , y0_vec );
118108 }
119109 a0_ptr += lda ;
120110 a1_ptr += lda ;
121111 a2_ptr += lda ;
122112 ix += inc_x ;
123113 }
114+ // Handle remaining n % 3 columns
115+ for (j = width * 3 ; j < n ; j ++ ) {
116+ FLOAT * a_col = a + j * lda ;
117+ temp = alpha * x [j * inc_x ];
118+ SV_TYPE temp_vec = SV_DUP (temp );
119+
120+ i = 0 ;
121+ while ((i + sve_size - 1 ) < m ) {
122+ SV_TYPE y_vec = svld1 (pg_full , y + i );
123+
124+ SV_TYPE a_vec = svld1 (pg_full , a_col + i );
125+
126+ y_vec = svmla_x (pg_full , y_vec , temp_vec , a_vec );
127+
128+ svst1 (pg_full , y + i , y_vec );
129+ i += sve_size ;
130+ }
131+ if (i < m ) {
132+ SV_TYPE y_vec = svld1 (pg_tail , y + i );
133+
134+ SV_TYPE a_vec = svld1 (pg_tail , a_col + i );
135+
136+ y_vec = svmla_m (pg_tail , y_vec , temp_vec , a_vec );
137+
138+ svst1 (pg_tail , y + i , y_vec );
139+ }
140+ }
124141 return (0 );
125142 }
126143
144+ // Fallback scalar loop
127145 for (j = 0 ; j < n ; j ++ ) {
128146 temp = alpha * x [ix ];
129147 iy = 0 ;
0 commit comments