|
1 | 1 | /*********************************************************************/ |
2 | | -/* Copyright 2024 The OpenBLAS Project */ |
| 2 | +/* Copyright 2024, 2025 The OpenBLAS Project */ |
3 | 3 | /* Copyright 2009, 2010 The University of Texas at Austin. */ |
4 | 4 | /* All rights reserved. */ |
5 | 5 | /* */ |
@@ -177,6 +177,49 @@ static int init_amxtile_permission() { |
177 | 177 | } |
178 | 178 | #endif |
179 | 179 |
|
| 180 | +#ifdef DYNAMIC_ARCH |
| 181 | +extern char* gotoblas_corename(void); |
| 182 | +#endif |
| 183 | + |
| 184 | +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) |
| 185 | +static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { |
| 186 | + return |
| 187 | + MNK < 262144L ? 1 |
| 188 | + : MNK < 1124864L ? MIN(ncpu, 6) |
| 189 | + : MNK < 7880599L ? MIN(ncpu, 12) |
| 190 | + : MNK < 17173512L ? MIN(ncpu, 16) |
| 191 | + : MNK < 33386248L ? MIN(ncpu, 20) |
| 192 | + : MNK < 57066625L ? MIN(ncpu, 24) |
| 193 | + : MNK < 91733851L ? MIN(ncpu, 32) |
| 194 | + : MNK < 265847707L ? MIN(ncpu, 40) |
| 195 | + : MNK < 458314011L ? MIN(ncpu, 48) |
| 196 | + : MNK < 729000000L ? MIN(ncpu, 56) |
| 197 | + : ncpu; |
| 198 | +} |
| 199 | +#endif |
| 200 | + |
| 201 | +static inline int get_gemm_optimal_nthreads(double MNK) { |
| 202 | + int ncpu = num_cpu_avail(3); |
| 203 | +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) |
| 204 | + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); |
| 205 | +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) |
| 206 | + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { |
| 207 | + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); |
| 208 | + } |
| 209 | +#endif |
| 210 | + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { |
| 211 | + return 1; |
| 212 | + } |
| 213 | + else { |
| 214 | + if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { |
| 215 | + return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); |
| 216 | + } |
| 217 | + else { |
| 218 | + return ncpu; |
| 219 | + } |
| 220 | + } |
| 221 | +} |
| 222 | + |
180 | 223 | #ifndef CBLAS |
181 | 224 |
|
182 | 225 | void NAME(char *TRANSA, char *TRANSB, |
@@ -310,7 +353,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS |
310 | 353 | FLOAT *beta = (FLOAT*) vbeta; |
311 | 354 | FLOAT *a = (FLOAT*) va; |
312 | 355 | FLOAT *b = (FLOAT*) vb; |
313 | | - FLOAT *c = (FLOAT*) vc; |
| 356 | + FLOAT *c = (FLOAT*) vc; |
314 | 357 | #endif |
315 | 358 |
|
316 | 359 | blas_arg_t args; |
@@ -352,7 +395,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS |
352 | 395 | #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) |
353 | 396 | #ifdef DYNAMIC_ARCH |
354 | 397 | if (support_avx512() ) |
355 | | -#endif |
| 398 | +#endif |
356 | 399 | if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { |
357 | 400 | SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); |
358 | 401 | return; |
@@ -604,13 +647,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS |
604 | 647 | #endif |
605 | 648 |
|
606 | 649 | MNK = (double) args.m * (double) args.n * (double) args.k; |
607 | | - if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) |
608 | | - args.nthreads = 1; |
609 | | - else { |
610 | | - args.nthreads = num_cpu_avail(3); |
611 | | - if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) |
612 | | - args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); |
613 | | - } |
| 650 | + args.nthreads = get_gemm_optimal_nthreads(MNK); |
614 | 651 |
|
615 | 652 | args.common = NULL; |
616 | 653 |
|
|
0 commit comments