Skip to content

Commit f3c78c9

Browse files
almayne and Mousius committed
Add interleaving to sgemm and dgemm. Disentangle trmm and symm from gemm.
Co-authored-by: Chris Sidebottom <chris.sidebottom@arm.com>
1 parent 5aff62e commit f3c78c9

18 files changed: +4285 / -46 lines changed

common_c.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@
107107
#define CTRMM_ILNNCOPY ctrmm_ilnncopy
108108
#define CTRMM_ILTUCOPY ctrmm_iltucopy
109109
#define CTRMM_ILTNCOPY ctrmm_iltncopy
110+
#define CCOMM_NCOPY ccomm_ncopy
111+
#define CCOMM_TCOPY ccomm_tcopy
110112

111113
#define CTRSM_IUNUCOPY ctrsm_iunucopy
112114
#define CTRSM_IUNNCOPY ctrsm_iunncopy
@@ -125,6 +127,11 @@
125127
#define CGEMM_KERNEL_R cgemm_kernel_r
126128
#define CGEMM_KERNEL_B cgemm_kernel_b
127129

130+
#define CCOMM_KERNEL_N ccomm_kernel_n
131+
#define CCOMM_KERNEL_L ccomm_kernel_l
132+
#define CCOMM_KERNEL_R ccomm_kernel_r
133+
#define CCOMM_KERNEL_B ccomm_kernel_b
134+
128135
#define CTRMM_KERNEL_LN ctrmm_kernel_LN
129136
#define CTRMM_KERNEL_LT ctrmm_kernel_LT
130137
#define CTRMM_KERNEL_LR ctrmm_kernel_LR
@@ -320,17 +327,25 @@
320327
#define CTRMM_IUTNCOPY gotoblas -> ctrmm_iutncopy
321328
#define CTRMM_ILNNCOPY gotoblas -> ctrmm_ilnncopy
322329
#define CTRMM_ILTNCOPY gotoblas -> ctrmm_iltncopy
330+
#define CCOMM_NCOPY gotoblas -> ccomm_ncopy
331+
#define CCOMM_TCOPY gotoblas -> ccomm_tcopy
332+
323333
#define CTRSM_IUNNCOPY gotoblas -> ctrsm_iunncopy
324334
#define CTRSM_IUTNCOPY gotoblas -> ctrsm_iutncopy
325335
#define CTRSM_ILNNCOPY gotoblas -> ctrsm_ilnncopy
326336
#define CTRSM_ILTNCOPY gotoblas -> ctrsm_iltncopy
327337

328-
#define CGEMM_BETA gotoblas -> cgemm_beta
338+
#define CGEMM_BETA gotoblas -> cgemm_beta
329339
#define CGEMM_KERNEL_N gotoblas -> cgemm_kernel_n
330340
#define CGEMM_KERNEL_L gotoblas -> cgemm_kernel_l
331341
#define CGEMM_KERNEL_R gotoblas -> cgemm_kernel_r
332342
#define CGEMM_KERNEL_B gotoblas -> cgemm_kernel_b
333343

344+
#define CCOMM_KERNEL_N gotoblas -> ccomm_kernel_n
345+
#define CCOMM_KERNEL_L gotoblas -> ccomm_kernel_l
346+
#define CCOMM_KERNEL_R gotoblas -> ccomm_kernel_r
347+
#define CCOMM_KERNEL_B gotoblas -> ccomm_kernel_b
348+
334349
#define CTRMM_KERNEL_LN gotoblas -> ctrmm_kernel_LN
335350
#define CTRMM_KERNEL_LT gotoblas -> ctrmm_kernel_LT
336351
#define CTRMM_KERNEL_LR gotoblas -> ctrmm_kernel_LR

common_d.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@
100100
#define DTRMM_ILNNCOPY dtrmm_ilnncopy
101101
#define DTRMM_ILTUCOPY dtrmm_iltucopy
102102
#define DTRMM_ILTNCOPY dtrmm_iltncopy
103+
#define DCOMM_NCOPY dcomm_ncopy
104+
#define DCOMM_TCOPY dcomm_tcopy
103105

104106
#define DTRSM_IUNUCOPY dtrsm_iunucopy
105107
#define DTRSM_IUNNCOPY dtrsm_iunncopy
@@ -114,6 +116,7 @@
114116
#define DGEMM_BETA dgemm_beta
115117

116118
#define DGEMM_KERNEL dgemm_kernel
119+
#define DCOMM_KERNEL dcomm_kernel
117120

118121
#define DTRMM_KERNEL_LN dtrmm_kernel_LN
119122
#define DTRMM_KERNEL_LT dtrmm_kernel_LT
@@ -239,13 +242,17 @@
239242
#define DTRMM_IUTNCOPY gotoblas -> dtrmm_iutncopy
240243
#define DTRMM_ILNNCOPY gotoblas -> dtrmm_ilnncopy
241244
#define DTRMM_ILTNCOPY gotoblas -> dtrmm_iltncopy
245+
#define DCOMM_NCOPY gotoblas -> dcomm_ncopy
246+
#define DCOMM_TCOPY gotoblas -> dcomm_tcopy
247+
242248
#define DTRSM_IUNNCOPY gotoblas -> dtrsm_iunncopy
243249
#define DTRSM_IUTNCOPY gotoblas -> dtrsm_iutncopy
244250
#define DTRSM_ILNNCOPY gotoblas -> dtrsm_ilnncopy
245251
#define DTRSM_ILTNCOPY gotoblas -> dtrsm_iltncopy
246252

247-
#define DGEMM_BETA gotoblas -> dgemm_beta
253+
#define DGEMM_BETA gotoblas -> dgemm_beta
248254
#define DGEMM_KERNEL gotoblas -> dgemm_kernel
255+
#define DCOMM_KERNEL gotoblas -> dcomm_kernel
249256

250257
#define DTRMM_KERNEL_LN gotoblas -> dtrmm_kernel_LN
251258
#define DTRMM_KERNEL_LT gotoblas -> dtrmm_kernel_LT

common_level3.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2025 The OpenBLAS Project. */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -284,6 +285,8 @@ int strmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX
284285
int strmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
285286
int strmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
286287
int strmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
288+
int scomm_ncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
289+
int scomm_tcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
287290
int strmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
288291
int strmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
289292
int strmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
@@ -301,6 +304,8 @@ int dtrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG pos
301304
int dtrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
302305
int dtrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
303306
int dtrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
307+
int dcomm_tcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b);
308+
int dcomm_ncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b);
304309
int dtrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
305310
int dtrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
306311
int dtrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
@@ -335,6 +340,8 @@ int ctrmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX
335340
int ctrmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
336341
int ctrmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
337342
int ctrmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
343+
int ccomm_tcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
344+
int ccomm_ncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
338345
int ctrmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
339346
int ctrmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
340347
int ctrmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b);
@@ -352,6 +359,8 @@ int ztrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG pos
352359
int ztrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
353360
int ztrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
354361
int ztrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
362+
int zcomm_tcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b);
363+
int zcomm_ncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b);
355364
int ztrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
356365
int ztrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
357366
int ztrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b);
@@ -579,6 +588,8 @@ int bgemm_kernel(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, bfloat16 *
579588
int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
580589
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
581590
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
591+
int scomm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
592+
int dcomm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
582593

583594
#ifdef QUAD_PRECISION
584595
int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble *, xdouble *, BLASLONG);
@@ -728,6 +739,16 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float
728739
int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
729740
int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
730741

742+
int ccomm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
743+
int ccomm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
744+
int ccomm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
745+
int ccomm_kernel_b(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
746+
747+
int zcomm_kernel_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
748+
int zcomm_kernel_l(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
749+
int zcomm_kernel_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
750+
int zcomm_kernel_b(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
751+
731752
int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG);
732753
int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG);
733754
int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG);

common_macro.h

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,9 @@
407407
#define TRMM_IUTCOPY DTRMM_IUTUCOPY
408408
#define TRMM_ILNCOPY DTRMM_ILNUCOPY
409409
#define TRMM_ILTCOPY DTRMM_ILTUCOPY
410+
#define COMM_NCOPY DCOMM_NCOPY
411+
#define COMM_TCOPY DCOMM_TCOPY
412+
410413
#define TRSM_IUNCOPY DTRSM_IUNUCOPY
411414
#define TRSM_IUTCOPY DTRSM_IUTUCOPY
412415
#define TRSM_ILNCOPY DTRSM_ILNUCOPY
@@ -427,6 +430,9 @@
427430
#define TRMM_IUTCOPY DTRMM_IUTNCOPY
428431
#define TRMM_ILNCOPY DTRMM_ILNNCOPY
429432
#define TRMM_ILTCOPY DTRMM_ILTNCOPY
433+
#define COMM_NCOPY DCOMM_NCOPY
434+
#define COMM_TCOPY DCOMM_TCOPY
435+
430436
#define TRSM_IUNCOPY DTRSM_IUNNCOPY
431437
#define TRSM_IUTCOPY DTRSM_IUTNCOPY
432438
#define TRSM_ILNCOPY DTRSM_ILNNCOPY
@@ -441,6 +447,11 @@
441447
#define GEMM_KERNEL_R DGEMM_KERNEL
442448
#define GEMM_KERNEL_B DGEMM_KERNEL
443449

450+
#define COMM_KERNEL_N DCOMM_KERNEL
451+
#define COMM_KERNEL_L DCOMM_KERNEL
452+
#define COMM_KERNEL_R DCOMM_KERNEL
453+
#define COMM_KERNEL_B DCOMM_KERNEL
454+
444455
#define TRMM_KERNEL_LN DTRMM_KERNEL_LN
445456
#define TRMM_KERNEL_LT DTRMM_KERNEL_LT
446457
#define TRMM_KERNEL_LR DTRMM_KERNEL_LN
@@ -867,6 +878,9 @@
867878
#define TRMM_IUTCOPY STRMM_IUTUCOPY
868879
#define TRMM_ILNCOPY STRMM_ILNUCOPY
869880
#define TRMM_ILTCOPY STRMM_ILTUCOPY
881+
#define COMM_NCOPY SCOMM_NCOPY
882+
#define COMM_TCOPY SCOMM_TCOPY
883+
870884
#define TRSM_IUNCOPY STRSM_IUNUCOPY
871885
#define TRSM_IUTCOPY STRSM_IUTUCOPY
872886
#define TRSM_ILNCOPY STRSM_ILNUCOPY
@@ -887,6 +901,9 @@
887901
#define TRMM_IUTCOPY STRMM_IUTNCOPY
888902
#define TRMM_ILNCOPY STRMM_ILNNCOPY
889903
#define TRMM_ILTCOPY STRMM_ILTNCOPY
904+
#define COMM_NCOPY SCOMM_NCOPY
905+
#define COMM_TCOPY SCOMM_TCOPY
906+
890907
#define TRSM_IUNCOPY STRSM_IUNNCOPY
891908
#define TRSM_IUTCOPY STRSM_IUTNCOPY
892909
#define TRSM_ILNCOPY STRSM_ILNNCOPY
@@ -1118,6 +1135,9 @@
11181135
#define TRMM_IUTCOPY STRMM_IUTUCOPY
11191136
#define TRMM_ILNCOPY STRMM_ILNUCOPY
11201137
#define TRMM_ILTCOPY STRMM_ILTUCOPY
1138+
#define COMM_NCOPY SCOMM_NCOPY
1139+
#define COMM_TCOPY SCOMM_TCOPY
1140+
11211141
#define TRSM_IUNCOPY STRSM_IUNUCOPY
11221142
#define TRSM_IUTCOPY STRSM_IUTUCOPY
11231143
#define TRSM_ILNCOPY STRSM_ILNUCOPY
@@ -1138,6 +1158,9 @@
11381158
#define TRMM_IUTCOPY STRMM_IUTNCOPY
11391159
#define TRMM_ILNCOPY STRMM_ILNNCOPY
11401160
#define TRMM_ILTCOPY STRMM_ILTNCOPY
1161+
#define COMM_NCOPY SCOMM_NCOPY
1162+
#define COMM_TCOPY SCOMM_TCOPY
1163+
11411164
#define TRSM_IUNCOPY STRSM_IUNNCOPY
11421165
#define TRSM_IUTCOPY STRSM_IUTNCOPY
11431166
#define TRSM_ILNCOPY STRSM_ILNNCOPY
@@ -1152,6 +1175,11 @@
11521175
#define GEMM_KERNEL_R SGEMM_KERNEL
11531176
#define GEMM_KERNEL_B SGEMM_KERNEL
11541177

1178+
#define COMM_KERNEL_N SCOMM_KERNEL
1179+
#define COMM_KERNEL_L SCOMM_KERNEL
1180+
#define COMM_KERNEL_R SCOMM_KERNEL
1181+
#define COMM_KERNEL_B SCOMM_KERNEL
1182+
11551183
#define TRMM_KERNEL_LN STRMM_KERNEL_LN
11561184
#define TRMM_KERNEL_LT STRMM_KERNEL_LT
11571185
#define TRMM_KERNEL_LR STRMM_KERNEL_LN
@@ -1859,6 +1887,9 @@
18591887
#define TRMM_IUTCOPY ZTRMM_IUTUCOPY
18601888
#define TRMM_ILNCOPY ZTRMM_ILNUCOPY
18611889
#define TRMM_ILTCOPY ZTRMM_ILTUCOPY
1890+
#define COMM_NCOPY ZCOMM_NCOPY
1891+
#define COMM_TCOPY ZCOMM_TCOPY
1892+
18621893
#define TRSM_IUNCOPY ZTRSM_IUNUCOPY
18631894
#define TRSM_IUTCOPY ZTRSM_IUTUCOPY
18641895
#define TRSM_ILNCOPY ZTRSM_ILNUCOPY
@@ -1879,6 +1910,9 @@
18791910
#define TRMM_IUTCOPY ZTRMM_IUTNCOPY
18801911
#define TRMM_ILNCOPY ZTRMM_ILNNCOPY
18811912
#define TRMM_ILTCOPY ZTRMM_ILTNCOPY
1913+
#define COMM_NCOPY ZCOMM_NCOPY
1914+
#define COMM_TCOPY ZCOMM_TCOPY
1915+
18821916
#define TRSM_IUNCOPY ZTRSM_IUNNCOPY
18831917
#define TRSM_IUTCOPY ZTRSM_IUTNCOPY
18841918
#define TRSM_ILNCOPY ZTRSM_ILNNCOPY
@@ -1921,6 +1955,11 @@
19211955
#define GEMM_KERNEL_R ZGEMM_KERNEL_R
19221956
#define GEMM_KERNEL_B ZGEMM_KERNEL_B
19231957

1958+
#define COMM_KERNEL_N ZCOMM_KERNEL_N
1959+
#define COMM_KERNEL_L ZCOMM_KERNEL_L
1960+
#define COMM_KERNEL_R ZCOMM_KERNEL_R
1961+
#define COMM_KERNEL_B ZCOMM_KERNEL_B
1962+
19241963
#define GEMM3M_KERNEL ZGEMM3M_KERNEL
19251964

19261965
#define TRMM_KERNEL_LN ZTRMM_KERNEL_LN
@@ -2324,6 +2363,9 @@
23242363
#define TRMM_IUTCOPY CTRMM_IUTUCOPY
23252364
#define TRMM_ILNCOPY CTRMM_ILNUCOPY
23262365
#define TRMM_ILTCOPY CTRMM_ILTUCOPY
2366+
#define COMM_NCOPY CCOMM_NCOPY
2367+
#define COMM_TCOPY CCOMM_TCOPY
2368+
23272369
#define TRSM_IUNCOPY CTRSM_IUNUCOPY
23282370
#define TRSM_IUTCOPY CTRSM_IUTUCOPY
23292371
#define TRSM_ILNCOPY CTRSM_ILNUCOPY
@@ -2344,6 +2386,9 @@
23442386
#define TRMM_IUTCOPY CTRMM_IUTNCOPY
23452387
#define TRMM_ILNCOPY CTRMM_ILNNCOPY
23462388
#define TRMM_ILTCOPY CTRMM_ILTNCOPY
2389+
#define COMM_NCOPY CCOMM_NCOPY
2390+
#define COMM_TCOPY CCOMM_TCOPY
2391+
23472392
#define TRSM_IUNCOPY CTRSM_IUNNCOPY
23482393
#define TRSM_IUTCOPY CTRSM_IUTNCOPY
23492394
#define TRSM_ILNCOPY CTRSM_ILNNCOPY
@@ -2386,6 +2431,11 @@
23862431
#define GEMM_KERNEL_R CGEMM_KERNEL_R
23872432
#define GEMM_KERNEL_B CGEMM_KERNEL_B
23882433

2434+
#define COMM_KERNEL_N CCOMM_KERNEL_N
2435+
#define COMM_KERNEL_L CCOMM_KERNEL_L
2436+
#define COMM_KERNEL_R CCOMM_KERNEL_R
2437+
#define COMM_KERNEL_B CCOMM_KERNEL_B
2438+
23892439
#define GEMM3M_KERNEL CGEMM3M_KERNEL
23902440

23912441
#define TRMM_KERNEL_LN CTRMM_KERNEL_LN

0 commit comments

Comments
 (0)