update row_major for origin PVC/ARC template

sunjiweiswift · sunjiweiswift · commit 0ebd89065bd6 · 2024-06-17T22:00:59.000+08:00
diff --git a/include/common/core/base_consts.hpp b/include/common/core/base_consts.hpp
@@ -25,7 +25,6 @@ namespace gpu::xetla {
 
 /// @addtogroup xetla_core_base_consts
 /// @{
-enum quant_mode : uint8_t { S4_ASYM, S4_FULLRANGE_NO_ZP };
 /// @} xetla_core_base_consts
 
 } // namespace gpu::xetla
diff --git a/include/common/core/common_types.hpp b/include/common/core/common_types.hpp
@@ -26,4 +26,13 @@ enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };
 enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };
 
 enum class mem_layout : uint8_t { row_major = 0, col_major = 1 };
+
+enum class quant_mode : uint8_t { S4_ASYM = 0, S4_FULLRANGE_NO_ZP = 1 }; // S4_ASYM: dequant subtracts a loaded zero point; S4_FULLRANGE_NO_ZP: no zero point is loaded, a constant 8 is subtracted
+
+struct quant_info { // int4 quantization settings bundled into one value; passed as a single non-type template argument to compute_policy_int4_dequantize
+  quant_mode quant_mode; // selects the zero-point handling; NOTE(review): member reuses the enum type's name, which GCC is known to diagnose ("changes meaning") - confirm against supported compilers
+  uint32_t dequant_s; // divisor used when indexing scale / zero-point blocks; presumably the quantization group size along K - TODO confirm
+  mem_layout weight_mem_layout; // weight (matrix B) layout; the fpu compute policy picks its A/B block sizes from col_major vs. row_major
+};
+
 } // namespace gpu::xetla
diff --git a/include/experimental/group/gemm/compute_policy.hpp b/include/experimental/group/gemm/compute_policy.hpp
@@ -31,8 +31,7 @@ template <
     typename perf_tuning_knob_,
     typename dtype_scale_,
     typename dtype_zero_pt_,
-    quant_mode quant_type_,
-    uint32_t dequant_s_,
+    quant_info quant_info_,
     mma_engine mma_engine_ = mma_engine::xmx,
     gpu_arch arch_tag_ = gpu_arch::XeHpc,
     typename enable = void>
@@ -44,17 +43,15 @@ template <
     typename perf_tuning_knob_,
     typename dtype_scale_,
     typename dtype_zero_pt_,
-    quant_mode quant_type_,
-    int dequant_s_,
+    quant_info quant_info_,
     mma_engine mma_engine_,
     gpu_arch arch_tag_>
 struct compute_policy_int4_dequantize<
     compute_attr_,
     perf_tuning_knob_,
     dtype_scale_,
     dtype_zero_pt_,
-    quant_type_,
-    dequant_s_,
+    quant_info_,
     mma_engine_,
     arch_tag_,
     std::enable_if_t<mma_engine_ == mma_engine::xmx>> {
@@ -70,17 +67,17 @@ struct compute_policy_int4_dequantize<
   static constexpr mma_engine mma_engine = mma_engine_;
   static constexpr gpu_arch arch_tag = arch_tag_;
 
-  static_assert(arch_has_xmx<arch_tag>(), "XeLpg does not support xmx");
+  static_assert(arch_has_xmx<arch_tag>, "XeLpg does not support xmx");
 
   static constexpr bool is_int4_matB_policy = true;
 
-  static constexpr uint32_t dequant_s = dequant_s_;
+  static constexpr uint32_t dequant_s = quant_info_.dequant_s;
   static_assert(
       (dequant_s % (32 / sizeof(dtype_mma_b))) == 0,
       "dequant_s should be a multiply of 32B");
   using dtype_scale = dtype_scale_;
   using dtype_zero_pt = dtype_zero_pt_;
-  static constexpr quant_mode quant_type = quant_type_;
+  static constexpr quant_mode quant_mode = quant_info_.quant_mode;
 
   static constexpr uint32_t block_size_y_a = 16;
   using mma_attr = mma_attr_t<arch_tag_, block_size_y_a>;
@@ -103,17 +100,15 @@ template <
     typename perf_tuning_knob_,
     typename dtype_scale_,
     typename dtype_zero_pt_,
-    quant_mode quant_type_,
-    int dequant_s_,
+    quant_info quant_info_,
     mma_engine mma_engine_,
     gpu_arch arch_tag_>
 struct compute_policy_int4_dequantize<
     compute_attr_,
     perf_tuning_knob_,
     dtype_scale_,
     dtype_zero_pt_,
-    quant_type_,
-    dequant_s_,
+    quant_info_,
     mma_engine_,
     arch_tag_,
     std::enable_if_t<mma_engine_ == mma_engine::fpu>> {
@@ -131,20 +126,22 @@ struct compute_policy_int4_dequantize<
 
   static constexpr bool is_int4_matB_policy = true;
 
-  static constexpr uint32_t dequant_s = dequant_s_;
+  static constexpr uint32_t dequant_s = quant_info_.dequant_s;
   static_assert(
       (dequant_s % (32 / sizeof(dtype_mma_b))) == 0,
       "dequant_s should be a multiply of 32B");
   using dtype_scale = dtype_scale_;
   using dtype_zero_pt = dtype_zero_pt_;
-  static constexpr quant_mode quant_type = quant_type_;
+  static constexpr quant_mode quant_mode = quant_info_.quant_mode;
+  static constexpr bool is_col_major_b =
+      quant_info_.weight_mem_layout == mem_layout::col_major;
 
-  static constexpr uint32_t block_size_y_a = 4;
-  static constexpr uint32_t block_bytes_x_a = 256;
+  static constexpr uint32_t block_size_y_a = is_col_major_b ? 8 : 16;
+  static constexpr uint32_t block_bytes_x_a = is_col_major_b ? 256 : 32;
   static constexpr uint32_t block_size_x_a =
       block_bytes_x_a / sizeof(dtype_mma_a);
-  static constexpr uint32_t block_size_x_b = 1;
-  static constexpr uint32_t block_bytes_y_b = 256;
+  static constexpr uint32_t block_size_x_b = is_col_major_b ? 1 : 32;
+  static constexpr uint32_t block_bytes_y_b = is_col_major_b ? 256 : 32;
   static constexpr uint32_t block_size_y_b =
       block_bytes_y_b / sizeof(dtype_mma_b);
 
diff --git a/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp b/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp
@@ -36,8 +36,7 @@ template <
     typename mem_desc_b_t_,
     typename dtype_scale_,
     typename dtype_zero_pt_,
-    uint32_t dequant_s_,
-    quant_mode quant_type_,
+    quant_info quant_info_,
     mma_engine mma_engine_,
     typename pre_processing_t_,
     gpu_arch arch_tag_>
@@ -47,8 +46,7 @@ class gemm_t<
         perf_tuning_knob_,
         dtype_scale_,
         dtype_zero_pt_,
-        quant_type_,
-        dequant_s_,
+        quant_info_,
         mma_engine_,
         arch_tag_>,
     tile_shape_, // tile shape of workgroup-level gemm
@@ -66,8 +64,7 @@ class gemm_t<
       perf_tuning_knob_,
       dtype_scale_,
       dtype_zero_pt_,
-      quant_type_,
-      dequant_s_,
+      quant_info_,
       mma_engine_,
       arch_tag_>;
   static constexpr uint32_t k_stride = compute_policy::k_stride;
@@ -80,6 +77,7 @@ class gemm_t<
 
   constexpr static gpu_arch arch_tag = compute_policy::arch_tag;
   static constexpr uint32_t dequant_s = compute_policy::dequant_s;
+  static constexpr quant_mode quant_mode = compute_policy::quant_mode;
   using dtype_b = typename mem_desc_b_t::dtype;
   using dtype_zero_pt = typename compute_policy::dtype_zero_pt;
   static constexpr uint32_t pack_ratio = sizeof(dtype_b) * 2;
@@ -328,7 +326,7 @@ class gemm_t<
       scale_t,
       zero_pt_t,
       dequant_s,
-      quant_type_>;
+      quant_mode>;
   static constexpr bool enable_periodic_sync = (sync_freq != 0);
   static constexpr uint32_t barrier_count_x = wg_size_y > 1 ? wg_size_x : 0;
   static constexpr uint32_t barrier_count_y = wg_size_x > 1 ? wg_size_y : 0;
@@ -531,7 +529,7 @@ class gemm_t<
       subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
           scale_prefetch_payload);
       if constexpr (
-          compute_policy::quant_type != quant_mode::S4_FULLRANGE_NO_ZP) {
+          compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
         // TODO 1D prefetch need pack to U32/U64
         subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
             zero_pt_prefetch_payload);
@@ -545,7 +543,7 @@ class gemm_t<
         scale_prefetch_payload.template update_tdesc<update_dir_b>(
             scale_t::tile_size_y);
         if constexpr (
-            compute_policy::quant_type != quant_mode::S4_FULLRANGE_NO_ZP) {
+            compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
           zero_pt_prefetch_payload
               .template update_tdesc<tdesc_update_dir::y_dir>(
                   zero_pt_t::tile_size_y);
@@ -575,7 +573,7 @@ class gemm_t<
       subgroup::tile_load<cache_hint::cached, cache_hint::cached>(
           scale, scale_payload);
       if constexpr (
-          compute_policy::quant_type != quant_mode::S4_FULLRANGE_NO_ZP) {
+          compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
         subgroup::tile_load<cache_hint::cached, cache_hint::cached>(
             zero_pt, zero_pt_payload);
       }
@@ -590,7 +588,7 @@ class gemm_t<
         subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
             scale_prefetch_payload);
         if constexpr (
-            compute_policy::quant_type != quant_mode::S4_FULLRANGE_NO_ZP) {
+            compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
           // TODO 1D prefetch need pack to U32/U64
           subgroup::tile_prefetch<cache_hint::cached, cache_hint::cached>(
               zero_pt_prefetch_payload);
@@ -604,7 +602,7 @@ class gemm_t<
         scale_payload.template update_tdesc<update_dir_b>(scale_t::tile_size_y);
       }
       if constexpr (
-          compute_policy::quant_type != quant_mode::S4_FULLRANGE_NO_ZP) {
+          compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
         if (tile_k_idx % zero_pt_addr_update_freq == 0) {
           zero_pt_payload.template update_tdesc<tdesc_update_dir::y_dir>(
               zero_pt_t::tile_size_y);
@@ -619,7 +617,7 @@ class gemm_t<
           scale_prefetch_payload.template update_tdesc<tdesc_update_dir::y_dir>(
               scale_t::tile_size_y);
           if constexpr (
-              compute_policy::quant_type != quant_mode::S4_FULLRANGE_NO_ZP) {
+              compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
             zero_pt_prefetch_payload
                 .template update_tdesc<tdesc_update_dir::y_dir>(
                     zero_pt_t::tile_size_y);
@@ -717,7 +715,7 @@ class gemm_t<
   //                 (offset_y_in_tile) / dequant_s * scale_t::block_size_x +
   //                 offset_x_in_tile;
 
-  //             if constexpr (compute_policy::quant_type ==
+  //             if constexpr (compute_policy::quant_mode ==
   //             quant_mode::S4_ASYM) {
   //               uint32_t zero_pt_idx =
   //                   offset_y_in_tile / dequant_s * zero_pt_t::block_size_x +
@@ -739,7 +737,7 @@ class gemm_t<
   //                   cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b +
   //                   ii) - zero_pt_i8;
   //             } else if constexpr (
-  //                 compute_policy::quant_type ==
+  //                 compute_policy::quant_mode ==
   //                 quant_mode::S4_FULLRANGE_NO_ZP) {
   //               cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) =
   //                   cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b +
@@ -791,7 +789,7 @@ class gemm_t<
         xetla_vector<uint8_t, block_size_x_b * block_size_y_b> cvt_blk;
 
         xetla_vector<int32_t, block_size_x_b * block_size_y_b> cvt_blk_i32;
-        if constexpr (compute_policy::quant_type == quant_mode::S4_ASYM) {
+        if constexpr (compute_policy::quant_mode == quant_mode::S4_ASYM) {
           auto zero_pt_vec = zero_pt.reg
                                  .xetla_select<zero_pt_t::block_size_x, 1>(
                                      scale_block_id * zero_pt_t::block_size_x)
@@ -815,7 +813,7 @@ class gemm_t<
                zero_pt_blk.xetla_format<int8_t>());
         }
         if constexpr (
-            compute_policy::quant_type == quant_mode::S4_FULLRANGE_NO_ZP) {
+            compute_policy::quant_mode == quant_mode::S4_FULLRANGE_NO_ZP) {
           xetla_vector<int8_t, block_size_x_b * block_size_y_b> cvt_blk_i8;
           cvt_blk_i8.xetla_select<matB_t::block_elems, 2>(0) =
               matB_blk & 0x0f;
diff --git a/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp b/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
@@ -159,7 +159,7 @@ class gemm_universal_t<
   /// @brief GEMM arguments.
   /// This is the interface for users to pass the application-related runtime
   /// variables.
-  template <quant_mode quant_mode = S4_FULLRANGE_NO_ZP>
+  template <quant_mode quant_mode = quant_mode::S4_FULLRANGE_NO_ZP>
   struct arguments_t {
     /// @brief Is the size of the m dimension of the matrix multiplication (m x
     /// k x n).
@@ -295,7 +295,7 @@ class gemm_universal_t<
     }
   };
   template <>
-  struct arguments_t<S4_FULLRANGE_NO_ZP> {
+  struct arguments_t<quant_mode::S4_FULLRANGE_NO_ZP> {
     /// @brief Is the size of the m dimension of the matrix multiplication (m x
     /// k x n).
     uint32_t matrix_m;
@@ -566,7 +566,7 @@ class gemm_universal_t<
     implementable &=
         ((args.matB_ld % pack_ratio == 0) && (args.matrix_n % pack_ratio == 0));
     if constexpr (
-        gemm_t::compute_policy::quant_type != quant_mode::S4_FULLRANGE_NO_ZP) {
+        gemm_t::compute_policy::quant_mode != quant_mode::S4_FULLRANGE_NO_ZP) {
       implementable &= (args.zero_pt_ld % pack_ratio == 0);
     }
 
@@ -668,7 +668,7 @@ class gemm_universal_t<
     uint32_t inner_loop_count = (wg_tile_k + k_stride - 1) / k_stride;
     gemm_args_t gemm_args;
     if constexpr (
-        gemm_t::compute_policy::quant_type == quant_mode::S4_FULLRANGE_NO_ZP) {
+        gemm_t::compute_policy::quant_mode == quant_mode::S4_FULLRANGE_NO_ZP) {
       gemm_args = gemm_args_t(
           mem_desc_a,
           mem_desc_b,
diff --git a/include/subgroup/tile/impl/load_xe.hpp b/include/subgroup/tile/impl/load_xe.hpp
@@ -100,7 +100,7 @@ tile_load(tile_t& tile, payload_t& payload) {
   static constexpr bool reg_transpose = tile_desc::reg_transpose;
 
   static constexpr bool mem_transpose = payload_t::mem_transpose;
-  static constexpr bool trans = reg_transpose ^ mem_transpose;
+  static constexpr bool trans = payload_t::trans;
   static constexpr uint32_t scale_factor = payload_t::scale_factor;
 
   static constexpr bool mem_transform = payload_t::mem_transform;
@@ -535,9 +535,7 @@ tile_load(tile_t& tile, payload_t& payload) {
     // }
   }
 
-  if constexpr (
-      payload_t::trans &&
-      !(std::is_same_v<dtype, int4x2> || std::is_same_v<dtype, int4x8>)) {
+  if constexpr (payload_t::trans) {
     SW_BARRIER();
     tile_transpose(tile);
   }
@@ -604,9 +602,7 @@ tile_load(tile_t& tile, payload_t& payload) {
     }
   }
 
-  if constexpr (
-      payload_t::trans &&
-      !(std::is_same_v<dtype, int4x2> || std::is_same_v<dtype, int4x8>)) {
+  if constexpr (payload_t::trans) {
     SW_BARRIER();
     tile_transpose(tile);
   }
diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp
@@ -65,11 +65,14 @@ struct mem_payload_t<
       mem_payload_t<mem_desc_t, tile_desc, msg_type::block_2d, arch_tag>;
 
  public:
-  static constexpr bool mem_transpose = memory_layout == mem_layout::col_major;
+  static constexpr bool mem_transpose =
+      memory_layout == mem_layout::col_major &&
+      !(std::is_same_v<dtype_, int4x2> || std::is_same_v<dtype_, int4x8>);
 
   static constexpr reg_layout register_layout = tile_desc::register_layout;
   static constexpr bool reg_transpose =
       register_layout == reg_layout::transpose_tiled;
+
   static constexpr bool trans = mem_transpose ^ reg_transpose;
 
   static constexpr bool mem_transform = (sizeof(dtype) < 4) && !mem_transpose &&
diff --git a/include/subgroup/tile/impl/tile_op_functor.hpp b/include/subgroup/tile/impl/tile_op_functor.hpp
@@ -57,7 +57,7 @@ template <
     typename scale_t,
     typename zero_pt_t,
     uint32_t dequant_s,
-    quant_mode quant_type>
+    quant_mode quant_mode>
 struct dequant_int4_weight_t {
   struct arguments_t {
     uint32_t wg_start_m;
@@ -130,7 +130,7 @@ struct dequant_int4_weight_t {
                 (offset_y_in_tile) / dequant_s * scale_t::block_size_x +
                 offset_x_in_tile;
 
-            if constexpr (quant_type == quant_mode::S4_ASYM) {
+            if constexpr (quant_mode == quant_mode::S4_ASYM) {
               uint32_t zero_pt_idx =
                   offset_y_in_tile / dequant_s * zero_pt_t::block_size_x +
                   offset_x_in_tile / pack_ratio;
@@ -149,7 +149,7 @@ struct dequant_int4_weight_t {
               cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) =
                   cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) -
                   zero_pt_i8;
-            } else if constexpr (quant_type == quant_mode::S4_FULLRANGE_NO_ZP) {
+            } else if constexpr (quant_mode == quant_mode::S4_FULLRANGE_NO_ZP) {
               cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) =
                   cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) -
                   int8_t(8);
diff --git a/tests/integration/gemm/int4_dequantization/main.cpp b/tests/integration/gemm/int4_dequantization/main.cpp
diff --git a/tests/integration/gemm/int4_dequantization_bias/main_client.cpp b/tests/integration/gemm/int4_dequantization_bias/main_client.cpp
diff --git a/tests/integration/gemm/int4_dequantization_bias/main_xe.cpp b/tests/integration/gemm/int4_dequantization_bias/main_xe.cpp
diff --git a/tests/integration/gemv/int4/main.cpp b/tests/integration/gemv/int4/main.cpp

Original file line number	Diff line number	Diff line change
`@@ -100,7 +100,7 @@ tile_load(tile_t& tile, payload_t& payload) {`
`100`	`100`	`static constexpr bool reg_transpose = tile_desc::reg_transpose;`
`101`	`101`
`102`	`102`	`static constexpr bool mem_transpose = payload_t::mem_transpose;`
`103`		`- static constexpr bool trans = reg_transpose ^ mem_transpose;`
	`103`	`+ static constexpr bool trans = payload_t::trans;`
`104`	`104`	`static constexpr uint32_t scale_factor = payload_t::scale_factor;`
`105`	`105`
`106`	`106`	`static constexpr bool mem_transform = payload_t::mem_transform;`
`@@ -535,9 +535,7 @@ tile_load(tile_t& tile, payload_t& payload) {`
`535`	`535`	`// }`
`536`	`536`	`}`
`537`	`537`
`538`		`- if constexpr (`
`539`		`- payload_t::trans &&`
`540`		`- !(std::is_same_v<dtype, int4x2> \|\| std::is_same_v<dtype, int4x8>)) {`
	`538`	`+ if constexpr (payload_t::trans) {`
`541`	`539`	`SW_BARRIER();`
`542`	`540`	`tile_transpose(tile);`
`543`	`541`	`}`
`@@ -604,9 +602,7 @@ tile_load(tile_t& tile, payload_t& payload) {`
`604`	`602`	`}`
`605`	`603`	`}`
`606`	`604`
`607`		`- if constexpr (`
`608`		`- payload_t::trans &&`
`609`		`- !(std::is_same_v<dtype, int4x2> \|\| std::is_same_v<dtype, int4x8>)) {`
	`605`	`+ if constexpr (payload_t::trans) {`
`610`	`606`	`SW_BARRIER();`
`611`	`607`	`tile_transpose(tile);`
`612`	`608`	`}`