
Commit 0379891

WOQ INT8: pad N if it is not a multiple of block_n (#3370)
* WOQ INT8: pad N if it is not a multiple of block_n
* Update UT
* Fix unpack
* Fix unpack issue with padded_N
1 parent 936d11b commit 0379891
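In effect, when the output-channel dimension N of a weight-only-quantized INT8 linear weight is not a multiple of the packing block size block_n, the weight is now zero-padded up to the next multiple before packing, and the padding is hidden again when the weight or bias is handed back to the user. A quick sketch of the size arithmetic, with an illustrative block size rather than one taken from the kernels:

def round_up_to_block(n: int, block_n: int) -> int:
    """Smallest multiple of block_n that is >= n (no-op when already aligned)."""
    return n if n % block_n == 0 else n + (block_n - n % block_n)

# The new unit test uses N=500 and expects a padded size of 512,
# consistent with e.g. block_n = 32 (illustrative value, not from the repo).
assert round_up_to_block(500, 32) == 512
assert round_up_to_block(4096, 32) == 4096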

File tree

5 files changed: +79 -20 lines changed


csrc/cpu/aten/Linear.cpp

Lines changed: 4 additions & 1 deletion
@@ -404,7 +404,10 @@ at::Tensor woq_linear_pack_weight(
         kCPU, weight_int4, weight_dtype, block_n, block_k, lowp_mode);
   }
   if (N % block_n) {
-    return weight;
+    at::Tensor weight_padded =
+        at::pad(weight, {0, 0, 0, block_n - N % block_n}, "constant", 0);
+    return woq_tpp_gemm_packB_stub(
+        kCPU, weight_padded, weight_dtype, block_n, block_k, lowp_mode);
   } else {
     return woq_tpp_gemm_packB_stub(
         kCPU, weight, weight_dtype, block_n, block_k, lowp_mode);
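The pad spec {0, 0, 0, block_n - N % block_n} follows the same last-dimension-first convention as torch.nn.functional.pad, so for a 2-D [N, K] weight it leaves K untouched and appends zero rows until N is a multiple of block_n. A minimal Python sketch of the same operation; the sizes and the block_n value here are illustrative, not taken from the kernels:

import torch
import torch.nn.functional as F

# Illustrative sizes: N is not a multiple of the assumed block size.
N, K, block_n = 500, 512, 32
weight = torch.randn(N, K)

# Pad spec is last-dim-first: (K_left, K_right, N_top, N_bottom).
pad_rows = (block_n - N % block_n) % block_n
padded = F.pad(weight, (0, 0, 0, pad_rows), mode="constant", value=0)

assert padded.shape == (512, 512)             # 500 rounded up to a multiple of 32
assert torch.equal(padded[:N], weight)        # original rows are preserved
assert torch.count_nonzero(padded[N:]) == 0   # appended rows are zeros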

csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp

Lines changed: 9 additions & 17 deletions
@@ -725,24 +725,16 @@ at::Tensor unpack(ContextLinearWoq& context, const at::Tensor& tensor) {
     unpacked_weight = woq_shuffle_weight_back_by_group_idx(
         unpacked_weight, context.weight_shape_, g_idx, group_size);
   }
-  if (tensor.dim() > 2) {
-    auto scales = context.scales_list_[0];
-    auto zero_points = context.zero_points_list_[0];
-    if (context.is_4bit_) {
-      auto unpacked_shape = unpacked_weight.sizes().vec(); // = N * K/2
-      auto shape = context.weight_shape_;
-      shape.back() /= 2;
-      at::Tensor qweight =
-          at::empty(shape, device(c10::kCPU).dtype(c10::kByte));
-      assert(qweight.numel() % 2 == 0);
-      std::memcpy(
-          qweight.data_ptr(), unpacked_weight.data_ptr(), qweight.numel());
-      return qweight;
-    } else { // int8
-      return unpacked_weight;
-    }
+  auto shape = context.weight_shape_;
+  if (context.is_4bit_) {
+    shape.back() = (shape.back() + 1) / 2;
   }
-  return unpacked_weight;
+  // weight may be padded. Copy data according to original shape
+  at::Tensor qweight =
+      at::empty(shape, device(c10::kCPU).dtype(unpacked_weight.scalar_type()));
+  assert(qweight.numel() % 2 == 0);
+  std::memcpy(qweight.data_ptr(), unpacked_weight.data_ptr(), qweight.numel());
+  return qweight;
 }
 
 template <typename T, typename Tg, bool is_4bit = false>
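The rewritten unpack path allocates qweight with the original weight_shape_ (halved in the last dimension for 4-bit weights) and copies only that many elements out of the possibly padded buffer. Because the buffer is row-major and the padding rows sit at the end, a flat copy of the original element count is equivalent to dropping the padded tail. A small sketch with made-up shapes, using the INT8 case:

import torch

# Illustrative shapes: the unpacked buffer was padded to 512 rows,
# but the original weight had only 500 rows (context.weight_shape_).
orig_shape = (500, 512)
padded_unpacked = torch.randint(-128, 127, (512, 512), dtype=torch.int8)

# Row-major layout means the first 500*512 elements are exactly the
# original rows, so a flat copy of that many elements is the same as
# slicing off the padded tail.
numel = orig_shape[0] * orig_shape[1]
qweight = padded_unpacked.flatten()[:numel].reshape(orig_shape)

assert torch.equal(qweight, padded_unpacked[:500])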

csrc/cpu/jit/cpu/kernels/OpContext.cpp

Lines changed: 6 additions & 0 deletions
@@ -427,6 +427,12 @@ at::Tensor IpexWoqLinearOpContext::get_at_packed_weight() {
 }
 
 c10::optional<at::Tensor> IpexWoqLinearOpContext::get_at_bias() {
+  if (op_context_.at_bias_.has_value()) {
+    auto b = op_context_.at_bias_.value();
+    if (b.size(0) > op_context_.weight_shape_[0]) {
+      return c10::make_optional(b.narrow(0, 0, op_context_.weight_shape_[0]));
+    }
+  }
   return op_context_.at_bias_;
 }
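get_at_bias() now trims a bias that was padded along with the weight back to the original output-channel count weight_shape_[0], so callers such as to_public() (see the OpContext.h change below) see a bias of the un-padded size. A small sketch of the narrow call with made-up sizes:

import torch

# Illustrative values: the stored bias was padded to 512 entries,
# but weight_shape_[0] says the layer really has 500 output channels.
orig_out_features = 500
padded_bias = torch.randn(512)

# narrow(dim, start, length) returns a view of the first 500 entries,
# mirroring b.narrow(0, 0, weight_shape_[0]) in the diff.
bias = padded_bias.narrow(0, 0, orig_out_features)

assert bias.shape == (500,)
assert torch.equal(bias, padded_bias[:500])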

csrc/cpu/jit/cpu/kernels/OpContext.h

Lines changed: 1 addition & 1 deletion
@@ -383,7 +383,7 @@ class WoqLinearOpContext : public torch::jit::CustomClassHolder {
     auto orig_weight_ = this->to_public(this->get_at_packed_weight());
     auto weight_dtype_ = this->get_context().weight_dtype_;
     auto weight_shape_ = this->get_weight_shape();
-    auto orig_bias_ = this->get_context().at_bias_;
+    auto orig_bias_ = this->get_at_bias();
     auto scales = this->get_scales();
     auto zero_points = this->get_zero_points();
     auto g_idx = this->get_g_idx();

tests/cpu/test_quantization_default_recipe.py

Lines changed: 59 additions & 1 deletion
@@ -989,7 +989,7 @@ def test(feature, has_bias, w_dtype):
         shape_list = [
             [3, 31, 31],
             [4, 4096, 4096],
-            [9, 4095, 4095],
+            [4, 4096, 4080],
             [196, 4095, 16383],
             [1024, 512, 512],
         ]
@@ -2260,6 +2260,64 @@ def test(feature, has_bias, w_dtype, lowp_mode, enable_amp):
         for shape, use_bias, w_dtype, lowp_mode, enable_amp in cases:
             test(shape, use_bias, w_dtype, lowp_mode, enable_amp)
 
+    def test_weight_padding(self):
+        """
+        If N of weight shape N * K is not a multiple of block_n, it is padded to be a multiple of block_n.
+        """
+
+        class Mod(nn.Module):
+            def __init__(self, input_channel, output_channel, has_bias):
+                super(Mod, self).__init__()
+                self.linear = torch.nn.Linear(input_channel, output_channel, has_bias)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        def test(M, has_bias, w_dtype):
+            N, K, N_padded = 500, 512, 512
+            model = Mod(K, N, has_bias)
+            m = model.eval()
+            m_ref = Mod(K, N_padded, False).eval()
+            data = torch.rand(M, K)
+            weight = model.linear.weight
+            weight_int4, w_scales, w_zero_points = quantize_per_channel(
+                weight,
+                w_dtype,
+                sym_quant=True if w_dtype == WoqWeightDtype.NF4 else False,
+            )
+            weight_fp32 = dequantize_per_channel(
+                weight_int4, w_scales, w_zero_points, w_dtype, weight.shape
+            )
+            if has_bias:
+                bias = model.linear.bias
+                output1 = torch.matmul(data, weight_fp32.T) + bias
+            else:
+                output1 = torch.matmul(data, weight_fp32.T)
+
+            qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
+                weight_dtype=w_dtype
+            )
+            prepared_model = prepare(m, qconfig, example_inputs=data, inplace=False)
+            prepared_model_ref = prepare(
+                m_ref, qconfig, example_inputs=data, inplace=False
+            )
+            with torch.no_grad():
+                woq_model = convert(prepared_model)
+                woq_model_ref = convert(prepared_model_ref)
+                assert (
+                    woq_model.linear.weight.shape == woq_model_ref.linear.weight.shape
+                )
+
+                output2 = woq_model(data)
+                torch.testing.assert_close(output1, output2)
+
+        M_list = [4, 1024]
+        use_bias_list = [True, False]
+        w_dtype_list = [WoqWeightDtype.INT8, WoqWeightDtype.INT4, WoqWeightDtype.NF4]
+        cases = itertools.product(M_list, use_bias_list, w_dtype_list)
+        for M, use_bias, w_dtype in cases:
+            test(M, use_bias, w_dtype)
+
 
 if __name__ == "__main__":
     test = unittest.main()
