
Commit fd88038 (parent 4f366d7)

fix tests

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>

3 files changed (+27, -6)

src/llmcompressor/entrypoints/weights_ptq/__init__.py

Lines changed: 2 additions & 3 deletions

@@ -91,10 +91,9 @@ def _process_file(
 
     for name in list(tensors.keys()):
         module_name, param_name = name.rsplit(".", 1)
+        is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
         is_ignored = any(_match_name(module_name, ign) for ign in ignore)
-        is_weight = param_name == "weight"
-        if is_ignored or not is_weight:
-            print(f"skip {name}")
+        if not is_linear_weight or is_ignored:
             continue
 
         # 1. initialize module with qparams (on device)
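The new predicate drops the noisy per-tensor print and excludes norm weights by module name instead of relying on the ignore list. A minimal sketch of how it classifies typical checkpoint tensor names; the `matches` helper below is a simplified stand-in for the real `_match_name`, which also understands "re:" patterns:

def matches(module_name: str, pattern: str) -> bool:
    # simplified stand-in: exact-name match only
    return module_name == pattern

ignore = ["model.embed_tokens", "lm_head"]
names = [
    "model.layers.0.self_attn.q_proj.weight",  # quantize: linear weight
    "model.layers.0.input_layernorm.weight",   # skip: module name ends with "norm"
    "model.embed_tokens.weight",               # skip: explicitly ignored
    "model.layers.0.self_attn.q_proj.bias",    # skip: not a "weight" param
]

for name in names:
    module_name, param_name = name.rsplit(".", 1)
    is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
    is_ignored = any(matches(module_name, ign) for ign in ignore)
    print(f"{name}: {'quantize' if is_linear_weight and not is_ignored else 'skip'}")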

src/llmcompressor/entrypoints/weights_ptq/lifecycle.py

Lines changed: 7 additions & 1 deletion

@@ -15,7 +15,11 @@
     update_weight_zp_scale,
 )
 
-__all__ = ["initialize_quantized_linear", "calibrate_weights", "compress_module"]
+__all__ = [
+    "initialize_quantized_linear",
+    "calibrate_weights",
+    "compress_module",
+]
 
 
 def initialize_quantized_linear(

@@ -58,6 +62,8 @@ def compress_module(module: torch.nn.Linear):
         global_scale=getattr(module, "weight_global_scale", None),
     )
 
+    # `compress_weight` is a messy api
+    delattr(module, "weight")
     for key, value in data.items():
         if hasattr(module, key):
             getattr(module, key).data = value
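Dropping the original `weight` parameter before writing the compressed tensors keeps the stale fp weight from shadowing the packed representation. A minimal sketch of the pattern; the tensor names, shapes, and the else-branch registration here are hypothetical illustrations, not the compressor's actual output:

import torch

module = torch.nn.Linear(64, 64)
data = {  # hypothetical compressed output
    "weight_packed": torch.zeros(64, 32, dtype=torch.int8),
    "weight_scale": torch.ones(64, 4),
}

delattr(module, "weight")  # remove the now-stale fp parameter
for key, value in data.items():
    if hasattr(module, key):
        getattr(module, key).data = value
    else:  # illustrative: attach tensors the module does not yet carry
        module.register_parameter(
            key, torch.nn.Parameter(value, requires_grad=False)
        )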

tests/llmcompressor/pipelines/test_ptq_weights.py

Lines changed: 18 additions & 2 deletions

@@ -11,6 +11,20 @@
 from tests.testing_utils import requires_gpu
 
 
+def _get_tiny_w4a16_quant():
+    return QuantizationScheme(
+        targets=["Linear"],
+        weights=QuantizationArgs(
+            num_bits=4,
+            type="int",
+            strategy="group",
+            group_size=16,
+            symmetric=True,
+            dynamic=False,
+        ),
+    )
+
+
 def _get_tiny_block_quant():
     return QuantizationScheme(
         targets=["Linear"],

@@ -26,10 +40,12 @@ def _get_tiny_block_quant():
 
 
 @requires_gpu
-@pytest.mark.parametrize("scheme", ["FP8_dynamic", _get_tiny_block_quant()])
+@pytest.mark.parametrize(
+    "scheme", [_get_tiny_w4a16_quant(), "FP8_dynamic", _get_tiny_block_quant()]
+)
 def test_weights_ptq_e2e(scheme, tmp_path):
     model = "nm-testing/tinysmokellama-3.2"
-    ignore = ["model.embed_tokens", "lm_head", "re:.*norm$"]
+    ignore = ["model.embed_tokens", "lm_head"]
     device = "cuda:0"
 
     ptq_outdir = tmp_path / "weights_out"
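The new W4A16 case exercises symmetric int4 weights with one scale per group of 16 input channels. A back-of-the-envelope sketch (not llmcompressor code; the absmax/7 scale is one common symmetric-int4 convention) of the quantize/dequantize round trip that scheme implies:

import torch

w = torch.randn(8, 64)            # (out_features, in_features)
groups = w.reshape(8, -1, 16)     # group_size=16 along the input dim
scale = groups.abs().amax(dim=-1, keepdim=True).clamp_min(1e-8) / 7
q = torch.clamp(torch.round(groups / scale), -8, 7)  # int4 range
w_hat = (q * scale).reshape(8, 64)
print("max abs error:", (w - w_hat).abs().max().item())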
