fixes

HDCharles · HDCharles · commit 0f265f9b7b6e · 2025-12-03T18:45:09.000Z
Summary

Signed-off-by: HDCharles <charlesdavidhernandez@gmail.com>
diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
@@ -10,12 +10,13 @@
     match_modules_set,
     match_named_modules,
     update_offload_parameter,
+    get_lowest_common_ancestor_name,
 )
 from loguru import logger
 from pydantic import ConfigDict, PrivateAttr, model_validator
 from torch.nn import Module
 from tqdm import tqdm
-
+from torch.utils._pytree import tree_flatten
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
 from llmcompressor.modifiers.awq.mappings import (
@@ -332,12 +333,20 @@ def _set_resolved_mappings(self, model: Module) -> None:
             module_to_name[module] = name
 
         for mapping in self.mappings:
-            target_patterns = (mapping.smooth_layer, *mapping.balance_layers)
-
-            for smooth_layer, *balance_layers in match_modules_set(
-                model, target_patterns, self.ignore
+            for smooth_layers, *nested_balance_layers in match_modules_set(
+                model, (mapping.smooth_layer, *mapping.balance_layers), self.ignore
             ):
+                assert len(smooth_layers)==1, (
+                    "AWQ mappings need to match a single smoothlayer for each mapping but got "
+                    f"{[module_to_name.get(smooth_layer) for smooth_layer in smooth_layers]} "
+                    f"when matching {mapping.smooth_layer}"
+                )
+                smooth_layer = smooth_layers[0]
                 smooth_name = module_to_name.get(smooth_layer)
+
+                #[[b00, b01, b02...], [b10, b11, b12,...], ...] v
+                #                             [b00, b01, b02, ..., b10, b11, b12, ...]
+                balance_layers = tree_flatten(nested_balance_layers)[0]
                 balance_names = [
                     module_to_name.get(balance_layer)
                     for balance_layer in balance_layers
@@ -361,16 +370,17 @@ def _set_resolved_mappings(self, model: Module) -> None:
                     continue
                 else:
                     # for multiple balance layers, find lowest common parent
-                    parent_name, parent = get_lowest_common_module(balance_names, model)
+                    ancestor_name = get_lowest_common_ancestor_name(balance_names)
+                    ancestor, ancestor_name = get_lowest_non_module_list_ancestor(ancestor_name, )
 
                 resolved_mappings.append(
                     ResolvedMapping(
                         smooth_name,
                         smooth_layer,
                         balance_layers,
                         balance_names=balance_names,
-                        parent=parent,
-                        parent_name=parent_name,
+                        parent=ancestor,
+                        parent_name=ancestor_name,
                     )
                 )
         self._resolved_mappings = resolved_mappings
@@ -795,45 +805,25 @@ def _accumulate_mean(
     return (prev_sum + sum_added) / new_count, new_count
 
 
-def get_lowest_common_module(names: list[str], module: Module) -> tuple[str, Module]:
+def get_lowest_non_module_list_ancestor(name, module: Module) -> tuple[str, Module]:
     """
-    Given a list of names, returns the lowest-scope common module.
+    Given a dotted name (foo.bar.baz), finds lowest ancestor that's not a ModuleList
+    e.g. module_list.module_dict.module_list -> module_list.module_dict
+    e.g. module_list.module_dict -> module_list.module_dict
+    (self is an ancestor of self; `name` is always resolved relative to `module`)
 
-    NOTE: function excludes modules of type ModuleList, which don't play
+    NOTE: This is needed because ModuleLists don't play
     nicely with hooks because their forward method is never directly
     called for MoE models. See Qwen3MoeSparseMoeBlock for example, experts
     are selected based on router output and their forward method is called.
     https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py#L233
 
     Returns name of module and pointer to module
-
-    Implementation is a small alteration of os.path.commonprefix
-    https://docs.python.org/3/library/os.path.html#os.path.commonprefix
     """
-    # adding "." before and after allows for handling a lot of corner
-    # cases which were previously mishandled ([case]->prefix->result)
-    # case 0: single module: [.abc.] -> .abc. -> abc
-    # case 1: substring modules: [.abc., .ab.] -> .ab -> ""
-    # case 2: parent & child: [.ab., .ab.a.] -> .ab. -> ab
-    s1 = min(names) + "."
-    s2 = max(names) + "."
-
-    # 1) find longest shared prefix
-    parent_name = "."
-    for i, c in enumerate(s1):
-        if c != s2[i]:
-            break
-        parent_name += c
-
-    # 2) throw away module name fragment and leading dot
-    # ".keep.thro" -> "keep"
-    parent_name = parent_name[1 : parent_name.rfind(".")]
-
-    # 3) return first common module that is not a module list
     while True:
-        if parent_name == "":
+        if name == "":
             return "", module
-        parent = get_layer_by_name(parent_name, module)
-        if not isinstance(parent, torch.nn.ModuleList):
-            return parent_name, parent
-        parent_name = ".".join(parent_name.split(".")[:-1])
+        ancestor = get_layer_by_name(name, module)  # always look up from the root
+        if not isinstance(ancestor, torch.nn.ModuleList):
+            return name, ancestor
+        name = ".".join(name.split(".")[:-1])
diff --git a/src/llmcompressor/modifiers/transform/spinquant/base.py b/src/llmcompressor/modifiers/transform/spinquant/base.py
@@ -9,6 +9,7 @@
     TransformScheme,
     apply_transform_config,
 )
+from torch.utils._pytree import tree_flatten
 from compressed_tensors.utils import TorchDtype, get_head_dim
 from pydantic import Field, ValidationInfo, field_validator
 from transformers import PreTrainedModel
@@ -203,8 +204,10 @@ def _fuse_norms(self, model: PreTrainedModel):
         for mapping in self.norm_mappings:
             for norm, *linears in match_modules_set(
                 model, (mapping.norm, *mapping.linears)
-            ):
-                fuse_norm_linears(norm, linears)
+            ): 
+                # match_modules_set returns a list of lists
+                assert len(norm) == 1
+                fuse_norm_linears(norm[0], tree_flatten(linears)[0])
 
     def _create_r1_scheme(self) -> TransformScheme:
         return TransformScheme(
diff --git a/tests/llmcompressor/modifiers/awq/test_base.py b/tests/llmcompressor/modifiers/awq/test_base.py
@@ -5,7 +5,7 @@
 from torch.nn import Linear
 
 from llmcompressor.modifiers.awq import AWQMapping, AWQModifier
-from llmcompressor.modifiers.awq.base import get_lowest_common_module
+from llmcompressor.modifiers.awq.base import get_lowest_non_module_list_ancestor
 from llmcompressor.modifiers.factory import ModifierFactory
 
 
@@ -47,11 +47,18 @@ def test_set_resolved_mappings():
             "o_proj": Linear(4, 4),
         }
     )
-    mlp = torch.nn.ModuleDict(
-        {
-            "up_proj": Linear(4, 10),
-            "down_proj": Linear(10, 4),
-        }
+    mlp = torch.nn.ModuleList(
+        "experts": torch.nn.ModuleList(
+            [
+                torch.nn.ModuleDict(
+                    {
+                        "gate_proj": Linear(4, 2),
+                        "down_proj": Linear(4, 2),
+                    }
+                )
+                for _ in range(3)
+            ]
+        )
     )
     model = torch.nn.ModuleDict(
         {
@@ -83,8 +90,8 @@ def test_set_resolved_mappings():
             assert set(mapping.balance_names) == {"decoder.self_attn.o_proj"}
             assert mapping.parent_name == "decoder.self_attn.o_proj"
         if "mlp.up_proj" in mapping.smooth_name:
-            assert set(mapping.balance_names) == {"decoder.mlp.down_proj"}
-            assert mapping.parent_name == "decoder.mlp.down_proj"
+            assert set(mapping.balance_names) == {"decoder.mlp.0.down_proj", "decoder.mlp.0.down_proj", "decoder.mlp.0.down_proj"}
+            assert mapping.parent_name == "decoder.mlp.down_proj" # TODODODO
 
     awq = AWQModifier(
         mappings=[
@@ -193,72 +200,35 @@ def test_validate():
 
 
 @pytest.mark.unit
-def test_get_lowest_common_module():
-    mlp = torch.nn.ModuleDict(
+def test_get_lowest_non_module_list_ancestor():
+    model = torch.nn.ModuleDict(
         {
             "experts": torch.nn.ModuleList(
                 [
                     torch.nn.ModuleDict(
                         {
                             "gate_proj": Linear(4, 2),
-                            "down_proj": Linear(4, 2),
+                            "down_proj": Linear(2, 4),
                         }
                     )
                     for _ in range(10)
                 ]
             )
         }
     )
-    self_attn = torch.nn.ModuleDict(
-        {
-            "q_proj": Linear(4, 2),
-            "k_proj": Linear(4, 2),
-            "v_proj": Linear(4, 2),
-            "o_proj": Linear(4, 4),
-        }
+
+    ancestor_name, ancestor = get_lowest_non_module_list_ancestor(
+        "", model
     )
-    model = torch.nn.ModuleDict(
-        {
-            "embed_tokens": Linear(4, 2),
-            "decoder": torch.nn.ModuleDict(
-                {
-                    "self_attn": self_attn,
-                    "mlp": mlp,
-                }
-            ),
-        }
-    )
-
-    parent_name, parent = get_lowest_common_module(
-        ["decoder.mlp.experts.1.gate_proj", "decoder.mlp.experts.4.down_proj"], model
-    )
-    assert parent_name == "decoder.mlp" and parent == mlp
-
-    parent_name, parent = get_lowest_common_module(
-        ["decoder.self_attn.q_proj", "decoder.self_attn.v_proj"], model
-    )
-    assert parent_name == "decoder.self_attn" and parent == self_attn
+    assert ancestor_name == "" and ancestor == model
 
-    parent_name, parent = get_lowest_common_module(
-        ["decoder.mlp.experts.1.gate_proj", "decoder.self_attn.v_proj"], model
+    ancestor_name, ancestor = get_lowest_non_module_list_ancestor(
+        "experts", model
     )
-    assert parent_name == "decoder" and parent == model["decoder"]
+    assert ancestor_name == "" and ancestor == model
 
-    parent_name, parent = get_lowest_common_module(
-        ["embed_tokens", "decoder.self_attn.v_proj"], model
+    ancestor_name, ancestor = get_lowest_non_module_list_ancestor(
+        "experts.1.gate_proj", model
     )
-    assert parent_name == "" and parent == model
+    assert ancestor_name == "experts.1.gate_proj" and ancestor == model["experts"][1]["gate_proj"]
 
-    m = torch.nn.ModuleDict(
-        {
-            "abc": Linear(3, 3),
-            "ab": torch.nn.ModuleDict({"a": Linear(3, 3)}),
-            "z": Linear(3, 3),
-        }
-    )
-    parent_name, parent = get_lowest_common_module(["abc", "ab"], m)
-    assert parent_name == ""
-    parent_name, parent = get_lowest_common_module(["ab", "ab.a"], m)
-    assert parent_name == "ab"
-    parent_name, parent = get_lowest_common_module(["z"], m)
-    assert parent_name == "z"