Commit 20f0e55

Merge branch 'main' into issue-1927-modernize-transformers
2 parents: 1a8222a + ce01712

16 files changed: +252, -227 lines changed

setup.py

Lines changed: 1 addition & 2 deletions

@@ -144,8 +144,7 @@ def localversion_func(version: ScmVersion) -> str:
             if BUILD_TYPE == "release"
             else "compressed-tensors>=0.12.3a2"
         ),
-        # TODO: replace it with the release version
-        ("auto_round @ git+https://github.com/intel/auto-round.git@llmc"),
+        ("auto-round==0.9.1"),
     ],
     extras_require={
         "dev": [

src/llmcompressor/entrypoints/utils.py

Lines changed: 1 addition & 1 deletion

@@ -29,12 +29,12 @@
 from llmcompressor.pytorch.model_load.helpers import parse_dtype
 from llmcompressor.transformers.compression.compressed_tensors_utils import (
     modify_save_pretrained,
-    untie_word_embeddings,
 )
 from llmcompressor.transformers.utils.helpers import (
     is_model_ct_quantized_from_path,
 )
 from llmcompressor.typing import Processor
+from llmcompressor.utils import untie_word_embeddings
 from llmcompressor.utils.fsdp.helpers import is_fsdp_model

src/llmcompressor/modifiers/autoround/base.py

Lines changed: 8 additions & 11 deletions

@@ -20,10 +20,8 @@
 from llmcompressor.modifiers import Modifier
 from llmcompressor.modifiers.quantization.calibration import apply_calibration_status
 from llmcompressor.modifiers.quantization.quantization import QuantizationMixin
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_if_target_shared_embedding,
-)
-from llmcompressor.utils.pytorch.module import get_no_split_params
+from llmcompressor.utils import targets_embeddings, untie_word_embeddings
+from llmcompressor.utils.pytorch import get_no_split_params

 __all__ = ["AutoRoundModifier"]

@@ -107,9 +105,9 @@ class AutoRoundModifier(Modifier, QuantizationMixin):
     # AutoRound modifier arguments
     iters: int = 200
     enable_torch_compile: bool = True
+    batch_size: int = 8

     # private variables
-    _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict)
     _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict)
     _q_input: Optional[torch.Tensor] = PrivateAttr(default=None)

@@ -124,10 +122,6 @@ def on_initialize(self, state: State, **kwargs) -> bool:
         QuantizationMixin.initialize_quantization(self, state.model)

         # prepare module names
-        self._module_names = {
-            m: name
-            for name, m in match_named_modules(state.model, self.targets, self.ignore)
-        }
         self._add_temporary_names(state.model)
         # freeze all model parameters
         for _, param in state.model.named_parameters():

@@ -142,7 +136,9 @@ def start_calibration(self, model: torch.nn.Module):

         :param model: model to prepare for calibration
         """
-        untie_if_target_shared_embedding(model, self._module_names.values())
+        targets = match_named_modules(model, self.targets, self.ignore)
+        if targets_embeddings(model, targets):
+            untie_word_embeddings(model)

         for _, module in match_named_modules(model, self.targets, self.ignore):
             # Note: No need to register observers for auto-round

@@ -223,6 +219,7 @@ def apply_autoround(self, state, subgraph):
             scheme=ar_quant_scheme,
             iters=self.iters,
             enable_torch_compile=self.enable_torch_compile,
+            batch_size=self.batch_size,
         )
         # TODO: configure layer-wise config based on self.resolved_config
         ar.configure_layer_config(enable_gguf_official_mixed=False)

@@ -236,7 +233,7 @@ def apply_autoround(self, state, subgraph):
             block=decoding_layer,
             inputs=cur_inputs,
             q_input=self._q_input,
-            device=device,
+            device=str(device),
             # Leave offload for LLMC
             auto_offload=False,
         )

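The pattern introduced here replaces the old untie_if_target_shared_embedding helper: match the configured targets, ask whether the shared input/output embedding is among them, and only then untie. A minimal usage sketch of the relocated helpers (the checkpoint, target list, and the match_named_modules import path are illustrative assumptions, not taken from this commit):

    # Sketch only: the model id and target list are illustrative, and
    # match_named_modules is assumed importable from compressed_tensors.utils,
    # mirroring how the modifiers call it.
    from compressed_tensors.utils import match_named_modules
    from transformers import AutoModelForCausalLM

    from llmcompressor.utils import targets_embeddings, untie_word_embeddings

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

    # Same shape as AutoRoundModifier.start_calibration after this change.
    targets = match_named_modules(model, ["Linear"], [])
    if targets_embeddings(model, targets):
        # lm_head is matched and (for this model) shares its weight with the
        # input embedding, so the weights are untied before calibration.
        untie_word_embeddings(model)
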
src/llmcompressor/modifiers/awq/mappings.py

Lines changed: 1 addition & 0 deletions

@@ -166,6 +166,7 @@ class AWQMapping:
     "Llama4ForConditionalGeneration": _default_mappings,
     "Mistral3ForConditionalGeneration": _default_mappings,
     "MistralForCausalLM": _default_mappings,
+    "Olmo3ForCausalLM": _exaone4_mappings,
     "Phi3ForCausalLM": _phi_mappings,
     "Phi3VForCausalLM": _phi_mappings,
     "Qwen2ForCausalLM": _default_mappings,

src/llmcompressor/modifiers/quantization/quantization/mixin.py

Lines changed: 4 additions & 8 deletions

@@ -34,9 +34,7 @@
     reset_quantization_status,
 )
 from llmcompressor.modifiers.utils.hooks import HooksMixin
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_if_target_shared_embedding,
-)
+from llmcompressor.utils import targets_embeddings, untie_word_embeddings

 __all__ = ["QuantizationMixin"]

@@ -182,11 +180,9 @@ def start_calibration(self, model: torch.nn.Module):

         :param model: model to prepare for calibration
         """
-
-        matched_module_generator = (
-            x[1] for x in match_named_modules(model, self.resolved_targets, self.ignore)
-        )
-        untie_if_target_shared_embedding(model, matched_module_generator)
+        targets = match_named_modules(model, self.resolved_targets, self.ignore)
+        if targets_embeddings(model, targets):
+            untie_word_embeddings(model)

         for _, module in match_named_modules(model, self.resolved_targets, self.ignore):
             self._initialize_observers(module)

src/llmcompressor/modifiers/transform/quip/base.py

Lines changed: 17 additions & 12 deletions

@@ -12,9 +12,8 @@

 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_if_target_shared_embedding,
-)
+from llmcompressor.typing import NamedModules
+from llmcompressor.utils import targets_embeddings, untie_word_embeddings

 __all__ = ["QuIPModifier"]

@@ -102,18 +101,13 @@ def on_initialize(self, state: State, **kwargs) -> bool:

     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
-
-        def matched_module_generator():
-            for scheme in self.transform_config.config_groups.values():
-                for arg in scheme.apply:
-                    gen = match_named_modules(state.model, arg.targets, arg.ignore)
-                    for _, module in gen:
-                        yield module
+        model = state.model

         # Untie embeddings if they will be targeted by transforms
-        untie_if_target_shared_embedding(state.model, matched_module_generator())
+        if targets_embeddings(model, self._get_targets(model)):
+            untie_word_embeddings(model)

-        apply_transform_config(state.model, self.transform_config)
+        apply_transform_config(model, self.transform_config)

     def on_event(self, state: State, event: Event, **kwargs):
         if event.type_ == EventType.CALIBRATION_EPOCH_START:

@@ -136,6 +130,17 @@ def on_finalize(self, state: State, **kwargs) -> bool:

         return True

+    def _get_targets(self, model: torch.nn.Module) -> NamedModules:
+        if not self.initialized_:
+            raise ValueError("Cannot get targets before modifier has been initialized")
+
+        return [
+            (name, module)
+            for scheme in self.transform_config.config_groups.values()
+            for arg in scheme.apply
+            for name, module in match_named_modules(model, arg.targets, arg.ignore)
+        ]
+
     def _create_config(self) -> TransformConfig:
         config_groups = dict()
         if "v" in self.rotations:

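The new _get_targets helper flattens every transform scheme's apply args into (name, module) pairs and feeds them to targets_embeddings. Neither the NamedModules alias nor the helper's body appears in this commit; a hedged sketch of what they plausibly look like, based on the untie_if_target_shared_embedding logic removed further down:

    # Assumed shapes only -- the real definitions live in llmcompressor.typing
    # and llmcompressor.utils and are not shown in this diff.
    from typing import Iterable, Tuple

    import torch

    NamedModules = Iterable[Tuple[str, torch.nn.Module]]

    def targets_embeddings_sketch(model: torch.nn.Module, targets: NamedModules) -> bool:
        # Mirrors the removed check: only report True when the model's shared
        # input/output embedding actually appears among the matched targets.
        input_embed = model.get_input_embeddings()
        output_embed = model.get_output_embeddings()
        if input_embed.weight is not output_embed.weight:
            return False  # not tied, nothing needs untying
        return any(module in (input_embed, output_embed) for _, module in targets)
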
src/llmcompressor/modifiers/transform/spinquant/base.py

Lines changed: 20 additions & 8 deletions

@@ -16,9 +16,8 @@
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modeling import center_embeddings, fuse_norm_linears
 from llmcompressor.modifiers import Modifier
-from llmcompressor.transformers.compression.compressed_tensors_utils import (
-    untie_word_embeddings,
-)
+from llmcompressor.typing import NamedModules
+from llmcompressor.utils import untie_word_embeddings

 from .mappings import SpinQuantMapping, infer_mapping_from_model
 from .norm_mappings import NormMapping, infer_norm_mapping_from_model

@@ -151,14 +150,16 @@ def on_initialize(self, state: State, **kwargs) -> bool:
     @torch.no_grad()
     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
+        model = state.model
+
+        # untie embeddings to avoid unintended effects of `_center_embeddings`
+        untie_word_embeddings(model)

-        # needed any time embeddings/lm_head is modified
-        untie_word_embeddings(state.model)
         # needs to happen after the model has been hooked to execute on the GPU
         # otherwise we're applying weight transforms on CPU
-        self._center_embeddings(state.model)
-        self._fuse_norms(state.model)
-        apply_transform_config(state.model, self.transform_config)
+        self._center_embeddings(model)
+        self._fuse_norms(model)
+        apply_transform_config(model, self.transform_config)

     def on_event(self, state: State, event: Event, **kwargs):
         if event.type_ == EventType.CALIBRATION_EPOCH_START:

@@ -181,6 +182,17 @@ def on_finalize(self, state: State, **kwargs) -> bool:

         return True

+    def _get_targets(self, model: torch.nn.Module) -> NamedModules:
+        if not self.initialized_:
+            raise ValueError("Cannot get targets before modifier has been initialized")
+
+        return [
+            (name, module)
+            for scheme in self.transform_config.config_groups.values()
+            for arg in scheme.apply
+            for name, module in match_named_modules(model, arg.targets, arg.ignore)
+        ]
+
     def _center_embeddings(self, model: PreTrainedModel):
         for _, embedding in match_named_modules(
             model, [self.mappings.embedding], warn_on_fail=True

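SpinQuant still unties unconditionally in on_start (the comment now explains it as protecting _center_embeddings), so recipes that rotate the embedding/lm_head path need no extra configuration. A hedged end-to-end sketch in the style of the repository's transform examples; the modifier arguments follow those examples and should be treated as assumptions:

    # Sketch based on the public transform examples; argument names are assumed.
    from transformers import AutoModelForCausalLM

    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier
    from llmcompressor.modifiers.transform import SpinQuantModifier

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

    recipe = [
        # on_start unties the word embeddings before centering/fusing/rotating
        SpinQuantModifier(rotations=["R1", "R2"], transform_type="hadamard"),
        QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    ]

    oneshot(model=model, recipe=recipe)
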
src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 1 addition & 1 deletion

@@ -144,7 +144,7 @@ def load_safetensors_state_dict(file_path: str) -> Dict[str, torch.Tensor]:
 def copy_python_files_from_model_cache(model, save_path: str):
     config = model.config
     cache_path = None
-    if hasattr(config, "_name_or_path"):
+    if hasattr(config, "_name_or_path") and len(config._name_or_path.strip()) > 0:
         import os
         import shutil

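The added length check matters because configs constructed in code (rather than loaded with from_pretrained) default _name_or_path to an empty string, which previously fell through into the cache-lookup branch. A small illustration (gpt2's config class is used purely as an example):

    # Configs built directly default _name_or_path to "", so the stricter check
    # now skips copy_python_files_from_model_cache's cache lookup for them.
    from transformers import GPT2Config, GPT2LMHeadModel

    model = GPT2LMHeadModel(GPT2Config())
    print(repr(model.config._name_or_path))  # ''
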
src/llmcompressor/transformers/compression/compressed_tensors_utils.py

Lines changed: 1 addition & 118 deletions

@@ -1,16 +1,12 @@
 import os
 import weakref
-from collections.abc import Generator
 from functools import wraps

 import torch
 from accelerate.accelerator import get_state_dict_offloaded_model
 from compressed_tensors import (
     ModelCompressor,
     SparsityCompressionConfig,
-    delete_offload_parameter,
-    has_offloaded_params,
-    register_offload_parameter,
 )
 from compressed_tensors.config import CompressionFormat
 from loguru import logger

@@ -24,7 +20,7 @@
 from llmcompressor.transformers.utils import RECIPE_FILE_NAME
 from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path

-__all__ = ["modify_save_pretrained", "untie_word_embeddings"]
+__all__ = ["modify_save_pretrained"]


 def modify_save_pretrained(model: PreTrainedModel):

@@ -117,119 +113,6 @@ def save_pretrained_wrapper(
     model.save_pretrained = save_pretrained_compressed(model.save_pretrained)


-def untie_word_embeddings(model: PreTrainedModel):
-    """
-    Patches bug where HF transformers will fail to untie weights under specific
-    circumstances (https://github.com/huggingface/transformers/issues/33689).
-
-    This function detects those cases and unties the tensors if applicable
-
-    :param model: model to fix
-    """
-    try:
-        input_embed = model.get_input_embeddings()
-        output_embed = model.get_output_embeddings()
-    except NotImplementedError as e:
-        logger.warning(
-            f"cannot untie model of type {model.__class__} which doesn't have "
-            f"get_input_embeddings and get_output_embeddings implmented\n{e}"
-        )
-        return
-
-    for module in (input_embed, output_embed):
-        if module is None or not hasattr(module, "weight"):
-            logger.warning(f"Cannot untie {module} which does not have weight param")
-            continue
-
-        # this could be replaced by a `get_offloaded_parameter` util
-        if not has_offloaded_params(module):
-            untied_data = module.weight.data.clone()
-        else:
-            untied_data = module._hf_hook.weights_map["weight"].clone()
-
-        requires_grad = module.weight.requires_grad
-        new_parameter = torch.nn.Parameter(untied_data, requires_grad=requires_grad)
-        delete_offload_parameter(module, "weight")
-        register_offload_parameter(module, "weight", new_parameter)
-
-    if hasattr(model.config, "tie_word_embeddings"):
-        model.config.tie_word_embeddings = False
-
-
-def _get_embeddings_or_warn(
-    model: torch.nn.Module,
-) -> tuple[torch.nn.Module | None, torch.nn.Module | None]:
-    if not (
-        hasattr(model, "get_input_embeddings")
-        and hasattr(model, "get_output_embeddings")
-    ):
-        logger.warning(
-            f"{model.__class__} doesn't have attribute get_input_embeddings and"
-            " get_output_embeddings implemented."
-            "\nThis can cause"
-            " problems when quantizing layers with shared weights"
-        )
-        return None, None
-
-    try:
-        input_embeddings, output_embeddings = (
-            model.get_input_embeddings(),
-            model.get_output_embeddings(),
-        )
-    except NotImplementedError as e:
-        logger.warning(
-            f"{model.__class__} doesn't have get_input_embeddings and "
-            "get_output_embeddings implemented."
-            "\nThis can cause"
-            " problems when quantizing layers with shared weights"
-            f"\n{e}"
-        )
-        return None, None
-
-    if not (
-        isinstance(input_embeddings, torch.nn.Module)
-        and isinstance(output_embeddings, torch.nn.Module)
-    ):
-        logger.warning(
-            f"expected modules from {model.__class__} get_input_embeddings and"
-            f" get_output_embeddings but got {type(input_embeddings)}"
-            f" and {type(output_embeddings)}."
-            "\nThis can cause"
-            " problems when quantizing layers with shared weights"
-        )
-        return None, None
-    return input_embeddings, output_embeddings
-
-
-def untie_if_target_shared_embedding(
-    model: torch.nn.Module, matched_module_generator: Generator[torch.nn.Module]
-):
-    """
-    Helper method that checks for shared input/output embedding and unties them
-    if either shows up in the matched_module_generator
-
-    :param model: model to untie if embeddings are shared and targeted by
-        matched_module_generator
-    :param matched_module_generator: Generator of all modules (not names) which
-        will be modified by quantization or transformation
-    """
-    input_embeddings, output_embeddings = _get_embeddings_or_warn(model)
-
-    if None in (input_embeddings, output_embeddings):  # if couldn't find embeddings
-        return
-
-    if (
-        input_embeddings.weight is not output_embeddings.weight
-    ):  # if not shared, can ignore
-        return
-
-    # if shared, check if either is targeted
-    for module in matched_module_generator:
-        if module in (input_embeddings, output_embeddings):
-            untie_word_embeddings(model)
-            return
-
-
 def get_model_compressor(
     model: torch.nn.Module,
     sparsity_config: SparsityCompressionConfig | None = None,

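untie_word_embeddings itself now lives in llmcompressor.utils (that is how every other file in this commit imports it). Assuming the relocated helper keeps the behavior of the body removed above, its observable effect is sketched below; gpt2 is just a convenient tied-weights model:

    # Behavioral sketch of the relocated helper, based on the removed body above.
    from transformers import AutoModelForCausalLM

    from llmcompressor.utils import untie_word_embeddings

    model = AutoModelForCausalLM.from_pretrained("gpt2")  # ties wte and lm_head
    assert model.get_input_embeddings().weight is model.get_output_embeddings().weight

    untie_word_embeddings(model)

    # Each embedding now owns an independent copy of the weight, and the config
    # flag is cleared so the tie is not re-applied on save/reload.
    assert model.get_input_embeddings().weight is not model.get_output_embeddings().weight
    assert model.config.tie_word_embeddings is False
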
src/llmcompressor/transformers/utils/helpers.py

Lines changed: 6 additions & 1 deletion

@@ -57,7 +57,12 @@ def infer_recipe_from_model_path(model_path: str | Path) -> str | None:
         - Hugging face model ID
     :return: The path to the recipe file if found, None otherwise.
     """
-    model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path
+    model_path = (
+        model_path.as_posix() if isinstance(model_path, Path) else model_path.strip()
+    )
+    if model_path == "":
+        logger.debug("got path_or_name=<empty string>" "unable to find recipe")
+        return None

     if os.path.isdir(model_path) or os.path.isfile(model_path):
         # Model path is a local path to the model directory or file

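With the new guard, a blank or whitespace-only model path is logged and returns None immediately instead of being probed as a local directory or a Hub repo id. Sketch of the new behavior:

    # New early return for blank paths (behavior per the hunk above).
    from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path

    assert infer_recipe_from_model_path("") is None
    assert infer_recipe_from_model_path("   ") is None
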