Skip to content
6 changes: 5 additions & 1 deletion auto_round/compressors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def __init__(
config: Optional[AutoConfig] = None
try:
config = AutoConfig.from_pretrained(model, trust_remote_code=self.trust_remote_code)
except (OSError, EnvironmentError) as e:
except (OSError, EnvironmentError, ValueError) as e:
logger.debug(
"Failed to load config via AutoConfig.from_pretrained for %s: %s. "
"Proceeding without config-based checks.",
Expand Down Expand Up @@ -399,6 +399,10 @@ def __init__(
torch.use_deterministic_algorithms(True, warn_only=True)

self.to_quant_block_names = to_quant_block_names
if self.to_quant_block_names is None:
_hint = getattr(model, "_autoround_to_quant_block_names", None)
if _hint is not None:
self.to_quant_block_names = _hint
if not hasattr(self, "quant_block_list"):
all_blocks = get_block_names(model)
self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names)
Expand Down
5 changes: 4 additions & 1 deletion auto_round/modeling/unfused_moe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,10 @@ def get_file_path_via_model_name(model_or_path: str, file_name):

def pre_check_config(model_name: str | torch.nn.Module, trust_remote_code: bool = True):
if isinstance(model_name, str):
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
try:
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
except (OSError, EnvironmentError, ValueError):
return False
elif isinstance(model_name, torch.nn.Module):
config = getattr(model_name, "config", None)
if config is None:
Expand Down
53 changes: 53 additions & 0 deletions auto_round/special_model_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"qwen2_5_omni",
"qwen3_omni_moe",
"gemma3",
"bagel",
]

NOT_SUPPORT_ONLY_TEXT_MODELS = ["mllama", "mistral3_2"]
Expand Down Expand Up @@ -198,11 +199,35 @@ def _get_glm_image_multimodal_block(model, quant_vision=False):
return block_names


def _get_bagel_multimodal_block(model, quant_vision=False):
"""Get block names for BAGEL not (Mixture of Transformers) model.

BAGEL model structure:
- language_model.model.layers: Qwen2-based LLM with not dual paths
- vit_model: SigLIP vision encoder (not quantized by default)
- connector: Vision-language MLP connector
- encoder/decoder: VAE autoencoder
- time_embedder, vae2llm, llm2vae: bridge modules

By default, only the language_model layers are quantized.
"""
block_names = []

if hasattr(model, "language_model") and hasattr(model.language_model, "model"):
if hasattr(model.language_model.model, "layers"):
block_names.append(
[f"language_model.model.layers.{i}" for i in range(len(model.language_model.model.layers))]
)

return block_names


# Registry of per-architecture block-discovery helpers for multimodal models.
# Keys are model-type identifiers (presumably matched against the HF config's
# model_type -- confirm against the caller); values are callables taking
# (model, quant_vision) and returning the block-name lists to quantize.
SPECIAL_MULTIMODAL_BLOCK = {
    "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block,
    "qwen2_5_omni": _get_qwen2_5_omni_multimodal_block,
    "qwen3_omni_moe": _get_qwen3_omni_moe_multimodal_block,
    "glm_image": _get_glm_image_multimodal_block,
    "bagel": _get_bagel_multimodal_block,
}


Expand Down Expand Up @@ -575,6 +600,34 @@ def get_glm_flash_ignore_layers(model) -> list[str]:
)


def get_bagel_ignore_layers(model) -> list[str]:
    """Keep BAGEL generation-path modules in FP16.

    BAGEL uses `*_moe_gen` modules for the image-generation path. Quantizing
    them causes quality to collapse during the iterative denoising loop.
    The shared attention projections are also highly sensitive.

    Args:
        model: unused; present so this callable matches the registry's
            ``fn(model) -> list[str]`` interface.

    Returns:
        Substring patterns naming the layers to leave unquantized.
    """
    attention_projections = [f"self_attn.{name}_proj" for name in ("q", "k", "v", "o")]
    return ["moe_gen", *attention_projections]


# Register the BAGEL ignore rule: any model whose type matches "bagel" keeps
# the generation-path (`moe_gen`) and attention-projection layers unquantized.
register_ignore_layers(
    matchers=[
        # NOTE(review): mode="full" presumably requires a full-string match of
        # the model type rather than a substring search -- confirm against
        # ModelTypeMatcher's semantics.
        ModelTypeMatcher(r"bagel", mode="full"),
    ],
    ignore_layers=[
        # Callable resolved lazily per model, not a static list of names.
        get_bagel_ignore_layers,
    ],
)


def get_predefined_ignore_layers(model: torch.nn.Module) -> list[str]:
layers = []
for rule in _PRE_DEFINED_IGNORE_LAYERS:
Expand Down
Loading
Loading