From 8e4d261b08eb3a67cab1f7ab5ae8b1fec3ce816f Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Fri, 27 Mar 2026 20:37:12 +0800 Subject: [PATCH 1/6] Support BAGEL quantization Signed-off-by: lvliang-intel --- auto_round/compressors/base.py | 6 +- auto_round/modeling/unfused_moe/__init__.py | 5 +- auto_round/special_model_handler.py | 65 ++++ auto_round/utils/bagel_loader.py | 361 ++++++++++++++++++++ auto_round/utils/model.py | 48 +++ 5 files changed, 483 insertions(+), 2 deletions(-) create mode 100644 auto_round/utils/bagel_loader.py diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index eb7f6ffcc..6ab427033 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -283,7 +283,7 @@ def __init__( config: Optional[AutoConfig] = None try: config = AutoConfig.from_pretrained(model, trust_remote_code=self.trust_remote_code) - except (OSError, EnvironmentError) as e: + except (OSError, EnvironmentError, ValueError) as e: logger.debug( "Failed to load config via AutoConfig.from_pretrained for %s: %s. " "Proceeding without config-based checks.", @@ -397,6 +397,10 @@ def __init__( torch.use_deterministic_algorithms(True, warn_only=True) self.to_quant_block_names = to_quant_block_names + if self.to_quant_block_names is None: + _hint = getattr(model, "_autoround_to_quant_block_names", None) + if _hint is not None: + self.to_quant_block_names = _hint if not hasattr(self, "quant_block_list"): all_blocks = get_block_names(model) self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names) diff --git a/auto_round/modeling/unfused_moe/__init__.py b/auto_round/modeling/unfused_moe/__init__.py index 3b9511731..a112a2a15 100644 --- a/auto_round/modeling/unfused_moe/__init__.py +++ b/auto_round/modeling/unfused_moe/__init__.py @@ -145,7 +145,10 @@ def get_file_path_via_model_name(model_or_path: str, file_name): def pre_check_config(model_name: str | torch.nn.Module, trust_remote_code: bool = True): if isinstance(model_name, str): - config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) + try: + config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) + except (OSError, EnvironmentError, ValueError): + return False elif isinstance(model_name, torch.nn.Module): config = getattr(model_name, "config", None) if config is None: diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index b6343e048..4fa9bd54c 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -48,6 +48,7 @@ "qwen2_5_omni", "qwen3_omni_moe", "gemma3", + "bagel", ] NOT_SUPPORT_ONLY_TEXT_MODELS = ["mllama", "mistral3_2"] @@ -198,11 +199,35 @@ def _get_glm_image_multimodal_block(model, quant_vision=False): return block_names +def _get_bagel_multimodal_block(model, quant_vision=False): + """Get block names for BAGEL MoT (Mixture of Transformers) model. + + BAGEL model structure: + - language_model.model.layers: Qwen2-based LLM with MoT dual paths + - vit_model: SigLIP vision encoder (not quantized by default) + - connector: Vision-language MLP connector + - encoder/decoder: VAE autoencoder + - time_embedder, vae2llm, llm2vae: bridge modules + + By default, only the language_model layers are quantized. + """ + block_names = [] + + if hasattr(model, "language_model") and hasattr(model.language_model, "model"): + if hasattr(model.language_model.model, "layers"): + block_names.append( + [f"language_model.model.layers.{i}" for i in range(len(model.language_model.model.layers))] + ) + + return block_names + + SPECIAL_MULTIMODAL_BLOCK = { "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block, "qwen2_5_omni": _get_qwen2_5_omni_multimodal_block, "qwen3_omni_moe": _get_qwen3_omni_moe_multimodal_block, "glm_image": _get_glm_image_multimodal_block, + "bagel": _get_bagel_multimodal_block, } @@ -575,6 +600,46 @@ def get_glm_flash_ignore_layers(model) -> list[str]: ) +def get_bagel_ignore_layers(model) -> list[str]: + """Keep BAGEL generation-path modules in FP16. + + BAGEL uses `*_moe_gen` modules for the image-generation path. Quantizing + them causes quality to collapse during the iterative denoising loop. + The shared attention projections are also highly sensitive, and preserving + the top 4 transformer blocks in FP16 gave acceptable image quality in + validation runs. + """ + top_fp16_layers = 0 + + ignore_layers = [ + "moe_gen", + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.o_proj", + ] + + num_layers = 0 + if hasattr(model, "language_model") and hasattr(model.language_model, "model"): + num_layers = len(getattr(model.language_model.model, "layers", [])) + + if num_layers > 0: + for layer_idx in range(max(0, num_layers - top_fp16_layers), num_layers): + ignore_layers.append(f"language_model.model.layers.{layer_idx}") + + return ignore_layers + + +register_ignore_layers( + matchers=[ + ModelTypeMatcher(r"bagel", mode="full"), + ], + ignore_layers=[ + get_bagel_ignore_layers, + ], +) + + def get_predefined_ignore_layers(model: torch.nn.Module) -> list[str]: layers = [] for rule in _PRE_DEFINED_IGNORE_LAYERS: diff --git a/auto_round/utils/bagel_loader.py b/auto_round/utils/bagel_loader.py new file mode 100644 index 000000000..506a10263 --- /dev/null +++ b/auto_round/utils/bagel_loader.py @@ -0,0 +1,361 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Loader for BAGEL-7B-MoT (ByteDance-Seed/BAGEL-7B-MoT) model. + +BAGEL uses a Qwen2-based LLM with MoT (Mixture of Transformers) extensions. +Since transformers doesn't natively support the 'bagel' model_type, we construct +the model manually by: + 1. Building a standard Qwen2ForCausalLM from the llm_config + 2. Adding MoT generation-path modules (mlp_moe_gen, *_moe_gen projections) + 3. Loading all weights from safetensors + 4. Wrapping in BagelForQuantization for auto_round compatibility +""" + +import glob +import json +import os + +import torch +import torch.nn as nn +from transformers import AutoTokenizer, PretrainedConfig +from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM, Qwen2MLP, Qwen2RMSNorm + +from auto_round.logger import logger + + +class BagelConfig(PretrainedConfig): + """Configuration for the BAGEL model wrapper.""" + + model_type = "bagel" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +def _add_mot_extensions(language_model, llm_config): + """Add MoT (Mixture of Transformers) generation-path modules to a Qwen2 model. + + Each transformer layer gets additional modules for the generation path: + - Attention: q_proj_moe_gen, k_proj_moe_gen, v_proj_moe_gen, o_proj_moe_gen + - Attention norms: q_norm_moe_gen, k_norm_moe_gen (when qk_norm is used) + - MLP: mlp_moe_gen (full MLP duplicate) + - LayerNorms: input_layernorm_moe_gen, post_attention_layernorm_moe_gen + """ + hidden_size = llm_config.hidden_size + num_heads = llm_config.num_attention_heads + num_kv_heads = getattr(llm_config, "num_key_value_heads", num_heads) + head_dim = hidden_size // num_heads + rms_norm_eps = llm_config.rms_norm_eps + use_qk_norm = getattr(llm_config, "qk_norm", False) + + for layer in language_model.model.layers: + attn = layer.self_attn + + # QK norms for the understanding path (not present in standard Qwen2Attention) + if use_qk_norm and not hasattr(attn, "q_norm"): + attn.q_norm = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) + attn.k_norm = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) + + # MoT attention projections for generation path + attn.q_proj_moe_gen = nn.Linear(hidden_size, num_heads * head_dim, bias=True) + attn.k_proj_moe_gen = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=True) + attn.v_proj_moe_gen = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=True) + attn.o_proj_moe_gen = nn.Linear(num_heads * head_dim, hidden_size, bias=False) + + # MoT QK norms for generation path + if use_qk_norm: + attn.q_norm_moe_gen = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) + attn.k_norm_moe_gen = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) + else: + attn.q_norm_moe_gen = nn.Identity() + attn.k_norm_moe_gen = nn.Identity() + + # MoT MLP for generation path (duplicate of understanding MLP) + layer.mlp_moe_gen = Qwen2MLP(llm_config) + + # MoT LayerNorms for generation path + layer.input_layernorm_moe_gen = Qwen2RMSNorm(hidden_size, eps=rms_norm_eps) + layer.post_attention_layernorm_moe_gen = Qwen2RMSNorm(hidden_size, eps=rms_norm_eps) + + +def _build_module_from_weights(weight_dict): + """Build an nn.Module hierarchy from a flat weight dictionary. + + This creates a generic module tree that holds parameters but doesn't + define forward operations. Used for non-text modules (VAE, ViT, etc.) + that need to be saved but not executed during quantization. + """ + root = nn.Module() + + # Group weights by first path component + children = {} + for name, tensor in weight_dict.items(): + parts = name.split(".", 1) + if len(parts) == 1: + root.register_parameter(parts[0], nn.Parameter(tensor, requires_grad=False)) + else: + child_name = parts[0] + if child_name not in children: + children[child_name] = {} + children[child_name][parts[1]] = tensor + + for child_name, child_weights in children.items(): + child_module = _build_module_from_weights(child_weights) + root.add_module(child_name, child_module) + + return root + + +def _load_safetensors_weights(model_path): + """Load all weights from safetensors files in the model directory. + + BAGEL stores all weights across ae.safetensors (VAE) and ema.safetensors + (LLM + other modules), referenced by model.safetensors.index.json. + """ + from safetensors.torch import load_file + + all_weights = {} + + index_path = os.path.join(model_path, "model.safetensors.index.json") + if os.path.exists(index_path): + with open(index_path, "r", encoding="utf-8") as f: + index = json.load(f) + + weight_map = index.get("weight_map", {}) + + # Determine which shard files contain non-VAE weights + # VAE weights: decoder.*, encoder.* (in ae.safetensors) + lm_shard_files = set() + vae_only_files = set() + for weight_name, shard_file in weight_map.items(): + if weight_name.startswith(("decoder.", "encoder.")): + vae_only_files.add(shard_file) + else: + lm_shard_files.add(shard_file) + + # Load all shard files that contain non-VAE weights + loaded_files = set() + for shard_file in lm_shard_files: + if shard_file in loaded_files: + continue + sf_path = os.path.join(model_path, shard_file) + if os.path.exists(sf_path): + weights = load_file(sf_path, device="cpu") + # Only keep non-VAE weights from this file + for name, tensor in weights.items(): + if not name.startswith(("decoder.", "encoder.")): + all_weights[name] = tensor + loaded_files.add(shard_file) + else: + # Fallback: load all safetensors files except ae.safetensors + for sf_file in sorted(glob.glob(os.path.join(model_path, "*.safetensors"))): + basename = os.path.basename(sf_file) + if basename == "ae.safetensors": + continue + weights = load_file(sf_file, device="cpu") + for name, tensor in weights.items(): + if not name.startswith(("decoder.", "encoder.")): + all_weights[name] = tensor + + return all_weights + + +class BagelForQuantization(nn.Module): + """Wrapper for BAGEL model that's compatible with auto_round quantization. + + Contains the language_model (Qwen2+MoT) as the primary quantization target, + plus non-text modules (connector, vit, etc.) stored as generic parameter holders. + + The forward() delegates to language_model for text-only calibration. + """ + + def __init__(self, config, language_model, source_model_path=None): + super().__init__() + self.config = config + self.language_model = language_model + self._source_model_path = source_model_path + + def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs): + """Forward pass delegating to the language_model for text-only calibration.""" + return self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + labels=labels, + use_cache=kwargs.get("use_cache", False), + ) + + @property + def dtype(self): + return next(self.parameters()).dtype + + @property + def device(self): + return next(self.parameters()).device + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def save_pretrained(self, output_dir, **kwargs): + """Save the model in a format compatible with vllm-omni's BagelPipeline. + + Saves: + - config.json: Original BAGEL config with quantization_config added + - model weights: All parameters as safetensors + + Note: Auxiliary files (llm_config.json, vit_config.json, + preprocessor_config.json) and VAE tensors (encoder/decoder) are handled + by auto_round's _copy_extra_model_files and copy_missing_tensors_from_source. + """ + from safetensors.torch import save_file + + os.makedirs(output_dir, exist_ok=True) + + # Save config.json with the quantization_config + config_dict = self.config.to_dict() + # Remove internal PretrainedConfig fields + for key in list(config_dict.keys()): + if key.startswith("_"): + del config_dict[key] + config_dict["architectures"] = ["BagelForConditionalGeneration"] + config_dict["model_type"] = "bagel" + config_path = os.path.join(output_dir, "config.json") + with open(config_path, "w", encoding="utf-8") as f: + json.dump(config_dict, f, indent=2, ensure_ascii=False) + + # Save all model parameters as safetensors + state_dict = {} + for name, param in self.named_parameters(): + state_dict[name] = param.data.contiguous() + + # Remap weight names to match original BAGEL checkpoint format + # The BagelPipeline expects top-level names like: + # language_model.model.layers.0.self_attn.q_proj.weight + # connector.fc1.weight + # vit_model.vision_model.embeddings... + # encoder.*, decoder.* (VAE, but those are in ae.safetensors) + save_file(state_dict, os.path.join(output_dir, "model.safetensors")) + + +def load_bagel_model(model_path, torch_dtype="auto", device_map=None): + """Load a BAGEL model for quantization. + + Args: + model_path: Path to the BAGEL model directory. + torch_dtype: Data type for model weights. + device_map: Device map for model placement. + + Returns: + Tuple of (model, tokenizer). + """ + # Load configs + config_path = os.path.join(model_path, "config.json") + with open(config_path, "r", encoding="utf-8") as f: + bagel_config_dict = json.load(f) + + llm_config_dict = bagel_config_dict.get("llm_config", {}) + + # Check for separate llm_config.json + llm_config_path = os.path.join(model_path, "llm_config.json") + if os.path.exists(llm_config_path): + with open(llm_config_path, "r", encoding="utf-8") as f: + llm_config_dict = json.load(f) + + from transformers import Qwen2Config + + llm_config = Qwen2Config(**llm_config_dict) + # BAGEL always uses qk_norm + llm_config.qk_norm = True + + # Determine torch_dtype + if torch_dtype == "auto": + model_dtype_str = bagel_config_dict.get("torch_dtype", "bfloat16") + if model_dtype_str == "bfloat16": + resolved_dtype = torch.bfloat16 + elif model_dtype_str == "float16": + resolved_dtype = torch.float16 + else: + resolved_dtype = torch.float32 + else: + resolved_dtype = torch_dtype + + logger.info("Building Qwen2ForCausalLM with MoT extensions for BAGEL...") + + # Create the language model (Qwen2 + MoT extensions) + language_model = Qwen2ForCausalLM(llm_config) + _add_mot_extensions(language_model, llm_config) + + # Load all weights + logger.info(f"Loading weights from {model_path}...") + all_weights = _load_safetensors_weights(model_path) + + # Separate language_model weights from other component weights + lm_weights = {} + other_weights = {} + for name, tensor in all_weights.items(): + if name.startswith("language_model."): + lm_name = name[len("language_model."):] + lm_weights[lm_name] = tensor + else: + other_weights[name] = tensor + + # Load language_model weights + missing, unexpected = language_model.load_state_dict(lm_weights, strict=False) + if missing: + logger.warning(f"Missing keys in language_model: {len(missing)} keys") + for k in missing[:10]: + logger.warning(f" Missing: {k}") + if unexpected: + logger.warning(f"Unexpected keys in language_model: {len(unexpected)} keys") + for k in unexpected[:10]: + logger.warning(f" Unexpected: {k}") + + # Build the BAGEL config + bagel_config = BagelConfig(**{k: v for k, v in bagel_config_dict.items() + if k not in ("llm_config", "architectures")}) + bagel_config.llm_config = llm_config.to_dict() + bagel_config.architectures = ["BagelForConditionalGeneration"] + + # Create the wrapper model + model = BagelForQuantization(bagel_config, language_model, source_model_path=model_path) + + # Add non-text modules as parameter holders + # These won't be quantized but will be saved with the model + if other_weights: + non_text_module = _build_module_from_weights(other_weights) + for child_name, child_module in non_text_module.named_children(): + if not hasattr(model, child_name): + model.add_module(child_name, child_module) + # Also add direct parameters + for param_name, param in non_text_module.named_parameters(recurse=False): + if not hasattr(model, param_name): + model.register_parameter(param_name, param) + + # Convert to target dtype + model = model.to(resolved_dtype) + model.eval() + + # Set name_or_path for auto_round compatibility + model.name_or_path = model_path + model.config._name_or_path = model_path + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + logger.info( + f"BAGEL model loaded: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters, " + f"language_model has {llm_config.num_hidden_layers} layers" + ) + + return model, tokenizer diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f0aec180a..fc84d973a 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -331,6 +331,24 @@ def llm_load_model( load_kwargs["quantization_config"] = Mxfp4Config(dequantized=True) logger.info("Detected MXFP4 quantized model, using Mxfp4Config(dequantized=True) for loading.") + # BAGEL requires a custom loader (Qwen2 + MoT extensions, not in transformers) + _config_path = os.path.join(pretrained_model_name_or_path, "config.json") if os.path.isdir( + pretrained_model_name_or_path + ) else None + if _config_path and os.path.exists(_config_path): + with open(_config_path) as _f: + _mt = json.load(_f).get("model_type") + if _mt == "bagel": + from auto_round.utils.bagel_loader import load_bagel_model + + model, tokenizer = load_bagel_model( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + ) + model = _to_model_dtype(model, model_dtype) + model._autoround_to_quant_block_names = "language_model.model.layers" + return model, tokenizer + is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower())) tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code) @@ -537,6 +555,15 @@ def mllm_load_model( torch_dtype=torch_dtype, device_map="auto" if use_auto_mapping else None, ) + elif "bagel" == model_type: + from auto_round.utils.bagel_loader import load_bagel_model + + model, tokenizer = load_bagel_model( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + ) + processor = None + image_processor = None else: architectures = config["architectures"][0] if architectures == "LlavaLlamaForCausalLM": @@ -750,10 +777,28 @@ def is_pure_text_model(model): return True +# Model types that have multimodal components but should use LLM compressor +# (text-only calibration, non-text modules excluded from quantization). +_LLM_ONLY_MODEL_TYPES = {"bagel"} + + def is_mllm_model(model_or_path: Union[str, torch.nn.Module], platform: str = None): from auto_round.utils.common import MM_KEYS model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path + + # Check model_type exclusion: some models have multimodal components + # but should be quantized as LLM (e.g., BAGEL MoT). + _model_type = None + if isinstance(model_or_path, torch.nn.Module) and hasattr(model_or_path, "config"): + _model_type = getattr(model_or_path.config, "model_type", None) + elif isinstance(model_path, str) and os.path.isdir(model_path): + _cfg_path = os.path.join(model_path, "config.json") + if os.path.exists(_cfg_path): + with open(_cfg_path) as _f: + _model_type = json.load(_f).get("model_type") + if _model_type in _LLM_ONLY_MODEL_TYPES: + return False # For dummy model, model_path could be "". if model_path and not os.path.isdir(model_path): model_path = download_or_get_path(model_path, platform=platform) @@ -1658,6 +1703,9 @@ def _get_reference_amax_from_experts(moe_module: torch.nn.Module, attr_name: str # the quantized output directory so that from_pretrained() works out of the box. _EXTRA_MODEL_FILES = { "spk_dict.pt", # Qwen2.5-Omni speaker dictionary for audio output + "llm_config.json", # BAGEL sub-model config + "vit_config.json", # BAGEL vision transformer config + "preprocessor_config.json", # BAGEL image preprocessor config } From 8c23b975cf6b24f0fd0ebbea065b6962f9ae7f89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 12:45:03 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/special_model_handler.py | 4 ++-- auto_round/utils/bagel_loader.py | 27 ++++++++++++++------------- auto_round/utils/model.py | 12 +++++++----- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index 4fa9bd54c..38ee761e6 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -200,10 +200,10 @@ def _get_glm_image_multimodal_block(model, quant_vision=False): def _get_bagel_multimodal_block(model, quant_vision=False): - """Get block names for BAGEL MoT (Mixture of Transformers) model. + """Get block names for BAGEL not (Mixture of Transformers) model. BAGEL model structure: - - language_model.model.layers: Qwen2-based LLM with MoT dual paths + - language_model.model.layers: Qwen2-based LLM with not dual paths - vit_model: SigLIP vision encoder (not quantized by default) - connector: Vision-language MLP connector - encoder/decoder: VAE autoencoder diff --git a/auto_round/utils/bagel_loader.py b/auto_round/utils/bagel_loader.py index 506a10263..53e080fcf 100644 --- a/auto_round/utils/bagel_loader.py +++ b/auto_round/utils/bagel_loader.py @@ -14,11 +14,11 @@ """Loader for BAGEL-7B-MoT (ByteDance-Seed/BAGEL-7B-MoT) model. -BAGEL uses a Qwen2-based LLM with MoT (Mixture of Transformers) extensions. +BAGEL uses a Qwen2-based LLM with not (Mixture of Transformers) extensions. Since transformers doesn't natively support the 'bagel' model_type, we construct the model manually by: 1. Building a standard Qwen2ForCausalLM from the llm_config - 2. Adding MoT generation-path modules (mlp_moe_gen, *_moe_gen projections) + 2. Adding not generation-path modules (mlp_moe_gen, *_moe_gen projections) 3. Loading all weights from safetensors 4. Wrapping in BagelForQuantization for auto_round compatibility """ @@ -45,7 +45,7 @@ def __init__(self, **kwargs): def _add_mot_extensions(language_model, llm_config): - """Add MoT (Mixture of Transformers) generation-path modules to a Qwen2 model. + """Add not (Mixture of Transformers) generation-path modules to a Qwen2 model. Each transformer layer gets additional modules for the generation path: - Attention: q_proj_moe_gen, k_proj_moe_gen, v_proj_moe_gen, o_proj_moe_gen @@ -68,13 +68,13 @@ def _add_mot_extensions(language_model, llm_config): attn.q_norm = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) attn.k_norm = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) - # MoT attention projections for generation path + # not attention projections for generation path attn.q_proj_moe_gen = nn.Linear(hidden_size, num_heads * head_dim, bias=True) attn.k_proj_moe_gen = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=True) attn.v_proj_moe_gen = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=True) attn.o_proj_moe_gen = nn.Linear(num_heads * head_dim, hidden_size, bias=False) - # MoT QK norms for generation path + # not QK norms for generation path if use_qk_norm: attn.q_norm_moe_gen = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) attn.k_norm_moe_gen = Qwen2RMSNorm(head_dim, eps=rms_norm_eps) @@ -82,10 +82,10 @@ def _add_mot_extensions(language_model, llm_config): attn.q_norm_moe_gen = nn.Identity() attn.k_norm_moe_gen = nn.Identity() - # MoT MLP for generation path (duplicate of understanding MLP) + # not MLP for generation path (duplicate of understanding MLP) layer.mlp_moe_gen = Qwen2MLP(llm_config) - # MoT LayerNorms for generation path + # not LayerNorms for generation path layer.input_layernorm_moe_gen = Qwen2RMSNorm(hidden_size, eps=rms_norm_eps) layer.post_attention_layernorm_moe_gen = Qwen2RMSNorm(hidden_size, eps=rms_norm_eps) @@ -175,7 +175,7 @@ def _load_safetensors_weights(model_path): class BagelForQuantization(nn.Module): """Wrapper for BAGEL model that's compatible with auto_round quantization. - Contains the language_model (Qwen2+MoT) as the primary quantization target, + Contains the language_model (Qwen2+not) as the primary quantization target, plus non-text modules (connector, vit, etc.) stored as generic parameter holders. The forward() delegates to language_model for text-only calibration. @@ -290,9 +290,9 @@ def load_bagel_model(model_path, torch_dtype="auto", device_map=None): else: resolved_dtype = torch_dtype - logger.info("Building Qwen2ForCausalLM with MoT extensions for BAGEL...") + logger.info("Building Qwen2ForCausalLM with not extensions for BAGEL...") - # Create the language model (Qwen2 + MoT extensions) + # Create the language model (Qwen2 + not extensions) language_model = Qwen2ForCausalLM(llm_config) _add_mot_extensions(language_model, llm_config) @@ -305,7 +305,7 @@ def load_bagel_model(model_path, torch_dtype="auto", device_map=None): other_weights = {} for name, tensor in all_weights.items(): if name.startswith("language_model."): - lm_name = name[len("language_model."):] + lm_name = name[len("language_model.") :] lm_weights[lm_name] = tensor else: other_weights[name] = tensor @@ -322,8 +322,9 @@ def load_bagel_model(model_path, torch_dtype="auto", device_map=None): logger.warning(f" Unexpected: {k}") # Build the BAGEL config - bagel_config = BagelConfig(**{k: v for k, v in bagel_config_dict.items() - if k not in ("llm_config", "architectures")}) + bagel_config = BagelConfig( + **{k: v for k, v in bagel_config_dict.items() if k not in ("llm_config", "architectures")} + ) bagel_config.llm_config = llm_config.to_dict() bagel_config.architectures = ["BagelForConditionalGeneration"] diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index fc84d973a..0b081b2a1 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -331,10 +331,12 @@ def llm_load_model( load_kwargs["quantization_config"] = Mxfp4Config(dequantized=True) logger.info("Detected MXFP4 quantized model, using Mxfp4Config(dequantized=True) for loading.") - # BAGEL requires a custom loader (Qwen2 + MoT extensions, not in transformers) - _config_path = os.path.join(pretrained_model_name_or_path, "config.json") if os.path.isdir( - pretrained_model_name_or_path - ) else None + # BAGEL requires a custom loader (Qwen2 + not extensions, not in transformers) + _config_path = ( + os.path.join(pretrained_model_name_or_path, "config.json") + if os.path.isdir(pretrained_model_name_or_path) + else None + ) if _config_path and os.path.exists(_config_path): with open(_config_path) as _f: _mt = json.load(_f).get("model_type") @@ -788,7 +790,7 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module], platform: str = No model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path # Check model_type exclusion: some models have multimodal components - # but should be quantized as LLM (e.g., BAGEL MoT). + # but should be quantized as LLM (e.g., BAGEL not). _model_type = None if isinstance(model_or_path, torch.nn.Module) and hasattr(model_or_path, "config"): _model_type = getattr(model_or_path.config, "model_type", None) From c1a2b94679b35904751ace1d9ee299e689660153 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Fri, 27 Mar 2026 21:05:05 +0800 Subject: [PATCH 3/6] update code Signed-off-by: lvliang-intel --- auto_round/special_model_handler.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index 4fa9bd54c..c17dca5e3 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -605,12 +605,8 @@ def get_bagel_ignore_layers(model) -> list[str]: BAGEL uses `*_moe_gen` modules for the image-generation path. Quantizing them causes quality to collapse during the iterative denoising loop. - The shared attention projections are also highly sensitive, and preserving - the top 4 transformer blocks in FP16 gave acceptable image quality in - validation runs. + The shared attention projections are also highly sensitive. """ - top_fp16_layers = 0 - ignore_layers = [ "moe_gen", "self_attn.q_proj", @@ -619,14 +615,6 @@ def get_bagel_ignore_layers(model) -> list[str]: "self_attn.o_proj", ] - num_layers = 0 - if hasattr(model, "language_model") and hasattr(model.language_model, "model"): - num_layers = len(getattr(model.language_model.model, "layers", [])) - - if num_layers > 0: - for layer_idx in range(max(0, num_layers - top_fp16_layers), num_layers): - ignore_layers.append(f"language_model.model.layers.{layer_idx}") - return ignore_layers From ad34ebb4cae8c6b5c789ffa710f5401903f1f98a Mon Sep 17 00:00:00 2001 From: Liang Lv Date: Fri, 27 Mar 2026 21:10:23 +0800 Subject: [PATCH 4/6] Update auto_round/utils/bagel_loader.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/utils/bagel_loader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/utils/bagel_loader.py b/auto_round/utils/bagel_loader.py index 53e080fcf..cbb017b46 100644 --- a/auto_round/utils/bagel_loader.py +++ b/auto_round/utils/bagel_loader.py @@ -248,13 +248,12 @@ def save_pretrained(self, output_dir, **kwargs): save_file(state_dict, os.path.join(output_dir, "model.safetensors")) -def load_bagel_model(model_path, torch_dtype="auto", device_map=None): +def load_bagel_model(model_path, torch_dtype="auto"): """Load a BAGEL model for quantization. Args: model_path: Path to the BAGEL model directory. torch_dtype: Data type for model weights. - device_map: Device map for model placement. Returns: Tuple of (model, tokenizer). From 06ca2d30698bceef5d0a0cdc3f5795856db3615e Mon Sep 17 00:00:00 2001 From: Liang Lv Date: Fri, 27 Mar 2026 21:10:41 +0800 Subject: [PATCH 5/6] Update auto_round/utils/model.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/utils/model.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 0b081b2a1..91f4b3eca 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -560,8 +560,20 @@ def mllm_load_model( elif "bagel" == model_type: from auto_round.utils.bagel_loader import load_bagel_model + resolved_model_path = pretrained_model_name_or_path + # If a Hugging Face repo ID is provided instead of a local directory, + # download a local snapshot so that load_bagel_model can find config.json. + if not os.path.isdir(resolved_model_path): + try: + from huggingface_hub import snapshot_download # type: ignore[import] + + resolved_model_path = snapshot_download(pretrained_model_name_or_path) + except Exception: # pylint: disable=broad-except + # Fall back to the original value; load_bagel_model may still handle it + resolved_model_path = pretrained_model_name_or_path + model, tokenizer = load_bagel_model( - pretrained_model_name_or_path, + resolved_model_path, torch_dtype=torch_dtype, ) processor = None From 39f8d2cc9fe6b44b9304c5d90c73be03ce99b694 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Mar 2026 13:14:37 +0000 Subject: [PATCH 6/6] Fix save_pretrained to use state_dict() instead of named_parameters() Agent-Logs-Url: https://github.com/intel/auto-round/sessions/57e2e340-88e0-42e8-9528-a24ad1bc7d61 Co-authored-by: lvliang-intel <104267837+lvliang-intel@users.noreply.github.com> --- auto_round/utils/bagel_loader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/auto_round/utils/bagel_loader.py b/auto_round/utils/bagel_loader.py index cbb017b46..43b4cb12f 100644 --- a/auto_round/utils/bagel_loader.py +++ b/auto_round/utils/bagel_loader.py @@ -234,10 +234,10 @@ def save_pretrained(self, output_dir, **kwargs): with open(config_path, "w", encoding="utf-8") as f: json.dump(config_dict, f, indent=2, ensure_ascii=False) - # Save all model parameters as safetensors - state_dict = {} - for name, param in self.named_parameters(): - state_dict[name] = param.data.contiguous() + # Save all model weights (parameters + registered buffers) as safetensors. + # Using state_dict() instead of named_parameters() ensures buffers such as + # rotary-embedding caches are included, which are required for correct reload. + tensors = {name: tensor.contiguous() for name, tensor in self.state_dict().items()} # Remap weight names to match original BAGEL checkpoint format # The BagelPipeline expects top-level names like: @@ -245,7 +245,7 @@ def save_pretrained(self, output_dir, **kwargs): # connector.fc1.weight # vit_model.vision_model.embeddings... # encoder.*, decoder.* (VAE, but those are in ae.safetensors) - save_file(state_dict, os.path.join(output_dir, "model.safetensors")) + save_file(tensors, os.path.join(output_dir, "model.safetensors")) def load_bagel_model(model_path, torch_dtype="auto"):