19 changes: 18 additions & 1 deletion vllm/model_executor/layers/quantization/modelopt.py
@@ -187,7 +187,24 @@ def get_quant_method(

     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         if len(self.exclude_modules) > 0:
-            self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules)
+            # This is a workaround for the weights remapping issue:
+            # https://github.com/vllm-project/vllm/issues/28072
+            # Right now, the Nvidia ModelOpt library uses just one wildcard pattern:
+            #     module_path*
+            # It is applied if the whole tree of modules rooted at module_path
+            # is not quantized. Here we replace such a pattern with 2 patterns that
+            # are collectively equivalent to the original pattern:
+            #     module_path
+            #     module_path.*
+            new_exclude_modules = []
+            for exclude in self.exclude_modules:
+                if len(exclude) >= 2 and exclude[-1] == "*" and exclude[-2] != ".":
+                    new_exclude_modules.append(exclude[:-1])
+                    new_exclude_modules.append(exclude[:-1] + ".*")
+                else:
+                    new_exclude_modules.append(exclude)
+
+            self.exclude_modules = hf_to_vllm_mapper.apply_list(new_exclude_modules)

     @staticmethod
     def get_config_filenames() -> list[str]:
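
For reference, below is a minimal standalone sketch of the pattern-splitting logic introduced above, runnable outside vLLM. The helper name `split_wildcard_excludes` and the example module names (`lm_head`, `model.layers.0.mlp`) are illustrative only; in the actual change the loop runs inline inside `apply_vllm_mapper` and the result is then passed through `hf_to_vllm_mapper.apply_list`.

# Standalone sketch of the exclude-pattern rewrite shown in the diff above.
# The helper name and module names are illustrative, not part of vLLM's API.
import fnmatch


def split_wildcard_excludes(exclude_modules: list[str]) -> list[str]:
    """Replace each 'module_path*' pattern with 'module_path' and 'module_path.*'."""
    new_exclude_modules: list[str] = []
    for exclude in exclude_modules:
        if len(exclude) >= 2 and exclude[-1] == "*" and exclude[-2] != ".":
            # 'lm_head*' -> 'lm_head' (the module itself) and 'lm_head.*' (its subtree)
            new_exclude_modules.append(exclude[:-1])
            new_exclude_modules.append(exclude[:-1] + ".*")
        else:
            # Patterns already ending in '.*' (or without a trailing '*') are kept as-is.
            new_exclude_modules.append(exclude)
    return new_exclude_modules


if __name__ == "__main__":
    patterns = split_wildcard_excludes(["lm_head*", "model.layers.0.mlp.*"])
    print(patterns)  # ['lm_head', 'lm_head.*', 'model.layers.0.mlp.*']
    # The module itself and its children are now covered by explicit patterns,
    # which together remain equivalent to the original 'lm_head*' pattern.
    print(fnmatch.fnmatch("lm_head", patterns[0]))         # True
    print(fnmatch.fnmatch("lm_head.weight", patterns[1]))  # True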