From a9257173996ae38d337e9778c1d6b88bb5319910 Mon Sep 17 00:00:00 2001
From: luozixin2
Date: Sun, 18 Jan 2026 05:36:38 +0000
Subject: [PATCH 01/10] feat: support GPTQ Marlin and AWQ Marlin quantization formats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Main changes:
- Add GPTQ Marlin (W4A16) and AWQ Marlin (W4A16) quantization strategies
- Fix loader.py to correctly load gptq_marlin-format weights (supports the
  Marlin-specific repacked qweight and permuted scales)
- Update quantize_model.py to export the gptq_marlin format (symmetric
  quantization + Marlin repack/permute)
- Update linear.py:
  - Add an _offline_quant_bits buffer that records the quantization bit width
  - Add GPTQ runtime shuffle support (gptq_shuffle)
  - Add lazy repack support for GPTQ/AWQ Marlin
    (_maybe_prepare_offline_gptq_marlin/_awq_marlin)
  - Standardize on the vLLM format (int32 packed, fp16 scales)
- Simplify the per-strategy files and remove duplicated code
- Remove the old AllSpark Marlin implementation files
- Add benchmark configs for the GPTQ/AWQ Marlin variants (per-bit versions)
---
 diffulex/engine/model_runner.py               |   8 +-
 diffulex/layer/linear.py                      | 694 ++++++++++++++++--
 diffulex/utils/loader.py                      | 459 +++++++++++-
 diffulex/utils/quantization/quantize_model.py | 403 +++++-----
 diffulex/utils/quantization/registry.py       |  23 +-
 .../utils/quantization/strategies/__init__.py |   6 +-
 .../strategies/linear_awq_marlin_w4a16.py     | 123 ++++
 .../strategies/linear_awq_w4a16.py            | 517 ++-----------
 .../strategies/linear_fp8_w8a16.py            | 433 ++---------
 .../strategies/linear_fp8_w8a8.py             | 506 ++---------
 .../strategies/linear_gptq_marlin_w4a16.py    | 156 ++++
 .../strategies/linear_gptq_w4a16.py           | 571 +++----------
 .../strategies/linear_int4_w4a16.py           | 537 ++----------
 .../strategies/linear_int4_w4a8.py            | 478 +-----------
 .../strategies/linear_int8_w8a16.py           | 539 +-------------
 .../strategies/linear_int8_w8a8.py            | 493 +++----------
 .../strategies/linear_marlin_int8_w8a16.py    | 209 +++---
 diffulex_bench/configs/awq_bf16kv_varlen.yml  |  47 ++
 .../configs/awq_marlin_bf16kv_varlen.yml      |  48 ++
 diffulex_bench/configs/fp8_bf16kv_varlen.yml  |  48 ++
 diffulex_bench/configs/gptq_bf16kv_varlen.yml |  47 ++
 .../configs/gptq_bf16kv_varlen_tp2.yml        |  47 ++
 .../configs/gptq_marlin_bf16kv_varlen.yml     |  48 ++
 .../configs/gptq_marlin_w2_bf16kv_varlen.yml  |  47 ++
 .../configs/gptq_marlin_w4_bf16kv_varlen.yml  |  47 ++
 .../configs/gptq_marlin_w8_bf16kv_varlen.yml  |  47 ++
 .../configs/gptq_w2_bf16kv_varlen.yml         |  47 ++
 .../configs/gptq_w8_bf16kv_varlen.yml         |  47 ++
 diffulex_kernel/__init__.py                   |  60 +-
 .../csrc/marlin/allspark_qgemm_w8a16.cu       | 542 --------------
 .../csrc/marlin/allspark_repack.cu            | 163 ----
 .../csrc/marlin/allspark_utils.cuh            | 247 -------
 .../csrc/marlin/torch_bindings_marlin.cpp     |  25 -
 diffulex_kernel/python/marlin_ops.py          | 128 ----
 docs/GPTQ_AWQ_SUPPORT.md                      | 233 ------
 35 files changed, 2720 insertions(+), 5353 deletions(-)
 create mode 100644 diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py
 create mode 100644 diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py
 create mode 100644 diffulex_bench/configs/awq_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/fp8_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml
 create mode 100644 diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml
 create mode 100644
diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml create mode 100644 diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml create mode 100644 diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml delete mode 100644 diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu delete mode 100644 diffulex_kernel/csrc/marlin/allspark_repack.cu delete mode 100644 diffulex_kernel/csrc/marlin/allspark_utils.cuh delete mode 100644 diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp delete mode 100644 diffulex_kernel/python/marlin_ops.py delete mode 100644 docs/GPTQ_AWQ_SUPPORT.md diff --git a/diffulex/engine/model_runner.py b/diffulex/engine/model_runner.py index aeeb442..c347fb3 100755 --- a/diffulex/engine/model_runner.py +++ b/diffulex/engine/model_runner.py @@ -36,7 +36,13 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event]): # Initialize model, sampler, and kv cache init_method = f"tcp://{config.master_addr}:{config.master_port}" dist.init_process_group("nccl", init_method, world_size=self.world_size, rank=rank, device_id=config.device_ids[rank]) - device_id = (getattr(config, "device_start", 0) or 0) + rank + config.device_ids[rank] + # Choose CUDA device for this TP rank. + # config.device_ids is already a list of logical CUDA device indices (respecting CUDA_VISIBLE_DEVICES). + # Do NOT add rank again, otherwise rank 1 with device_ids=[0,1] becomes device 2. + if getattr(config, "device_ids", None): + device_id = config.device_ids[rank] + else: + device_id = (getattr(config, "device_start", 0) or 0) + rank assert 0 <= device_id < torch.cuda.device_count(), f"Invalid device_id {device_id}." torch.cuda.set_device(device_id) default_dtype = torch.get_default_dtype() diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index b34f017..0ba2ceb 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -89,20 +89,45 @@ def __init__( self.register_buffer("_weight_is_quantized", torch.tensor(False, dtype=torch.bool), persistent=False) # GPTQ/AWQ offline quantized weight storage (W4A16). - # GPTQ: qweight (packed int4), qzeros (packed int4), scales (per-group), g_idx (optional) - # AWQ: qweight (packed int4), qzeros (packed int4), scales (per-group) - self.register_buffer("gptq_qweight", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("gptq_qzeros", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("gptq_scales", torch.empty(0, dtype=torch.float32), persistent=False) + # NOTE(vLLM-format): + # - GPTQ: qweight int32 [K/pack, N], qzeros int32 [K/group, N/pack], + # scales fp16 [K/group, N], g_idx optional (usually empty when desc_act=False) + # - AWQ : qweight int32 [K, N/pack], qzeros int32 [K/group, N/pack], + # scales fp16 [K/group, N] + # + # Where pack = 32 / bits (bits=4 => pack=8), K=in_features, N=out_features. 
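+        # Illustrative shapes (assumed example, not taken from any particular checkpoint):
+        # with K=4096, N=11008, bits=4 (pack=8), group_size=128 (so num_groups=32):
+        #   GPTQ: qweight [512, 11008], qzeros [32, 1376], scales [32, 11008]
+        #   AWQ : qweight [4096, 1376], qzeros [32, 1376], scales [32, 11008]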
+ self.register_buffer("gptq_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_qzeros", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_scales", torch.empty(0, dtype=torch.float16), persistent=False) self.register_buffer("gptq_g_idx", torch.empty(0, dtype=torch.int32), persistent=False) - self.register_buffer("awq_qweight", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("awq_qzeros", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("awq_scales", torch.empty(0, dtype=torch.float32), persistent=False) + self.register_buffer("awq_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_qzeros", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_scales", torch.empty(0, dtype=torch.float16), persistent=False) # Metadata for offline quantized weights self.register_buffer("_offline_quant_format", torch.empty(0, dtype=torch.int8), persistent=False) # 0=none, 1=gptq, 2=awq + # Bits for offline GPTQ/AWQ weights (needed for marlin-exported layouts where + # we cannot infer bits from packed tensor shapes). + self.register_buffer("_offline_quant_bits", torch.tensor(0, dtype=torch.int32), persistent=False) self.register_buffer("_offline_quant_group_size", torch.tensor(128, dtype=torch.int32), persistent=False) self.register_buffer("_offline_quant_out_features", torch.tensor(0, dtype=torch.int32), persistent=False) self.register_buffer("_offline_quant_in_features", torch.tensor(0, dtype=torch.int32), persistent=False) + # GPTQ runtime prep state (vLLM requires gptq_shuffle before first gemm). + self.register_buffer("_gptq_is_shuffled", torch.tensor(False, dtype=torch.bool), persistent=False) + + # ---- vLLM Marlin variants (GPTQ/AWQ) one-time repack cache ---- + # These buffers are populated lazily when a *_marlin strategy is selected. 
+ self.register_buffer("_gptq_marlin_is_prepared", torch.tensor(False, dtype=torch.bool), persistent=False) + self.register_buffer("gptq_marlin_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_scales", torch.empty(0, dtype=torch.float16), persistent=False) + self.register_buffer("gptq_marlin_zp", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_g_idx", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_g_idx_sort_indices", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_workspace", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("_awq_marlin_is_prepared", torch.tensor(False, dtype=torch.bool), persistent=False) + self.register_buffer("awq_marlin_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_marlin_scales", torch.empty(0, dtype=torch.float16), persistent=False) + self.register_buffer("awq_marlin_zp", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_marlin_workspace", torch.empty(0, dtype=torch.int32), persistent=False) def has_quantized_weight(self) -> bool: return bool(self._weight_is_quantized.item()) and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 @@ -140,78 +165,434 @@ def set_offline_quantized_weight( Args: format: "gptq" or "awq" - qweight: int8 packed int4 weights [out_features, (in_features + 1) // 2] - qzeros: int8 packed int4 zeros [num_groups, (in_features + 1) // 2] - scales: float32 per-group scales [num_groups, in_features] or [num_groups] + qweight/qzeros/scales: vLLM standard tensors (see notes above). out_features: Output features (N) in_features: Input features (K) group_size: Group size for quantization (default: 128) - g_idx: Optional int32 tensor [out_features] for GPTQ group indices (GPTQ only) + g_idx: Optional int32 tensor [in_features] for act-order (GPTQ only; usually empty) """ + # NOTE: Offline quantized weights are typically loaded from safetensors on CPU. + # In Diffulex, the engine may move modules to CUDA before calling this method, + # so we must ensure tensors are moved to the module device here. + def _infer_module_device() -> torch.device: + w = getattr(self, "weight", None) + if isinstance(w, torch.Tensor): + return w.device + for p in self.parameters(recurse=False): + return p.device + for b in self.buffers(recurse=False): + return b.device + return torch.device("cpu") + + module_device = _infer_module_device() + format = format.strip().lower() if format not in ("gptq", "awq"): raise ValueError(f"Unsupported offline quant format: {format}. Supported: 'gptq', 'awq'") - if qweight.dtype != torch.int8: - raise TypeError(f"qweight must be int8, got {qweight.dtype}") - if qzeros.dtype != torch.int8: - raise TypeError(f"qzeros must be int8, got {qzeros.dtype}") - if scales.dtype != torch.float32: - scales = scales.to(dtype=torch.float32) + # Infer bits/pack_factor from packed tensor shapes to support GPTQ W2/W4/W8. + # vLLM packing convention: + # - GPTQ: qweight [K/pack, N], qzeros [K/group, N/pack] + # - AWQ: qweight [K, N/pack], qzeros [K/group, N/pack] + # where pack = 32 / bits and bits must divide 32. 
+ if format == "gptq": + if int(qweight.shape[0]) <= 0 or in_features % int(qweight.shape[0]) != 0: + raise ValueError( + "Cannot infer GPTQ pack_factor from qweight shape: " + f"in_features={in_features}, qweight.shape={tuple(qweight.shape)}" + ) + pack_factor = in_features // int(qweight.shape[0]) + else: # awq + if int(qweight.shape[1]) <= 0 or out_features % int(qweight.shape[1]) != 0: + raise ValueError( + "Cannot infer AWQ pack_factor from qweight shape: " + f"out_features={out_features}, qweight.shape={tuple(qweight.shape)}" + ) + pack_factor = out_features // int(qweight.shape[1]) + if 32 % pack_factor != 0: + raise ValueError( + f"Unsupported pack_factor={pack_factor} (requires 32%pack_factor==0) " + f"for offline format={format}. " + f"in_features={in_features}, out_features={out_features}, " + f"qweight.shape={tuple(qweight.shape)}, qzeros.shape={tuple(qzeros.shape)}, scales.shape={tuple(scales.shape)}" + ) + bits = 32 // pack_factor + if format == "awq" and bits != 4: + raise ValueError(f"AWQ 目前仅支持 4-bit(pack_factor=8),当前推断 bits={bits} (pack_factor={pack_factor})") + # Record bits for downstream kernels (esp. marlin path). + self._offline_quant_bits = torch.tensor(bits, dtype=torch.int32, device=module_device) + + if qweight.dtype != torch.int32: + raise TypeError(f"qweight must be int32 (vLLM format), got {qweight.dtype}") + if qzeros.dtype != torch.int32: + raise TypeError(f"qzeros must be int32 (vLLM format), got {qzeros.dtype}") + if scales.dtype not in (torch.float16, torch.bfloat16, torch.float32): + raise TypeError( + f"scales must be float16/bfloat16/float32 (vLLM format), got {scales.dtype}" + ) + if scales.dtype != torch.float16: + scales = scales.to(dtype=torch.float16) + + # Move to module device before validation/assignment. + if qweight.device != module_device: + qweight = qweight.to(device=module_device) + if qzeros.device != module_device: + qzeros = qzeros.to(device=module_device) + if scales.device != module_device: + scales = scales.to(device=module_device) + if g_idx is not None and g_idx.device != module_device: + g_idx = g_idx.to(device=module_device) - num_groups = (out_features + group_size - 1) // group_size - expected_qweight_shape = (out_features, (in_features + 1) // 2) - expected_qzeros_shape = (num_groups, (in_features + 1) // 2) + # group_size == -1 means channelwise in some ecosystems; vLLM normalizes -1 to K. + group_size_norm = in_features if group_size == -1 else group_size + if group_size_norm <= 0 or (in_features % group_size_norm != 0): + raise ValueError( + f"Invalid group_size={group_size} for in_features={in_features}. " + "Expected group_size == -1 or a positive divisor of in_features." 
+ ) + num_groups = in_features // group_size_norm + + if format == "gptq": + expected_qweight_shape = (in_features // pack_factor, out_features) + expected_qzeros_shape = (num_groups, out_features // pack_factor) + expected_scales_shape = (num_groups, out_features) + else: # awq + expected_qweight_shape = (in_features, out_features // pack_factor) + expected_qzeros_shape = (num_groups, out_features // pack_factor) + expected_scales_shape = (num_groups, out_features) if qweight.shape != expected_qweight_shape: raise ValueError( - f"qweight shape mismatch: got {qweight.shape}, expected {expected_qweight_shape}" + f"qweight shape mismatch: got {tuple(qweight.shape)}, expected {expected_qweight_shape}" ) if qzeros.shape != expected_qzeros_shape: raise ValueError( - f"qzeros shape mismatch: got {qzeros.shape}, expected {expected_qzeros_shape}" + f"qzeros shape mismatch: got {tuple(qzeros.shape)}, expected {expected_qzeros_shape}" + ) + if scales.shape != expected_scales_shape: + raise ValueError( + f"scales shape mismatch: got {tuple(scales.shape)}, expected {expected_scales_shape}" ) if format == "gptq": self.gptq_qweight = qweight self.gptq_qzeros = qzeros self.gptq_scales = scales + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() == 0: + g_idx = None if g_idx is not None: - if g_idx.shape != (out_features,): + if g_idx.shape != (in_features,): raise ValueError( - f"g_idx shape mismatch: got {g_idx.shape}, expected ({out_features},)" + f"g_idx shape mismatch: got {g_idx.shape}, expected ({in_features},)" ) if g_idx.dtype != torch.int32: g_idx = g_idx.to(dtype=torch.int32) self.gptq_g_idx = g_idx else: # Clear g_idx if not provided - self.gptq_g_idx = torch.empty(0, dtype=torch.int32) - self._offline_quant_format = torch.tensor(1, dtype=torch.int8) + self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + self._offline_quant_format = torch.tensor(1, dtype=torch.int8, device=module_device) + self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) else: # AWQ self.awq_qweight = qweight self.awq_qzeros = qzeros self.awq_scales = scales # AWQ doesn't use g_idx, clear it - self.gptq_qweight = torch.empty(0, dtype=torch.int8) - self.gptq_qzeros = torch.empty(0, dtype=torch.int8) - self.gptq_scales = torch.empty(0, dtype=torch.float32) - self.gptq_g_idx = torch.empty(0, dtype=torch.int32) - self._offline_quant_format = torch.tensor(2, dtype=torch.int8) + self.gptq_qweight = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_qzeros = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_scales = torch.empty(0, dtype=torch.float16, device=module_device) + self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + self._offline_quant_format = torch.tensor(2, dtype=torch.int8, device=module_device) + self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + + # Reset marlin-prep caches (weights may have changed / moved). 
+ self._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) + self.gptq_marlin_qweight = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_scales = torch.empty(0, dtype=torch.float16, device=module_device) + self.gptq_marlin_zp = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_g_idx_sort_indices = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_workspace = torch.empty(0, dtype=torch.int32, device=module_device) + self._awq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) + self.awq_marlin_qweight = torch.empty(0, dtype=torch.int32, device=module_device) + self.awq_marlin_scales = torch.empty(0, dtype=torch.float16, device=module_device) + self.awq_marlin_zp = torch.empty(0, dtype=torch.int32, device=module_device) + self.awq_marlin_workspace = torch.empty(0, dtype=torch.int32, device=module_device) - self._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32) - self._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32) - self._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32) + self._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32, device=module_device) + self._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) + self._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) # Drop bf16 weight Parameter if present (to free memory) if "weight" in self._parameters: self._parameters.pop("weight", None) setattr(self, "weight", None) + def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: + """Prepare vLLM GPTQ weights on first use (required gptq_shuffle).""" + if self._offline_quant_format.numel() == 0: + return + if int(self._offline_quant_format.item()) != 1: + return + if self.gptq_qweight.numel() == 0: + return + if self._gptq_is_shuffled.numel() > 0 and bool(self._gptq_is_shuffled.item()): + return + + # Lazy import to avoid pulling vLLM unless GPTQ offline weights are used. + try: + from vllm import _custom_ops as ops # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "GPTQ offline 权重已加载,但无法导入 vLLM CUDA custom ops(vllm._custom_ops)。" + ) from e + + # vLLM uses torch.int for g_idx (can be empty when desc_act=False). + if self.gptq_g_idx.numel() == 0: + g_idx = torch.empty((0,), device=x.device, dtype=torch.int) + else: + g_idx = self.gptq_g_idx.to(device=x.device, dtype=torch.int) + + if self.gptq_qweight.device != x.device: + raise RuntimeError( + f"GPTQ qweight device mismatch: qweight on {self.gptq_qweight.device}, x on {x.device}. " + "请确保模型与输入在同一设备。" + ) + + # Infer weight_bits from packed qweight shape to support GPTQ W2/W4/W8. + # qweight: [K/pack_factor, N], where pack_factor = 32 / weight_bits. 
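+        # Accepted mappings (32 % pack_factor == 0): pack_factor 16 -> W2, 8 -> W4, 4 -> W8.
+        # E.g. (hypothetical) in_features=4096 with qweight.shape[0] == 1024 gives
+        # pack_factor = 4 and therefore weight_bits = 8.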
+ in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else None + if in_features is None or in_features <= 0: + raise RuntimeError("GPTQ offline 权重已加载,但无法推断 in_features 以计算 weight_bits。") + if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: + raise RuntimeError( + f"GPTQ qweight shape 不合法,无法推断 weight_bits: " + f"in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + if 32 % pack_factor != 0: + raise RuntimeError( + f"GPTQ pack_factor={pack_factor} 不支持(需要 32 % pack_factor == 0)," + f"in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + weight_bits = 32 // pack_factor + ops.gptq_shuffle(self.gptq_qweight, g_idx, weight_bits) + self._gptq_is_shuffled = torch.tensor(True, dtype=torch.bool, device=x.device) + + def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: + """Prepare vLLM GPTQ Marlin weights on first use (repack + permute scales/zp). + + IMPORTANT: This path must NOT call `gptq_shuffle` (that is specific to gptq_gemm/exllama). + """ + if self._offline_quant_format.numel() == 0: + return + if int(self._offline_quant_format.item()) != 1: + return + if self.gptq_qweight.numel() == 0: + return + if self._gptq_marlin_is_prepared.numel() > 0 and bool(self._gptq_marlin_is_prepared.item()): + return + + try: + from vllm import _custom_ops as ops # type: ignore + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_make_empty_g_idx, + marlin_make_workspace_new, + marlin_permute_scales, + marlin_sort_g_idx, + marlin_zero_points, + unpack_cols, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "GPTQ Marlin 需要 vLLM CUDA custom ops + marlin_utils,但当前环境不可用。" + ) from e + + device = x.device + if self.gptq_qweight.device != device: + raise RuntimeError( + f"GPTQ qweight device mismatch: qweight on {self.gptq_qweight.device}, x on {device}. " + "请确保模型与输入在同一设备。" + ) + + in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 + out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 + group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + if in_features <= 0 or out_features <= 0: + raise RuntimeError( + f"GPTQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" + ) + + # Determine weight_bits. + # - Standard GPTQ layout: infer from qweight K packing. + # - Marlin-exported layout: bits cannot be inferred from qweight shape; use recorded bits. 
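+        # (Per the gptq_marlin handling in loader.py, a Marlin-repacked qweight is roughly
+        #  [K/16, N*bits/2], so its K dimension no longer encodes pack_factor = 32/bits;
+        #  the bits recorded in _offline_quant_bits at load time are authoritative here.)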
+ weight_bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if weight_bits <= 0: + if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: + raise RuntimeError( + "GPTQ Marlin: cannot infer pack_factor from qweight shape: " + f"in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + if 32 % pack_factor != 0: + raise RuntimeError( + f"GPTQ Marlin: unsupported pack_factor={pack_factor} (requires 32%pack_factor==0)" + ) + weight_bits = 32 // pack_factor + if weight_bits not in (4, 8): + raise RuntimeError( + f"GPTQ Marlin: only 4/8-bit are supported in this integration, got bits={weight_bits}" + ) + + # If loader already provided marlin-ready weights/scales (exported offline), + # skip repack/permute but still create workspace / g_idx metadata. + already_marlin_ready = ( + self.gptq_marlin_qweight.numel() > 0 + and self.gptq_marlin_scales.numel() > 0 + ) + if already_marlin_ready: + if self.gptq_marlin_qweight.device != device or self.gptq_marlin_scales.device != device: + raise RuntimeError( + "GPTQ Marlin: prepacked marlin tensors device mismatch: " + f"qweight on {self.gptq_marlin_qweight.device}, scales on {self.gptq_marlin_scales.device}, x on {device}." + ) + + # g_idx (act-order) handling: marlin expects sorted g_idx + sort indices; otherwise empty. + if self.gptq_g_idx.numel() > 0: + g_idx_sorted, g_idx_sort_indices = marlin_sort_g_idx(self.gptq_g_idx.to(device=device, dtype=torch.int32)) + self.gptq_marlin_g_idx = g_idx_sorted + self.gptq_marlin_g_idx_sort_indices = g_idx_sort_indices + else: + self.gptq_marlin_g_idx = marlin_make_empty_g_idx(device) + self.gptq_marlin_g_idx_sort_indices = marlin_make_empty_g_idx(device) + + # Workspace (internal locking mechanism). + self.gptq_marlin_workspace = marlin_make_workspace_new(device) + + if not already_marlin_ready: + # Repack qweight to marlin format. + self.gptq_marlin_qweight = ops.gptq_marlin_repack( + self.gptq_qweight.contiguous(), + perm=self.gptq_marlin_g_idx_sort_indices, + size_k=in_features, + size_n=out_features, + num_bits=weight_bits, + is_a_8bit=False, + ) + + # Permute scales to marlin format. + self.gptq_marlin_scales = marlin_permute_scales( + self.gptq_scales.contiguous(), + size_k=in_features, + size_n=out_features, + group_size=group_size, + is_a_8bit=False, + ) + + # GPTQ Marlin only supports symmetric weights (no runtime zero-points). + # Use empty zp to keep has_zp=False in the kernel. 
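+        # (Symmetric GPTQ v1 checkpoints store qzeros as the constant bias pattern 2^(bits-1)-1,
+        #  e.g. 0x77777777 when bits=4, so no per-group zero-point is applied at runtime.)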
+ self.gptq_marlin_zp = marlin_make_empty_g_idx(device) + + self._gptq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + + def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: + """Prepare vLLM AWQ Marlin weights on first use (repack + permute scales/zp).""" + if self._offline_quant_format.numel() == 0: + return + if int(self._offline_quant_format.item()) != 2: + return + if self.awq_qweight.numel() == 0: + return + if self._awq_marlin_is_prepared.numel() > 0 and bool(self._awq_marlin_is_prepared.item()): + return + + try: + from vllm import _custom_ops as ops # type: ignore + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + awq_to_marlin_zero_points, + marlin_make_empty_g_idx, + marlin_make_workspace_new, + marlin_permute_scales, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "AWQ Marlin 需要 vLLM CUDA custom ops + marlin_utils,但当前环境不可用。" + ) from e + + device = x.device + if self.awq_qweight.device != device: + raise RuntimeError( + f"AWQ qweight device mismatch: qweight on {self.awq_qweight.device}, x on {device}. " + "请确保模型与输入在同一设备。" + ) + + in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 + out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 + group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + if in_features <= 0 or out_features <= 0: + raise RuntimeError( + f"AWQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" + ) + + # AWQ is 4-bit only. + pack_factor = out_features // int(self.awq_qweight.shape[1]) + if pack_factor != 8: + raise RuntimeError(f"AWQ Marlin: expected pack_factor=8 (W4), got pack_factor={pack_factor}") + weight_bits = 4 + num_groups = (in_features // (in_features if group_size == -1 else group_size)) + + self.awq_marlin_workspace = marlin_make_workspace_new(device) + + # Repack qweight to marlin format. + self.awq_marlin_qweight = ops.awq_marlin_repack( + self.awq_qweight, + size_k=in_features, + size_n=out_features, + num_bits=weight_bits, + is_a_8bit=False, + ) + + # Permute scales to marlin format. + self.awq_marlin_scales = marlin_permute_scales( + self.awq_scales, + size_k=in_features, + size_n=out_features, + group_size=group_size, + is_a_8bit=False, + ) + + # Convert zero-points to marlin format. + self.awq_marlin_zp = awq_to_marlin_zero_points( + self.awq_qzeros, + size_k=num_groups, + size_n=out_features, + num_bits=weight_bits, + is_a_8bit=False, + ) + + # g_idx not used for AWQ marlin (keep empty, strategy will pass empties). + _ = marlin_make_empty_g_idx # keep import referenced for clarity + self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: torch.Tensor) -> None: - # Support both int8 (for int8/int4 quantization) and uint8 (for FP8 quantization) - if quant_weight_int8.dtype not in (torch.int8, torch.uint8): - raise TypeError(f"quant_weight_int8 must be int8 or uint8, got {quant_weight_int8.dtype}") + # Support: + # - int8: int8/int4 weight-only quantization + # - float8: FP8 weight-only quantization (vLLM-aligned) + # - uint8: legacy FP8 storage (kept for backward compatibility) + fp8_dtypes: tuple[torch.dtype, ...] 
= tuple( + d + for d in ( + getattr(torch, "float8_e4m3fn", None), + getattr(torch, "float8_e4m3fnuz", None), + getattr(torch, "float8_e5m2", None), + getattr(torch, "float8_e5m2fnuz", None), + ) + if d is not None + ) + if quant_weight_int8.dtype not in (torch.int8, torch.uint8, *fp8_dtypes): + raise TypeError( + f"quant_weight_int8 must be int8/uint8/float8, got {quant_weight_int8.dtype}" + ) # Store scales dtype depends on strategy: # - W8A16/W4A16 kernels currently take bf16 scales. # - W8A8/W4A8 paths are more sensitive to scale precision; keep scales at fp16. @@ -237,6 +618,43 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to self.quant_scales = quant_scales self._weight_is_quantized.fill_(True) + def _maybe_promote_weight_to_quantized_at_runtime( + self, + x: torch.Tensor, + strategy, + *, + expected_weight_formats: tuple[str, ...] = ("int8", "int4", "fp8_e4m3", "fp8_e5m2"), + ) -> None: + """Runtime safety net: if a Linear is configured for quantization but the bf16/fp16 + weight Parameter was not quantized+removed at load-time (e.g., due to sharded load + ordering), quantize once on first forward and drop the bf16 weight Parameter. + + This avoids keeping both bf16 weights and quantized weights resident on GPU. + """ + if strategy is None: + return + if self.has_offline_quantized_weight() or self.has_quantized_weight(): + return + weight_param = self._parameters.get("weight", None) + if weight_param is None: + return + weight_format = getattr(strategy, "linear_weight_format", None) + if weight_format not in expected_weight_formats: + return + if getattr(strategy, "name", "").startswith("linear_stub"): + return + w = getattr(self, "weight", None) + if w is None or getattr(w, "dtype", None) not in (torch.bfloat16, torch.float16): + return + try: + qweight, scales = strategy.quantize_weight_for_kernel(w.data, device=w.data.device) + except Exception: + return + self.set_quantized_weight(qweight, scales) + # Drop bf16 weight Parameter to free GPU memory. + self._parameters.pop("weight", None) + setattr(self, "weight", None) + def _maybe_quantize_loaded_weight_param( self, param: nn.Parameter, @@ -322,6 +740,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: strategy = get_linear_strategy(self.quant_kind) + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. + self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) # Check for offline quantized weights (GPTQ/AWQ) first if self.has_offline_quantized_weight(): @@ -331,6 +751,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out_features = int(self._offline_quant_out_features.item()) in_features = int(self._offline_quant_in_features.item()) group_size = int(self._offline_quant_group_size.item()) + weight_format = getattr(strategy, "linear_weight_format", None) kwargs = { "out_features": out_features, @@ -339,21 +760,60 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: } if format_val == 1: # GPTQ - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - if self.gptq_g_idx.numel() > 0: + # IMPORTANT: only gptq_gemm needs gptq_shuffle; marlin variants require the original format. 
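+                # Dispatch summary: weight_format == "gptq"        -> gptq_shuffle once, then gptq_gemm;
+                #                   weight_format == "gptq_marlin" -> lazy repack/permute on first use, no shuffle.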
+ if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + kwargs.update({ + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + }) + # Always pass g_idx (can be empty). vLLM expects it for GPTQ kernels. kwargs["gptq_g_idx"] = self.gptq_g_idx + elif weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + # Expose bits (needed to select scalar_types.* in strategy). + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits <= 0: + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + bits = 32 // pack_factor + kwargs["gptq_weight_bits"] = bits + kwargs.update({ + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + }) + else: + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) elif format_val == 2: # AWQ - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) + if weight_format == "awq": + kwargs.update({ + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + }) + elif weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + kwargs.update({ + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + }) + else: + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) base_out = strategy.linear_forward( x, @@ -427,6 +887,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: strategy = get_linear_strategy(self.quant_kind) + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. 
+ self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) # Check for offline quantized weights (GPTQ/AWQ) first if self.has_offline_quantized_weight(): @@ -436,6 +898,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out_features = int(self._offline_quant_out_features.item()) in_features = int(self._offline_quant_in_features.item()) group_size = int(self._offline_quant_group_size.item()) + weight_format = getattr(strategy, "linear_weight_format", None) kwargs = { "out_features": out_features, @@ -444,21 +907,57 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: } if format_val == 1: # GPTQ - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - if self.gptq_g_idx.numel() > 0: + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + kwargs.update({ + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + }) kwargs["gptq_g_idx"] = self.gptq_g_idx + elif weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits <= 0: + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + bits = 32 // pack_factor + kwargs["gptq_weight_bits"] = bits + kwargs.update({ + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + }) + else: + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) elif format_val == 2: # AWQ - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) + if weight_format == "awq": + kwargs.update({ + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + }) + elif weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + kwargs.update({ + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + }) + else: + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) base_out = strategy.linear_forward( x, @@ -609,6 +1108,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if self.tp_rank == 0 else None strategy = get_linear_strategy(self.quant_kind) + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. 
+ self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) # Check for offline quantized weights (GPTQ/AWQ) first if self.has_offline_quantized_weight(): @@ -618,6 +1119,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out_features = int(self._offline_quant_out_features.item()) in_features = int(self._offline_quant_in_features.item()) group_size = int(self._offline_quant_group_size.item()) + weight_format = getattr(strategy, "linear_weight_format", None) kwargs = { "out_features": out_features, @@ -626,21 +1128,59 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: } if format_val == 1: # GPTQ - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - if self.gptq_g_idx.numel() > 0: + if weight_format == "gptq": + # vLLM requires gptq_shuffle before first gptq_gemm. + self._maybe_prepare_offline_gptq(x) + kwargs.update({ + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + }) + # Always pass g_idx (can be empty); strategy will normalize dtype/device. kwargs["gptq_g_idx"] = self.gptq_g_idx + elif weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits <= 0: + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + bits = 32 // pack_factor + kwargs["gptq_weight_bits"] = bits + kwargs.update({ + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + }) + else: + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) elif format_val == 2: # AWQ - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) + if weight_format == "awq": + kwargs.update({ + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + }) + elif weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + kwargs.update({ + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + }) + else: + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) y = strategy.linear_forward( x, diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index 7b2a151..fb608f9 100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -12,6 +12,151 @@ logger = get_logger(__name__) +def _read_quantize_config(model_dir: str) -> dict: + """Read vLLM-style quantization metadata if present. + + We use this to detect checkpoint formats like `gptq_marlin` which reuse the same + tensor keys (qweight/qzeros/scales[/g_idx]) but have different semantics. 
+ """ + cfg_path = os.path.join(model_dir, "quantize_config.json") + if not os.path.exists(cfg_path): + return {} + try: + with open(cfg_path, "r") as f: + data = json.load(f) + return data if isinstance(data, dict) else {} + except Exception: + return {} + + +def _make_packed_qzeros_constant( + *, + num_groups: int, + out_features: int, + bits: int, + device: torch.device | str, +) -> torch.Tensor: + """Create a GPTQ-style packed qzeros tensor filled with a constant. + + For vLLM GPTQ v1 checkpoints, zeros are stored as (zeros - 1) and then bit-packed + along the output dimension (N). For symmetric quantization, zeros is typically + bias=2^(bits-1), thus stored constant becomes (2^(bits-1) - 1). + + This is primarily used as a *shape-compatible dummy* when loading gptq_marlin + checkpoints where runtime zero-points are intentionally unused (qzeros may be empty). + """ + if bits not in (2, 4, 8): + raise ValueError(f"Unsupported bits={bits} for packed qzeros (expected 2/4/8)") + pack_factor = 32 // bits + if out_features % pack_factor != 0: + raise ValueError( + f"out_features={out_features} not divisible by pack_factor={pack_factor} for bits={bits}" + ) + out_packed = out_features // pack_factor + + # Stored constant for GPTQ v1: bias - 1, where bias = 2^(bits-1). + z = (1 << (bits - 1)) - 1 + packed_val = 0 + for i in range(pack_factor): + packed_val |= (z & ((1 << bits) - 1)) << (bits * i) + + return torch.full( + (int(num_groups), int(out_packed)), + int(packed_val), + dtype=torch.int32, + device=device, + ) + + +def _infer_module_device(module: nn.Module) -> torch.device: + w = getattr(module, "weight", None) + if isinstance(w, torch.Tensor): + return w.device + for p in module.parameters(recurse=False): + return p.device + for b in module.buffers(recurse=False): + return b.device + return torch.device("cpu") + + +def _set_offline_gptq_marlin_weight( + module: nn.Module, + *, + qweight: torch.Tensor, + scales: torch.Tensor, + out_features: int, + in_features: int, + group_size: int, + bits: int, + g_idx: torch.Tensor | None, +) -> None: + """Directly set GPTQ-Marlin-ready offline weights into a Diffulex Linear module. + + This bypasses `set_offline_quantized_weight` because marlin-exported `scales` + use a different layout (e.g. (2*num_groups, out_features/2)) and would fail + the standard GPTQ shape validation. + + We still populate minimal GPTQ metadata/buffers so Diffulex forward chooses + the offline path, and then `LinearBase._maybe_prepare_offline_gptq_marlin` + will only allocate workspace / g_idx metadata (and not repack/permute again). + """ + module_device = _infer_module_device(module) + if qweight.device != module_device: + qweight = qweight.to(device=module_device) + if scales.device != module_device: + scales = scales.to(device=module_device) + if g_idx is not None and g_idx.device != module_device: + g_idx = g_idx.to(device=module_device) + + pack_factor = 32 // int(bits) + group_size_norm = in_features if group_size == -1 else group_size + if group_size_norm <= 0 or in_features % group_size_norm != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size_norm + + # Minimal qzeros to satisfy offline presence checks. (Marlin GPTQ symmetric doesn't use runtime zp.) + qzeros = _make_packed_qzeros_constant( + num_groups=num_groups, + out_features=out_features, + bits=int(bits), + device=module_device, + ) + + # Populate GPTQ buffers (note: scales here are marlin layout; gptq kernels should not be used). 
+ module.gptq_qweight = qweight + module.gptq_qzeros = qzeros + module.gptq_scales = scales.to(dtype=torch.float16) + if g_idx is None: + module.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + else: + if getattr(g_idx, "numel", lambda: 1)() == 0: + module.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + else: + module.gptq_g_idx = g_idx.to(dtype=torch.int32) + + # Also mark as marlin-ready so LinearBase won't repack/permute again. + module.gptq_marlin_qweight = qweight + module.gptq_marlin_scales = module.gptq_scales + + module._offline_quant_format = torch.tensor(1, dtype=torch.int8, device=module_device) + module._offline_quant_bits = torch.tensor(int(bits), dtype=torch.int32, device=module_device) + module._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32, device=module_device) + module._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) + module._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) + module._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + + # Reset marlin-prep caches (workspace/zp/g_idx meta will be created on first forward). + module._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) + module.gptq_marlin_zp = torch.empty(0, dtype=torch.int32, device=module_device) + module.gptq_marlin_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + module.gptq_marlin_g_idx_sort_indices = torch.empty(0, dtype=torch.int32, device=module_device) + module.gptq_marlin_workspace = torch.empty(0, dtype=torch.int32, device=module_device) + + # Drop bf16 weight Parameter if present (to free memory and avoid accidental fallback). + if hasattr(module, "_parameters") and "weight" in module._parameters: + module._parameters.pop("weight", None) + setattr(module, "weight", None) + def load_lora_config(lora_path: str) -> dict: """Load LoRA configuration from adapter_config.json.""" @@ -61,9 +206,22 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Check if model is configured for GPTQ or AWQ weight_attn_dtype = getattr(config, "linear_attn_weight_dtype", "bf16") or "bf16" weight_mlp_dtype = getattr(config, "linear_mlp_weight_dtype", "bf16") or "bf16" + quantize_cfg = _read_quantize_config(getattr(config, "model", "")) + checkpoint_format = (quantize_cfg.get("checkpoint_format") or "").strip().lower() + ckpt_bits = int(quantize_cfg.get("bits", 0) or 0) + ckpt_group_size = int(quantize_cfg.get("group_size", 0) or 0) - use_gptq = weight_attn_dtype.lower() == "gptq" or weight_mlp_dtype.lower() == "gptq" - use_awq = weight_attn_dtype.lower() == "awq" or weight_mlp_dtype.lower() == "awq" + # NOTE: marlin variants reuse the same offline GPTQ/AWQ checkpoint keys + # (qweight/qzeros/scales[/g_idx]) and are repacked lazily in `LinearBase` + # on first forward. 
+ gptq_dtypes = {"gptq", "gptq_marlin"} + awq_dtypes = {"awq", "awq_marlin"} + use_gptq = (weight_attn_dtype or "").lower() in gptq_dtypes or (weight_mlp_dtype or "").lower() in gptq_dtypes + use_awq = (weight_attn_dtype or "").lower() in awq_dtypes or (weight_mlp_dtype or "").lower() in awq_dtypes + want_gptq_marlin = (weight_attn_dtype or "").lower() == "gptq_marlin" or (weight_mlp_dtype or "").lower() == "gptq_marlin" + want_awq_marlin = (weight_attn_dtype or "").lower() == "awq_marlin" or (weight_mlp_dtype or "").lower() == "awq_marlin" + is_gptq_marlin_ckpt = checkpoint_format == "gptq_marlin" + is_awq_marlin_ckpt = checkpoint_format == "awq_marlin" if not (use_gptq or use_awq): return loaded_gptq, loaded_awq, skipped @@ -145,13 +303,14 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Determine format: check if g_idx exists (GPTQ) or not (AWQ) has_g_idx = "g_idx" in key_dict - if has_g_idx and use_gptq: + is_gptq_keyset = has_g_idx or is_gptq_marlin_ckpt + if is_gptq_keyset and use_gptq: format = "gptq" - elif not has_g_idx and use_awq: + elif (not is_gptq_keyset) and use_awq: format = "awq" else: # Prefer GPTQ if both are enabled and g_idx exists - format = "gptq" if (use_gptq and has_g_idx) else ("awq" if use_awq else None) + format = "gptq" if (use_gptq and is_gptq_keyset) else ("awq" if use_awq else None) if format is None: skipped += 1 @@ -183,47 +342,267 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): skipped += 1 continue - # Infer dimensions from tensor shapes - out_features, packed_in = qweight.shape - in_features = packed_in * 2 # Packed int4: 2 values per byte (max estimate) - # Refine in_features from scales shape if available - if scales.shape[1:] != (): - # scales is [num_groups, in_features] or [num_groups] - if len(scales.shape) == 2: - in_features = scales.shape[1] - - # Default group_size for GPTQ/AWQ is 128 + # Infer dimensions from tensor shapes (vLLM standard format) WITHOUT + # assuming bits=4. This enables GPTQ W2/W4/W8 checkpoints. + if format == "gptq": + if is_gptq_marlin_ckpt: + # gptq_marlin export uses Marlin repacked qweight/scales layouts. + # Empirically (vLLM marlin): qweight is packed on K in tiles of 16, + # so qweight.shape[0] == in_features / 16; and scales carries original N. + out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) + in_features = int(qweight.shape[0]) * 16 + if ckpt_bits not in (4, 8): + print( + f"Warning: gptq_marlin requires bits=4/8, got bits={ckpt_bits} for {module_name}. Skipping." + ) + skipped += 1 + continue + # Keep pack_factor for dummy qzeros creation later. + pack_factor = 32 // int(ckpt_bits) + else: + # Standard GPTQ: qweight [K/pack, N] + out_features = int(qweight.shape[1]) + # qzeros: [K/group, N/pack] (may be empty for some checkpoints) + if getattr(qzeros, "numel", lambda: 1)() == 0: + if ckpt_bits not in (2, 4, 8): + print( + f"Warning: qzeros is empty and cannot infer bits for {module_name}. " + f"Please ensure quantize_config.json contains bits (2/4/8). Skipping." + ) + skipped += 1 + continue + pack_factor = 32 // int(ckpt_bits) + else: + if int(qzeros.shape[1]) <= 0 or out_features % int(qzeros.shape[1]) != 0: + print( + f"Warning: Cannot infer GPTQ pack_factor from qzeros for {module_name}: " + f"qzeros.shape={tuple(qzeros.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." 
+ ) + skipped += 1 + continue + pack_factor = out_features // int(qzeros.shape[1]) # 32 / bits + in_features = int(qweight.shape[0]) * pack_factor + else: + # awq: qweight: [K, N/pack], scales: [K/group, N] + out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) + if int(qweight.shape[1]) <= 0 or out_features % int(qweight.shape[1]) != 0: + print( + f"Warning: Cannot infer AWQ pack_factor from scales/qweight for {module_name}: " + f"scales.shape={tuple(scales.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." + ) + skipped += 1 + continue + pack_factor = out_features // int(qweight.shape[1]) # 32 / bits (expected 8 for AWQ 4-bit) + in_features = int(qweight.shape[0]) + + # Infer group_size from qzeros/scales. + # qzeros/scales are groupwise on K (in_features). group_size = 128 - # Infer group_size from scales/qzeros shape - num_groups = qzeros.shape[0] - if num_groups > 0: - estimated_group_size = (out_features + num_groups - 1) // num_groups - if estimated_group_size > 0: - group_size = estimated_group_size + if ckpt_group_size not in (0, None): + # quantize_config.json stores actual group_size (may be -1) + group_size = int(ckpt_group_size) + else: + if is_gptq_marlin_ckpt and len(scales.shape) == 2 and int(scales.shape[0]) > 0: + # marlin scales often use first dim = 2 * num_groups + num_groups = int(scales.shape[0]) // 2 + if num_groups > 0 and in_features % num_groups == 0: + group_size = in_features // num_groups + else: + num_groups = int(qzeros.shape[0]) if getattr(qzeros, "numel", lambda: 1)() > 0 else 0 + if num_groups > 0 and in_features % num_groups == 0: + group_size = in_features // num_groups + elif len(scales.shape) == 2 and int(scales.shape[0]) > 0 and in_features % int(scales.shape[0]) == 0: + group_size = in_features // int(scales.shape[0]) + + # For gptq_marlin checkpoints qzeros may be empty; create a shape-compatible dummy + # packed qzeros so LinearBase considers offline weights present. + if ( + format == "gptq" + and getattr(qzeros, "numel", lambda: 1)() == 0 + and (want_gptq_marlin or is_gptq_marlin_ckpt) + and ckpt_bits in (2, 4, 8) + ): + group_size_norm = in_features if group_size == -1 else group_size + if group_size_norm <= 0 or (in_features % group_size_norm) != 0: + print( + f"Warning: Invalid group_size={group_size} for {module_name} with in_features={in_features}. " + "Skipping." + ) + skipped += 1 + continue + num_groups = in_features // group_size_norm + try: + qzeros = _make_packed_qzeros_constant( + num_groups=num_groups, + out_features=out_features, + bits=int(ckpt_bits), + device=qweight.device, + ) + except Exception as e: + print(f"Warning: Failed to create dummy qzeros for {module_name}: {e}. Skipping.") + skipped += 1 + continue - # Handle tensor parallel: if tp_size > 1, we need to handle sharding - # For MVP, only support TP=1 (tensor_parallel_size=1) - tp_size = getattr(module, "tp_size", 1) + # Handle tensor parallel sharding (TP>1). + # ColumnParallelLinear: tp_dim=0 (shard N/out_features) + # RowParallelLinear : tp_dim=1 (shard K/in_features) + tp_size = int(getattr(module, "tp_size", 1) or 1) + tp_rank = int(getattr(module, "tp_rank", 0) or 0) + tp_dim = getattr(module, "tp_dim", None) if tp_size > 1: - print( - f"Warning: Tensor parallel (TP={tp_size}) is not fully supported for offline quantized weights. " - f"Skipping {module_name}. Please provide a TP=1 checkpoint or implement TP sharding logic." 
- ) - skipped += 1 - continue + if tp_dim not in (0, 1): + print( + f"Warning: Unsupported tp_dim={tp_dim} for offline quantized weights. " + f"Skipping {module_name}." + ) + skipped += 1 + continue + + # Shard along output features (N) for column-parallel modules. + if tp_dim == 0: + if out_features % tp_size != 0: + print( + f"Warning: out_features={out_features} not divisible by TP={tp_size} for {module_name}. " + "Skipping offline quant weights for this module." + ) + skipped += 1 + continue + out_per = out_features // tp_size + out_start = tp_rank * out_per + out_end = out_start + out_per + if out_per % pack_factor != 0: + print( + f"Warning: out_features_per_partition={out_per} not divisible by pack_factor={pack_factor} " + f"for {module_name}. Skipping." + ) + skipped += 1 + continue + out_packed_per = out_per // pack_factor + out_packed_start = out_start // pack_factor + out_packed_end = out_packed_start + out_packed_per + + if format == "gptq": + if is_gptq_marlin_ckpt: + # Marlin qweight packs N by a factor (bits/2): N_packed = N * (bits/2) + n_factor = int(ckpt_bits) // 2 + if n_factor <= 0: + print(f"Warning: invalid gptq_marlin n_factor for bits={ckpt_bits} ({module_name}). Skipping.") + skipped += 1 + continue + qweight = qweight[:, (out_start * n_factor):(out_end * n_factor)] + # scales keep original N + scales = scales[:, out_start:out_end] + # qzeros stays dummy/empty; g_idx stays on K. + out_features = out_per + else: + # qweight: [K/pack, N] + qweight = qweight[:, out_start:out_end] + # qzeros: [K/group, N/pack] + qzeros = qzeros[:, out_packed_start:out_packed_end] + # scales: [K/group, N] + scales = scales[:, out_start:out_end] + out_features = out_per + else: + # awq qweight: [K, N/pack] + qweight = qweight[:, out_packed_start:out_packed_end] + qzeros = qzeros[:, out_packed_start:out_packed_end] + scales = scales[:, out_start:out_end] + out_features = out_per + + # Shard along input features (K) for row-parallel modules. + elif tp_dim == 1: + if in_features % tp_size != 0: + print( + f"Warning: in_features={in_features} not divisible by TP={tp_size} for {module_name}. " + "Skipping offline quant weights for this module." + ) + skipped += 1 + continue + in_per = in_features // tp_size + in_start = tp_rank * in_per + in_end = in_start + in_per + if group_size <= 0 or (in_per % group_size) != 0 or (in_start % group_size) != 0: + print( + f"Warning: group_size={group_size} incompatible with TP sharding for {module_name} " + f"(in_per={in_per}, in_start={in_start}). Skipping." + ) + skipped += 1 + continue + g_start = in_start // group_size + g_end = in_end // group_size + + if format == "gptq": + if is_gptq_marlin_ckpt: + # Marlin qweight packs K in tiles of 16: K_packed = K / 16 + if in_start % 16 != 0: + print( + f"Warning: gptq_marlin requires in_start divisible by 16, got in_start={in_start} " + f"for {module_name}. Skipping." + ) + skipped += 1 + continue + q_start = in_start // 16 + q_end = in_end // 16 + qweight = qweight[q_start:q_end, :] + # scales first dim is typically 2*num_groups + scales = scales[(2 * g_start):(2 * g_end), :] + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() > 0: + g_idx = g_idx[in_start:in_end] + in_features = in_per + else: + # qweight: [K/pack, N] (packed on K) + if in_start % pack_factor != 0: + print( + f"Warning: in_start={in_start} not divisible by pack_factor={pack_factor} " + f"for {module_name}. Skipping." 
+ ) + skipped += 1 + continue + q_start = in_start // pack_factor + q_end = in_end // pack_factor + qweight = qweight[q_start:q_end, :] + qzeros = qzeros[g_start:g_end, :] + scales = scales[g_start:g_end, :] + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() > 0: + g_idx = g_idx[in_start:in_end] + in_features = in_per + else: + # awq qweight: [K, N/pack] + qweight = qweight[in_start:in_end, :] + qzeros = qzeros[g_start:g_end, :] + scales = scales[g_start:g_end, :] + in_features = in_per + # Treat empty g_idx as "not provided" for GPTQ (desc_act=False checkpoints often store empty). + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() == 0: + g_idx = None + # Set offline quantized weight try: - module.set_offline_quantized_weight( - format=format, - qweight=qweight, - qzeros=qzeros, - scales=scales, - out_features=out_features, - in_features=in_features, - group_size=group_size, - g_idx=g_idx, - ) + if format == "gptq" and is_gptq_marlin_ckpt: + if ckpt_bits not in (4, 8): + raise ValueError(f"gptq_marlin checkpoint requires bits=4/8, got bits={ckpt_bits}") + _set_offline_gptq_marlin_weight( + module, + qweight=qweight, + scales=scales, + out_features=out_features, + in_features=in_features, + group_size=group_size, + bits=int(ckpt_bits), + g_idx=g_idx, + ) + else: + module.set_offline_quantized_weight( + format=format, + qweight=qweight, + qzeros=qzeros, + scales=scales, + out_features=out_features, + in_features=in_features, + group_size=group_size, + g_idx=g_idx, + ) if format == "gptq": loaded_gptq += 1 else: diff --git a/diffulex/utils/quantization/quantize_model.py b/diffulex/utils/quantization/quantize_model.py index b82710f..bd77977 100644 --- a/diffulex/utils/quantization/quantize_model.py +++ b/diffulex/utils/quantization/quantize_model.py @@ -1,15 +1,16 @@ #!/usr/bin/env python3 -"""离线量化脚本:将模型权重量化为 GPTQ/AWQ 格式 +"""离线量化脚本:将模型权重量化为 vLLM 标准 GPTQ/AWQ 格式 -支持两种量化格式: -- GPTQ: Groupwise quantization with optional g_idx -- AWQ: Groupwise quantization (no g_idx) +支持两种量化格式(对齐 vLLM 权重格式): +- GPTQ: qweight/qzeros 为 int32 packed,scales 为 fp16,g_idx 可选(常见 desc_act=False 时为空) +- GPTQ_MARLIN: 导出 Marlin-ready 的 GPTQ 权重布局(qweight 已 repack,scales 已 permute,zp 为空) +- AWQ : qweight/qzeros 为 int32 packed,scales 为 fp16 使用方法: python -m diffulex.utils.quantization.quantize_model \ --model-path /path/to/model \ --output-path /path/to/output \ - --quant-format gptq \ + --quant-format gptq_marlin \ --group-size 128 \ --bits 4 """ @@ -41,193 +42,179 @@ from glob import glob -def _pack_int4_to_int8(int4_tensor: torch.Tensor) -> torch.Tensor: - """Pack int4 tensor into int8 format. 
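# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper `pack_rows_int32`; the numbers are
# examples only): the removed `_pack_int4_to_int8` helper packed two int4
# nibbles per int8 byte along the in_features columns of a [N, K] weight,
# whereas the vLLM layout targeted here packs 32 // bits consecutive K rows
# into one int32 per output column, which is what `gptq_pack` is expected to
# produce.
import torch

def pack_rows_int32(q: torch.Tensor, bits: int) -> torch.Tensor:
    """q: [K, N] integer codes in [0, 2**bits); returns int32 [K // (32 // bits), N]."""
    pack = 32 // bits
    k, n = q.shape
    assert 32 % bits == 0 and k % pack == 0
    q = q.to(torch.int32).reshape(k // pack, pack, n)
    out = torch.zeros(k // pack, n, dtype=torch.int32)
    for j in range(pack):
        out |= q[:, j, :] << (bits * j)  # first row lands in the least-significant bits
    return out

codes = torch.randint(0, 16, (16, 4))           # 4-bit codes, K=16, N=4
assert pack_rows_int32(codes, bits=4).shape == (2, 4)
# ---------------------------------------------------------------------------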
- - Args: - int4_tensor: int8 tensor [N, K] with values in [-8, 7] - - Returns: - packed: int8 tensor [N, (K + 1) // 2] with 2 int4 values per byte - """ - out_features, in_features = int4_tensor.shape - - # Clamp to int4 range [-8, 7] - int4_tensor = int4_tensor.clamp(-8, 7) - - # Convert to unsigned: [-8, 7] -> [0, 15] - uint8_tensor = (int4_tensor + 8).to(torch.uint8) - - # Pad to even number of columns if needed - if in_features % 2 != 0: - pad_size = 1 - padding = torch.zeros(out_features, pad_size, dtype=torch.uint8, device=uint8_tensor.device) + 8 - uint8_tensor = torch.cat([uint8_tensor, padding], dim=1) - padded_in_features = in_features + pad_size - else: - padded_in_features = in_features - - # Reshape to [N, K//2, 2] where first column is even indices, second is odd indices - reshaped = uint8_tensor.view(out_features, padded_in_features // 2, 2) - - # Pack: lower 4 bits = even columns, upper 4 bits = odd columns - packed = reshaped[:, :, 0] | (reshaped[:, :, 1] << 4) - return packed.to(torch.int8) +def _require_vllm(): + try: + from vllm.scalar_type import scalar_types # type: ignore + from vllm.model_executor.layers.quantization.utils.quant_utils import ( # type: ignore + awq_pack, + gptq_pack, + pack_cols, + quantize_weights, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "离线 GPTQ/AWQ 打包已切换到 vLLM 标准格式,需要可 import 的 vLLM。" + ) from e + return scalar_types, quantize_weights, gptq_pack, awq_pack, pack_cols -def _quantize_gptq_groupwise( - weight: torch.Tensor, - group_size: int = 128, - bits: int = 4, - g_idx: Optional[torch.Tensor] = None, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Quantize weight using GPTQ groupwise quantization. - - Args: - weight: float32 tensor [out_features, in_features] - group_size: Group size for quantization (default: 128) - bits: Number of bits per weight (default: 4) - g_idx: Optional int32 tensor [out_features] mapping each output channel to its group. - If None, uses sequential grouping: group_id = out_idx // group_size - - Returns: - qweight: int8 packed int4 weights [out_features, (in_features + 1) // 2] - qzeros: int8 packed int4 zeros [num_groups, (in_features + 1) // 2] - scales: float32 per-group scales [num_groups, in_features] - g_idx: int32 tensor [out_features] group indices (always returned, even if input was None) +def _require_vllm_marlin(): + # Marlin 预处理依赖 CUDA custom ops + try: + from vllm import _custom_ops as ops # type: ignore + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_permute_scales, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "导出 gptq_marlin 格式需要可 import 的 vLLM Marlin(含 CUDA custom ops)。" + ) from e + return ops, marlin_permute_scales + + +def _quantize_to_vllm_gptq( + weight: torch.Tensor, *, group_size: int, bits: int, use_v2_format: bool = False +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Quantize and pack weights into vLLM GPTQ checkpoint format. 
+ + Input: + weight: fp32 [N, K] (PyTorch Linear weight) + Output (vLLM format): + qweight: int32 [K/pack, N] + qzeros : int32 [K/group, N/pack] (GPTQ v1 stores (zeros - 1); v2 stores zeros) + scales : fp16 [K/group, N] + g_idx : int32 empty tensor (desc_act=False) """ - out_features, in_features = weight.shape - device = weight.device - - # Determine group assignments - if g_idx is None: - # Sequential grouping: group_id = out_idx // group_size - group_ids = torch.arange(out_features, device=device) // group_size - else: - # Use provided g_idx - if g_idx.shape != (out_features,): - raise ValueError(f"g_idx shape mismatch: got {g_idx.shape}, expected ({out_features},)") - group_ids = g_idx.to(device=device).to(torch.int64) - - num_groups = int(group_ids.max().item() + 1) - - # Quantize per group - qweight_list = [] - qzeros_list = [] - scales_list = [] - - for g in range(num_groups): - # Get output channels in this group - group_mask = (group_ids == g) - group_indices = torch.where(group_mask)[0] - - if len(group_indices) == 0: - continue - - group_weight = weight[group_indices] # [group_out_size, in_features] - group_out_size = group_weight.shape[0] - - # Compute scale and zero point per input feature (per-channel within group) - # For GPTQ, we use per-channel quantization within each group - abs_max = torch.abs(group_weight).max(dim=0, keepdim=True)[0] # [1, in_features] - scales_group = (abs_max.clamp(min=1e-8) / (2 ** (bits - 1) - 1)).squeeze(0) # [in_features] - - # Compute zero point: mean of group (per-channel) - zeros_group = group_weight.mean(dim=0) # [in_features] - - # Quantize: (weight - zero) / scale - quantized_group = ((group_weight - zeros_group.unsqueeze(0)) / scales_group.unsqueeze(0).clamp(min=1e-8)) - quantized_group = quantized_group.round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - - # Pack quantized weights - packed_group = _pack_int4_to_int8(quantized_group) # [group_out_size, (in_features + 1) // 2] - qweight_list.append(packed_group) - - # Quantize and pack zeros - zeros_quantized = (zeros_group / scales_group.clamp(min=1e-8)).round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - zeros_packed = _pack_int4_to_int8(zeros_quantized.unsqueeze(0)) # [1, (in_features + 1) // 2] - qzeros_list.append(zeros_packed) - - # Store scales - scales_list.append(scales_group.unsqueeze(0)) # [1, in_features] - - # Concatenate all groups - qweight = torch.cat(qweight_list, dim=0) # [out_features, (in_features + 1) // 2] - qzeros = torch.cat(qzeros_list, dim=0) # [num_groups, (in_features + 1) // 2] - scales = torch.cat(scales_list, dim=0) # [num_groups, in_features] - - # Ensure g_idx is returned (create if was None) - if g_idx is None: - g_idx = group_ids.to(torch.int32) - else: - g_idx = g_idx.to(torch.int32) - + scalar_types, quantize_weights, gptq_pack, _, pack_cols = _require_vllm() + # vLLM GPTQConfig mentions 2/3/4/8, but the standard vLLM int32 packing + # used by `gptq_pack/pack_cols` requires 32 % bits == 0. + # So we support 2/4/8 here; 3-bit would need a different packing scheme. 
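    # ------------------------------------------------------------------
    # Worked shape example (illustrative numbers only): with K = in_features,
    # N = out_features and pack = 32 // bits, the tensors documented above
    # come out as
    #   qweight [K / pack, N], qzeros [K / group, N / pack], scales [K / group, N].
    # E.g. bits=4 (pack=8), K=4096, N=11008, group_size=128:
    #   qweight (512, 11008), qzeros (32, 1376), scales (32, 11008).
    # ------------------------------------------------------------------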
+ if bits not in (2, 4, 8): + raise ValueError( + f"GPTQ bits 仅支持 2/4/8(vLLM 标准 int32 pack 要求 32%bits==0),当前 bits={bits}" + ) + + # vLLM operates on (K, N) + w = weight.T.contiguous() + size_k, size_n = w.shape + group_size_norm = size_k if group_size == -1 else group_size + if group_size_norm <= 0 or size_k % group_size_norm != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={size_k}") + + if bits == 2: + quant_type = scalar_types.uint2b2 + elif bits == 4: + quant_type = scalar_types.uint4b8 + else: # bits == 8 + quant_type = scalar_types.uint8b128 + + _, w_q, w_s, _ = quantize_weights(w, quant_type, group_size_norm, zero_points=False) + + pack_factor = 32 // bits + qweight = gptq_pack(w_q, bits, size_k, size_n).contiguous() # [K/pack, N] + + num_groups = size_k // group_size_norm + zeros = torch.full( + (num_groups, size_n), + int(getattr(quant_type, "bias", 0)), + dtype=torch.int32, + device=w.device, + ) + # GPTQ v1 stores zeros-1 in the checkpoint. + zeros_to_store = zeros if use_v2_format else (zeros - 1) + qzeros = pack_cols(zeros_to_store, bits, num_groups, size_n).contiguous() # [K/group, N/pack] + + scales = w_s.to(torch.float16).contiguous() # [K/group, N] + g_idx = torch.empty((0,), dtype=torch.int32, device=w.device) return qweight, qzeros, scales, g_idx -def _quantize_awq_groupwise( - weight: torch.Tensor, - group_size: int = 128, - bits: int = 4, +def _quantize_to_vllm_gptq_marlin( + weight: torch.Tensor, *, group_size: int, bits: int +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Quantize weights and export marlin-ready GPTQ layout. + + 该导出格式对齐 vLLM `MarlinLinearKernel.process_weights_after_loading` 的结果: + - qweight: 已执行 `gptq_marlin_repack` + - scales : 已执行 `marlin_permute_scales` + - qzeros : 置空(Marlin GPTQ symmetric 路径不使用 runtime zp) + - g_idx : 空(desc_act=False) + + 注意:需要在 CUDA 上执行(`gptq_marlin_repack` 为 CUDA op)。 + """ + if weight.device.type != "cuda": + raise ValueError("gptq_marlin 导出需要 device=cuda(Marlin repack 为 CUDA op)") + + ops, marlin_permute_scales = _require_vllm_marlin() + + # 先按 vLLM 标准 GPTQ(symmetric, zero_points=False)量化并打包 + qweight, _qzeros, scales, g_idx = _quantize_to_vllm_gptq( + weight, group_size=group_size, bits=bits, use_v2_format=False + ) + + # vLLM GPTQ packing 的 shape 基于 w=(K,N);这里 size_k=in_features, size_n=out_features + size_k = weight.shape[1] + size_n = weight.shape[0] + group_size_norm = size_k if group_size == -1 else group_size + + # desc_act=False 时 perm 为空 + empty_perm = torch.empty((0,), dtype=torch.int32, device=weight.device) + + marlin_qweight = ops.gptq_marlin_repack( + qweight.contiguous(), + perm=empty_perm, + size_k=size_k, + size_n=size_n, + num_bits=bits, + is_a_8bit=False, + ).contiguous() + + marlin_scales = marlin_permute_scales( + scales.contiguous(), + size_k=size_k, + size_n=size_n, + group_size=group_size_norm, + is_a_8bit=False, + ).contiguous() + + # Marlin GPTQ symmetric 不使用 runtime zero points,导出空 qzeros 保持一致性 + marlin_qzeros = torch.empty((0,), dtype=torch.int32, device=weight.device) + marlin_g_idx = g_idx # already empty + + return marlin_qweight, marlin_qzeros, marlin_scales, marlin_g_idx + + +def _quantize_to_vllm_awq( + weight: torch.Tensor, *, group_size: int, bits: int ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Quantize weight using AWQ groupwise quantization. 
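# ---------------------------------------------------------------------------
# Shape sketch for the marlin-ready GPTQ layout (hypothetical helper; the
# tiling facts mirror the assumptions made by the loader changes in this
# patch: Marlin packs K in tiles of 16 and widens the packed N dimension by
# bits / 2).
def marlin_qweight_shape(in_features: int, out_features: int, bits: int) -> tuple[int, int]:
    assert bits in (4, 8) and in_features % 16 == 0
    return (in_features // 16, out_features * (bits // 2))

# Plain GPTQ 4-bit qweight for K=N=4096 is [512, 4096]; after repack it becomes:
assert marlin_qweight_shape(4096, 4096, 4) == (256, 8192)
assert marlin_qweight_shape(4096, 4096, 8) == (256, 16384)
# ---------------------------------------------------------------------------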
- - Args: - weight: float32 tensor [out_features, in_features] - group_size: Group size for quantization (default: 128) - bits: Number of bits per weight (default: 4) - - Returns: - qweight: int8 packed int4 weights [out_features, (in_features + 1) // 2] - qzeros: int8 packed int4 zeros [num_groups, (in_features + 1) // 2] - scales: float32 per-group scales [num_groups, in_features] or [num_groups] + """Quantize and pack weights into vLLM AWQ checkpoint format. + + Input: + weight: fp32 [N, K] + Output (vLLM format): + qweight: int32 [K, N/pack] + qzeros : int32 [K/group, N/pack] + scales : fp16 [K/group, N] """ - out_features, in_features = weight.shape - device = weight.device - - num_groups = (out_features + group_size - 1) // group_size - - # Quantize per group (sequential grouping) - qweight_list = [] - qzeros_list = [] - scales_list = [] - - for g in range(num_groups): - start_idx = g * group_size - end_idx = min((g + 1) * group_size, out_features) - group_weight = weight[start_idx:end_idx] # [group_size (or remainder), in_features] - group_out_size = group_weight.shape[0] - - # AWQ: Compute scale per group (can be scalar or per-channel) - # For simplicity, use per-channel scales within group - abs_max = torch.abs(group_weight).max(dim=0, keepdim=True)[0] # [1, in_features] - scales_group = (abs_max.clamp(min=1e-8) / (2 ** (bits - 1) - 1)).squeeze(0) # [in_features] - - # AWQ: Compute zero point per input channel (per-channel) - # Use minimum value for better quantization range - zeros_group = group_weight.min(dim=0)[0] # [in_features] - - # Quantize: (weight - zero) / scale - quantized_group = ((group_weight - zeros_group.unsqueeze(0)) / scales_group.unsqueeze(0).clamp(min=1e-8)) - quantized_group = quantized_group.round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - - # Pack quantized weights - packed_group = _pack_int4_to_int8(quantized_group) # [group_out_size, (in_features + 1) // 2] - qweight_list.append(packed_group) - - # Quantize and pack zeros - zeros_quantized = (zeros_group / scales_group.clamp(min=1e-8)).round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - zeros_packed = _pack_int4_to_int8(zeros_quantized.unsqueeze(0)) # [1, (in_features + 1) // 2] - qzeros_list.append(zeros_packed) - - # Store scales - scales_list.append(scales_group.unsqueeze(0)) # [1, in_features] - - # Concatenate all groups - qweight = torch.cat(qweight_list, dim=0) # [out_features, (in_features + 1) // 2] - qzeros = torch.cat(qzeros_list, dim=0) # [num_groups, (in_features + 1) // 2] - scales = torch.cat(scales_list, dim=0) # [num_groups, in_features] - + scalar_types, quantize_weights, _, awq_pack, _ = _require_vllm() + if bits != 4: + raise ValueError(f"AWQ 目前仅支持 4-bit,当前 bits={bits}") + + w = weight.T.contiguous() + size_k, size_n = w.shape + group_size_norm = size_k if group_size == -1 else group_size + if group_size_norm <= 0 or size_k % group_size_norm != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={size_k}") + + quant_type = scalar_types.uint4 + _, w_q, w_s, w_zp = quantize_weights(w, quant_type, group_size_norm, zero_points=True) + if w_zp is None: + raise RuntimeError("AWQ zero_points=True 但未生成 zero points,vLLM 量化返回异常。") + + qweight = awq_pack(w_q, bits, size_k, size_n).contiguous() # [K, N/pack] + num_groups = size_k // group_size_norm + qzeros = awq_pack(w_zp.to(torch.int32), bits, num_groups, size_n).contiguous() # [K/group, N/pack] + scales = w_s.to(torch.float16).contiguous() # [K/group, N] return qweight, qzeros, scales @@ 
-252,8 +239,10 @@ def quantize_model( If None, quantizes all linear layers. device: Device to use for quantization ("cpu" or "cuda") """ - if quant_format not in ["gptq", "awq"]: - raise ValueError(f"Unsupported quant_format: {quant_format}. Must be 'gptq' or 'awq'") + if quant_format not in ["gptq", "gptq_marlin", "awq"]: + raise ValueError( + f"Unsupported quant_format: {quant_format}. Must be 'gptq', 'gptq_marlin' or 'awq'" + ) output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) @@ -327,29 +316,27 @@ def quantize_model( weight_fp32 = weight.to(torch.float32).to(device) # Quantize + prefix = key[:-7] # Remove ".weight" if quant_format == "gptq": - qweight, qzeros, scales, g_idx = _quantize_gptq_groupwise( - weight_fp32, group_size=group_size, bits=bits, g_idx=None + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq( + weight_fp32, group_size=group_size, bits=bits, use_v2_format=False + ) + elif quant_format == "gptq_marlin": + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq_marlin( + weight_fp32, group_size=group_size, bits=bits ) - # Save quantized weights with module prefix - prefix = key[:-7] # Remove ".weight" quantized_weights[f"{prefix}.qweight"] = qweight.cpu() quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() quantized_weights[f"{prefix}.scales"] = scales.cpu() + # Keep g_idx key for compatibility (often empty when desc_act=False). quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() - quantized_weights[f"{prefix}.group_size"] = torch.tensor(group_size, dtype=torch.int32) - quantized_weights[f"{prefix}.bits"] = torch.tensor(bits, dtype=torch.int32) else: # awq - qweight, qzeros, scales = _quantize_awq_groupwise( + qweight, qzeros, scales = _quantize_to_vllm_awq( weight_fp32, group_size=group_size, bits=bits ) - # Save quantized weights with module prefix - prefix = key[:-7] # Remove ".weight" quantized_weights[f"{prefix}.qweight"] = qweight.cpu() quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() quantized_weights[f"{prefix}.scales"] = scales.cpu() - quantized_weights[f"{prefix}.group_size"] = torch.tensor(group_size, dtype=torch.int32) - quantized_weights[f"{prefix}.bits"] = torch.tensor(bits, dtype=torch.int32) metadata["quantized_modules"].append({ "name": prefix, @@ -391,6 +378,20 @@ def quantize_model( metadata_file = output_path / f"quantization_metadata_{quant_format}.json" with open(metadata_file, "w") as f: json.dump(metadata, f, indent=2) + + # vLLM GPTQ/GPTQ-Marlin 会读取 quantize_config.json + # - gptq_marlin: 需要 sym/desc_act 等字段用于识别并选择 Marlin kernel + if quant_format == "gptq_marlin": + quantize_cfg = { + "bits": int(bits), + "group_size": int(group_size), + "desc_act": False, + "sym": True, + "lm_head": False, + "checkpoint_format": "gptq_marlin", + } + with open(output_path / "quantize_config.json", "w") as f: + json.dump(quantize_cfg, f, indent=2) print(f"\n✓ Quantization complete!") print(f" - Quantized {len(metadata['quantized_modules'])} modules") @@ -408,7 +409,13 @@ def main(): ) parser.add_argument("--model-path", type=str, required=True, help="输入模型路径") parser.add_argument("--output-path", type=str, required=True, help="输出路径") - parser.add_argument("--quant-format", type=str, choices=["gptq", "awq"], default="gptq", help="量化格式: gptq 或 awq") + parser.add_argument( + "--quant-format", + type=str, + choices=["gptq", "gptq_marlin", "awq"], + default="gptq", + help="量化格式: gptq / gptq_marlin / awq", + ) parser.add_argument("--group-size", type=int, default=128, help="量化组大小 (默认: 128)") parser.add_argument("--bits", type=int, 
default=4, help="每个权重的位数 (默认: 4)") parser.add_argument("--target-modules", type=str, help="要量化的模块名称模式(逗号分隔),例如: q_proj,k_proj,v_proj") diff --git a/diffulex/utils/quantization/registry.py b/diffulex/utils/quantization/registry.py index eec11ea..0b7be63 100644 --- a/diffulex/utils/quantization/registry.py +++ b/diffulex/utils/quantization/registry.py @@ -84,10 +84,14 @@ def _normalize_linear_dtype(dtype: str) -> str: "e5m2": "fp8_e5m2", # Weight-only methods (placeholders) "gptq": "gptq", + "gptq_marlin": "gptq_marlin", + "gptq_marlin_24": "gptq_marlin_24", "awq": "awq", + "awq_marlin": "awq_marlin", "gptq_awq": "gptq_awq", - # vLLM-style fused W8A16 path (Diffulex vendored): user-facing alias "marlin" - # Normalized key is "marlin_int8" to avoid conflating with other quant methods. + # vLLM-style fused W8A16 path (AllSpark): keep user-facing alias "marlin" + # for backward compatibility. Normalized key is "marlin_int8" to avoid + # conflating with other quant methods. "marlin": "marlin_int8", "marlin_int8": "marlin_int8", } @@ -150,6 +154,19 @@ def create_linear_strategy(*, weight_dtype: str, act_dtype: str) -> LinearQuanti def registered_linear_dtypes() -> list[str]: """Return the normalized dtype/method names accepted by `_normalize_linear_dtype`.""" # Keep this list stable for CLI/help messages. - return ["bf16", "int8", "int4", "fp8_e4m3", "fp8_e5m2", "gptq", "awq", "gptq_awq", "marlin_int8"] + return [ + "bf16", + "int8", + "int4", + "fp8_e4m3", + "fp8_e5m2", + "gptq", + "gptq_marlin", + "gptq_marlin_24", + "awq", + "awq_marlin", + "gptq_awq", + "marlin_int8", + ] diff --git a/diffulex/utils/quantization/strategies/__init__.py b/diffulex/utils/quantization/strategies/__init__.py index d7cd5c1..1fcc216 100644 --- a/diffulex/utils/quantization/strategies/__init__.py +++ b/diffulex/utils/quantization/strategies/__init__.py @@ -8,14 +8,15 @@ from diffulex.utils.quantization.strategies.linear_bf16 import LinearBF16Strategy from diffulex.utils.quantization.strategies.linear_stub import LinearStubStrategy from diffulex.utils.quantization.strategies.linear_int8_w8a16 import LinearInt8W8A16Strategy # noqa: F401 -from diffulex.utils.quantization.strategies.linear_marlin_int8_w8a16 import LinearMarlinInt8W8A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_int4_w4a16 import LinearInt4W4A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_int8_w8a8 import LinearInt8W8A8Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_int4_w4a8 import LinearInt4W4A8Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_fp8_w8a16 import LinearFP8W8A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_fp8_w8a8 import LinearFP8W8A8Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_gptq_w4a16 import LinearGPTQW4A16Strategy # noqa: F401 +from diffulex.utils.quantization.strategies.linear_gptq_marlin_w4a16 import LinearGPTQMarlinW4A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_awq_w4a16 import LinearAWQW4A16Strategy # noqa: F401 +from diffulex.utils.quantization.strategies.linear_awq_marlin_w4a16 import LinearAWQMarlinW4A16Strategy # noqa: F401 __all__ = [ 'NoQuantizationStrategy', @@ -24,13 +25,14 @@ 'LinearBF16Strategy', 'LinearStubStrategy', 'LinearInt8W8A16Strategy', - 'LinearMarlinInt8W8A16Strategy', 'LinearInt4W4A16Strategy', 'LinearInt8W8A8Strategy', 'LinearInt4W4A8Strategy', 'LinearFP8W8A16Strategy', 'LinearFP8W8A8Strategy', 'LinearGPTQW4A16Strategy', 
+ 'LinearGPTQMarlinW4A16Strategy', 'LinearAWQW4A16Strategy', + 'LinearAWQMarlinW4A16Strategy', ] diff --git a/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py new file mode 100644 index 0000000..be9389f --- /dev/null +++ b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py @@ -0,0 +1,123 @@ +""" +AWQ Marlin (W4, A16) Linear strategy using vLLM Marlin CUDA kernels. + +- Input activations: bf16 (cast to fp16 for vLLM marlin kernel) +- Weights: offline AWQ vLLM standard format (qweight/qzeros/scales) +- One-time repack/permutation is performed by Diffulex `LinearBase` and passed in via kwargs: + - awq_marlin_qweight / awq_marlin_scales / awq_marlin_zp + - awq_marlin_workspace +""" + +from __future__ import annotations + +from typing import Any, Optional + +import torch + +from diffulex.utils.quantization.registry import register_linear_strategy +from diffulex.utils.quantization.strategy import LinearQuantizationStrategy + +try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + apply_awq_marlin_linear, + marlin_make_empty_g_idx, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore +except Exception: # pragma: no cover + apply_awq_marlin_linear = None # type: ignore + marlin_make_empty_g_idx = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + +@register_linear_strategy(weight_dtype="awq_marlin", act_dtype="bf16") +def _build_linear_awq_marlin_w4a16() -> LinearQuantizationStrategy: + return LinearAWQMarlinW4A16Strategy() + + +class LinearAWQMarlinW4A16Strategy(LinearQuantizationStrategy): + @property + def name(self) -> str: + return "linear_awq_marlin_w4a16" + + @property + def linear_weight_format(self) -> str: + return "awq_marlin" + + @property + def linear_act_format(self) -> str: + return "bf16" + + def get_storage_dtype(self) -> tuple[torch.dtype, int]: + return torch.int32, 4 + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: + # Same as AWQ: [K/group, N] + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized + raise NotImplementedError("AWQ Marlin 不提供 Python dequantize;请使用 vLLM Marlin CUDA kernel。") + + def linear_forward( + self, + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + *, + quant_kind: str, + **kwargs: Any, + ) -> torch.Tensor: + _ = quant_kind, weight + if apply_awq_marlin_linear is None or scalar_types is None: + raise RuntimeError("awq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") + + qweight = kwargs.get("awq_marlin_qweight", None) + scales = kwargs.get("awq_marlin_scales", None) + zp = kwargs.get("awq_marlin_zp", None) + workspace = kwargs.get("awq_marlin_workspace", None) + in_features = 
int(kwargs.get("in_features", 0)) + out_features = int(kwargs.get("out_features", 0)) + + if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: + raise RuntimeError("awq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") + + # vLLM marlin kernels expect FP16 activations. + x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + + # AWQ marlin does not use g_idx. + empty = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + out = apply_awq_marlin_linear( + input=x_in, + weight=qweight, + weight_scale=scales, + weight_zp=zp, + g_idx=empty, + g_idx_sort_indices=empty, + workspace=workspace, + quant_type=scalar_types.uint4, + output_size_per_partition=out_features, + input_size_per_partition=in_features, + bias=marlin_bias, + input_dtype=None, + ) + return out.to(dtype=x.dtype) if out.dtype != x.dtype else out + diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 4d314a1..488176e 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -1,11 +1,11 @@ """ -AWQ W4A16 Linear quantization strategy (AWQ weight + bf16 activation). +AWQ W4A16 Linear quantization strategy (vLLM standard format). -Implementation notes: -- Weight quantization: AWQ format with groupwise quantization -- Activation: kept as bf16 (no activation quantization) -- Storage: AWQ uses packed int4 weights (qweight), int4 zeros (qzeros), and per-group scales -- Forward path: Dequantize AWQ weights to bf16, then use F.linear +- Weight format: vLLM AWQ (packed int32 qweight/qzeros + fp16 scales) +- Activation: bf16 (no activation quantization) +- Forward: vLLM custom op `awq_gemm` (with the same heuristic as vLLM) + +No TileLang dependency. """ from __future__ import annotations @@ -18,161 +18,10 @@ from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -_TILELANG_AVAILABLE = False -try: - from diffulex_kernel.python.linear_kernels import awq_w4a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - awq_w4a16_gemm = None - try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - - -def _unpack_awq_int4( - packed: torch.Tensor, - *, - out_features: int, - in_features: int, -) -> torch.Tensor: - """Unpack AWQ packed int4 weights into int8 values. 
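# ---------------------------------------------------------------------------
# Shape sketch for the vLLM AWQ layout used in this file (hypothetical helper;
# the example numbers are illustrative): for bits=4 (pack=8) the checkpoint
# stores
#   qweight [K, N/8] int32, qzeros [K/group, N/8] int32, scales [K/group, N] fp16,
# so the pack factor can be recovered as scales.shape[1] // qweight.shape[1].
def awq_pack_factor(scales_shape: tuple[int, int], qweight_shape: tuple[int, int]) -> int:
    n, n_packed = scales_shape[1], qweight_shape[1]
    assert n_packed > 0 and n % n_packed == 0
    return n // n_packed

# K=4096, N=11008, group_size=128 -> scales (32, 11008), qweight (4096, 1376)
assert awq_pack_factor((32, 11008), (4096, 1376)) == 8
# ---------------------------------------------------------------------------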
- - AWQ packs 2 int4 values per int8 byte: - - Lower 4 bits: even columns - - Upper 4 bits: odd columns - - Args: - packed: int8 tensor [out_features, (in_features + 1) // 2] - out_features: Original output features - in_features: Original input features - - Returns: - unpacked: int8 tensor [out_features, in_features] with values in [-8, 7] - """ - if packed.dtype != torch.int8: - raise TypeError(f"packed weight must be int8, got {packed.dtype}") - - out_features_actual, packed_in = packed.shape - expected_packed_in = (in_features + 1) // 2 - if packed_in != expected_packed_in: - raise ValueError( - f"Packed input dimension mismatch: got {packed_in}, " - f"expected {expected_packed_in} for in_features={in_features}" - ) - if out_features_actual != out_features: - raise ValueError( - f"Output dimension mismatch: got {out_features_actual}, " - f"expected {out_features}" - ) - - # Interpret bytes as uint8 for bit manipulation - p_u8 = packed.view(torch.uint8) - # Extract lower and upper 4 bits - low_u8 = (p_u8 & 0x0F) # [0..15] - high_u8 = ((p_u8 >> 4) & 0x0F) # [0..15] - - # Convert unsigned nibble [0..15] to signed int4 [-8..7] - # Packing: int4 [-8, 7] + 8 -> uint8 [0, 15] - # Unpacking: uint8 [0, 15] - 8 -> int4 [-8, 7] - low_s = low_u8.to(torch.int16) - 8 - high_s = high_u8.to(torch.int16) - 8 - - # Interleave low/high along in_features - unpacked = torch.empty((out_features, packed_in * 2), device=packed.device, dtype=torch.int16) - unpacked[:, 0::2] = low_s - unpacked[:, 1::2] = high_s - unpacked = unpacked[:, :in_features].to(torch.int8) - return unpacked - - -def _dequantize_awq( - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - *, - out_features: int, - in_features: int, - group_size: int = 128, -) -> torch.Tensor: - """Dequantize AWQ weights to bf16. 
- - AWQ uses groupwise quantization: - - Weight is quantized per group (group_size consecutive output channels) - - Each group has its own scale and zero point - - AWQ does not use g_idx (sequential grouping) - - Args: - qweight: int8 tensor [out_features, (in_features + 1) // 2] packed int4 - qzeros: int8 tensor [(out_features + group_size - 1) // group_size, (in_features + 1) // 2] packed int4 - scales: float32 tensor [(out_features + group_size - 1) // group_size, in_features] or [num_groups] - out_features: Output features - in_features: Input features - group_size: Group size for quantization (default: 128) - - Returns: - dequantized: bf16 tensor [out_features, in_features] - """ - device = qweight.device - - # Unpack qweight to int8 [out_features, in_features] - w_int8 = _unpack_awq_int4(qweight, out_features=out_features, in_features=in_features) - - # Unpack qzeros to int8 [num_groups, in_features] - num_groups = (out_features + group_size - 1) // group_size - if qzeros.shape[0] != num_groups: - raise ValueError( - f"qzeros shape mismatch: got {qzeros.shape[0]} groups, " - f"expected {num_groups} for out_features={out_features}, group_size={group_size}" - ) - zeros_int8 = _unpack_awq_int4(qzeros, out_features=num_groups, in_features=in_features) - - # Ensure scales have correct shape [num_groups, in_features] - if scales.shape == (num_groups,): - # Broadcast per-group scales to all input features - scales = scales.unsqueeze(-1).expand(num_groups, in_features) # [num_groups, in_features] - elif scales.shape == (num_groups, 1): - scales = scales.expand(num_groups, in_features) # [num_groups, in_features] - elif scales.shape != (num_groups, in_features): - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}), ({num_groups},), or ({num_groups}, 1)" - ) - - # Convert to float32 for dequantization - w_fp32 = w_int8.to(torch.float32) - zeros_int8_fp32 = zeros_int8.to(torch.float32) # Quantized zeros (int8) - scales_fp32 = scales.to(torch.float32) - - # Dequantize zeros: zero = zero_quantized * scale - # zeros_int8 was quantized as: zero_quantized = round(zero / scale) - # So to recover: zero = zero_quantized * scale - zeros_fp32 = zeros_int8_fp32 * scales_fp32 # [num_groups, in_features] - - # Dequantize: (weight - zero) * scale - # AWQ uses sequential grouping: group_id = out_idx // group_size - group_ids = torch.arange(out_features, device=device) // group_size # [out_features] - group_ids = group_ids.unsqueeze(-1) # [out_features, 1] - - # Gather zeros and scales for each output channel - zeros_for_channel = torch.gather( - zeros_fp32, 0, group_ids.expand(-1, in_features) - ) # [out_features, in_features] - scales_for_channel = torch.gather( - scales_fp32, 0, group_ids.expand(-1, in_features) - ) # [out_features, in_features] - - # Dequantize: quantized * scale + zero - # Quantization formula: quantized = round((weight - zero) / scale) - # Dequantization formula: weight = quantized * scale + zero - dequantized = w_fp32 * scales_for_channel + zeros_for_channel - return dequantized.to(torch.bfloat16) + from vllm import _custom_ops as ops # type: ignore +except Exception: # pragma: no cover + ops = None # type: ignore @register_linear_strategy(weight_dtype="awq", act_dtype="bf16") @@ -181,21 +30,6 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): - """AWQ W4A16 Linear strategy: AWQ weight quantization + bf16 activation. 
- - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: AWQ format with groupwise quantization (typically group_size=128). - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Dequantized weights are cached to avoid re-dequantizing on every forward pass. - """ - - def __init__(self): - """Initialize strategy (no cache needed when using kernel).""" - super().__init__() - # TileLang autotune config cache: (device, M_bucket, N, K, num_groups, group_size) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int, int, int], dict] = {} - @property def name(self) -> str: return "linear_awq_w4a16" @@ -209,99 +43,33 @@ def linear_act_format(self) -> str: return "bf16" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # AWQ weights are stored as packed int8 (2 int4 per byte) - return torch.int8, 1 + # vLLM AWQ stores packed weights in int32. + return torch.int32, 4 def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for AWQ groupwise quantization. - - For [out_features, in_features] weight with group_size groups: - - scales shape is [(out_features + group_size - 1) // group_size, in_features] - or [(out_features + group_size - 1) // group_size] (broadcasted) - """ - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - out_features, in_features = original_shape[0], original_shape[1] - group_size = kwargs.get("group_size", 128) - num_groups = (out_features + group_size - 1) // group_size - return (num_groups, in_features) - - def quantize(self, tensor: torch.Tensor, **kwargs): - """AWQ quantization is typically done offline, so this is a placeholder.""" + # vLLM AWQ scales: [K/group, N], where Linear weight is (N, K). + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + # Offline AWQ is handled by `diffulex.utils.quantization.quantize_model`. + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized raise NotImplementedError( - "AWQ quantization should be done offline using AWQ tools. " - "This strategy only supports loading pre-quantized weights." + "AWQ dequantize is not implemented in Diffulex. " + "Use vLLM kernels via linear_forward." ) - def dequantize( - self, - quantized: torch.Tensor, - scale_or_metadata: Any, - **kwargs - ) -> torch.Tensor: - """Dequantize AWQ weights. 
- - Args: - quantized: Not used (kept for interface compatibility) - scale_or_metadata: Dict with keys: - - 'qweight': int8 packed int4 weights - - 'qzeros': int8 packed int4 zeros - - 'scales': float32 per-group scales - - 'out_features': int - - 'in_features': int - - 'group_size': int (default: 128) - **kwargs: Additional arguments - - Returns: - Dequantized tensor in bf16 - """ - if not isinstance(scale_or_metadata, dict): - raise ValueError( - "AWQ dequantize requires dict metadata with keys: " - "qweight, qzeros, scales, out_features, in_features, group_size (optional)" - ) - - qweight = scale_or_metadata["qweight"] - qzeros = scale_or_metadata["qzeros"] - scales = scale_or_metadata["scales"] - out_features = scale_or_metadata["out_features"] - in_features = scale_or_metadata["in_features"] - group_size = scale_or_metadata.get("group_size", 128) - - return _dequantize_awq( - qweight=qweight, - qzeros=qzeros, - scales=scales, - out_features=out_features, - in_features=in_features, - group_size=group_size, - ) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """AWQ quantization is done offline, so this should not be called.""" - raise NotImplementedError( - "AWQ quantization should be done offline. " - "Use set_offline_quantized_weight() to load pre-quantized weights." - ) - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W4A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - def linear_forward( self, x: torch.Tensor, @@ -311,199 +79,44 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using AWQ quantized weights (W4A16). 
- - Args: - x: Activation tensor [M, K] (bf16) - weight: Either bf16 weight [N, K] (fallback) or AWQ metadata dict - bias: Optional bias tensor [N] - quant_kind: Quantization kind (unused) - **kwargs: May include: - - awq_qweight: int8 packed int4 weights [N, (K+1)//2] - - awq_qzeros: int8 packed int4 zeros [num_groups, (K+1)//2] - - awq_scales: float32 scales [num_groups, K] or [num_groups] - - awq_group_size: int (default: 128) - - out_features: int (N) - - in_features: int (K) - """ - _ = quant_kind - - # Check if AWQ tensors are provided directly via kwargs - qweight = kwargs.pop("awq_qweight", None) - qzeros = kwargs.pop("awq_qzeros", None) - scales = kwargs.pop("awq_scales", None) - group_size = kwargs.pop("awq_group_size", 128) - out_features = kwargs.pop("out_features", None) - in_features = kwargs.pop("in_features", None) - - # If AWQ tensors are provided, use them - if qweight is not None and qzeros is not None and scales is not None: - if out_features is None or in_features is None: - # Infer from x shape - M, K = x.shape - if in_features is None: - in_features = K - if out_features is None: - # Infer from qweight shape - out_features = qweight.shape[0] - - M, K = x.shape - N = out_features - num_groups = (N + group_size - 1) // group_size - - # Handle scales shape: broadcast to [num_groups, in_features] if needed - if scales.shape == (num_groups,): - scales = scales.unsqueeze(-1).expand(num_groups, in_features) - elif scales.shape == (num_groups, 1): - scales = scales.expand(num_groups, in_features) - elif scales.shape != (num_groups, in_features): - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}), ({num_groups},), or ({num_groups}, 1)" - ) - - # Ensure all tensors are on the correct device - qweight = qweight.to(device=x.device) - qzeros = qzeros.to(device=x.device) - scales = scales.to(device=x.device, dtype=torch.float32) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and awq_w4a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, - ) - - # M-bucketing: reduce JIT compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad - - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K, num_groups, group_size) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - with set_autotune_inputs([x_for_kernel, qweight, qzeros, scales]): - kernel = awq_w4a16_gemm(M_bucket, N, K, num_groups, group_size) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - if config is not None: - kernel = awq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, **config) - else: - # Default config (backward compatible) - kernel = awq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[4] means output is the 5th 
parameter - output_full = kernel(x_for_kernel, qweight, qzeros, scales) - output = output_full[:M, :] if M_bucket != M else output_full + _ = quant_kind, weight + if ops is None: + raise RuntimeError( + "vLLM is required for AWQ W4A16 (missing `vllm._custom_ops`). " + "Please install/build vLLM with CUDA ops." + ) - # Add bias if present - if bias is not None: - output = output + bias + qweight = kwargs.get("awq_qweight", None) + qzeros = kwargs.get("awq_qzeros", None) + scales = kwargs.get("awq_scales", None) - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) + if qweight is None or qzeros is None or scales is None: + return F.linear(x, weight, bias) - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - warnings.warn( - f"TileLang AWQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Warn for unexpected errors - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"TileLang AWQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, - ) - else: - # TileLang not available, use Python fallback - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, - ) + # Infer pack_factor from packed shapes to avoid hard-coding 4-bit. + # AWQ: qweight [K, N/pack], scales [K/group, N] + if scales.ndim != 2 or scales.shape[1] <= 0: + raise RuntimeError(f"Invalid AWQ scales shape: {tuple(scales.shape)}") + if qweight.shape[1] <= 0 or int(scales.shape[1]) % int(qweight.shape[1]) != 0: + raise RuntimeError( + f"Invalid AWQ packed shapes: qweight.shape={tuple(qweight.shape)}, " + f"scales.shape={tuple(scales.shape)}" + ) + pack_factor = int(scales.shape[1]) // int(qweight.shape[1]) + # vLLM AWQ kernels expect FP16 activations. 
+ x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + qweight = qweight.to(device=x.device, dtype=torch.int32) + qzeros = qzeros.to(device=x.device, dtype=torch.int32) + scales = scales.to(device=x.device, dtype=torch.float16) - # Fallback: if weight is a regular bf16 tensor, use it directly - if isinstance(weight, torch.Tensor) and weight.dtype == torch.bfloat16: - return F.linear(x, weight, bias) + out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) - raise ValueError( - "AWQ strategy requires awq_qweight, awq_qzeros, and awq_scales to be provided " - "via kwargs or weight must be a bf16 tensor (fallback mode)" - ) + # Always use awq_gemm to avoid large temporary dequantized weight allocations. + out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor) - def _fallback_python_forward( - self, - x: torch.Tensor, - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - *, - out_features: int, - in_features: int, - group_size: int, - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - dequant_weight = _dequantize_awq( - qweight=qweight.to(device=x.device), - qzeros=qzeros.to(device=x.device), - scales=scales.to(device=x.device), - out_features=out_features, - in_features=in_features, - group_size=group_size, - ) - return F.linear(x, dequant_weight, bias) + if bias is not None: + out.add_(bias.to(dtype=out.dtype)) + out = out.reshape(out_shape) + return out.to(dtype=x.dtype) if out.dtype != x.dtype else out - def clear_cache(self) -> None: - """Clear cache (no-op, kept for compatibility).""" - pass diff --git a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py index 2e2cf1f..85048d8 100644 --- a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py @@ -1,12 +1,13 @@ """ -FP8 W8A16 Linear quantization strategy (FP8 weight + bf16 activation). +FP8 W8A16 Linear quantization strategy (FP8 weight + bf16 activation), TileLang-free. -Implementation notes: -- Weight quantization: per-output-channel FP8 quantization (fp8_e4m3 or fp8_e5m2) -- Activation: kept as bf16 (no activation quantization) -- Storage: FP8 weights use uint8 storage + view(fp8_dtype) pattern -- Scale management: per-channel weight scales (shape: [out_features]), dtype: float32 -- Forward path: Python fallback (dequantize FP8 weight → bf16, then F.linear) +vLLM-aligned implementation: +- Weight quantization: `vllm._custom_ops.scaled_fp8_quant` (FP8 weight + per-tensor scale). +- Forward: use vLLM's `Fp8LinearOp` (CUTLASS scaled_mm when available). 
+ +Note: +- vLLM 的 FP8 linear 核心路径以 e4m3 为主(由 vLLM 当前平台决定的 fp8 dtype)。 +- 为了避免“静默走慢路径”,这里不再使用 `F.linear` 的反量化 GEMM。 """ from __future__ import annotations @@ -14,40 +15,9 @@ from typing import Any, Optional import torch -import torch.nn.functional as F from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -from diffulex.utils.quantization.kv_cache_dtype import ( - parse_kv_cache_dtype, - _get_fp8_e4m3_dtype, - _get_fp8_e5m2_dtype, -) - -# Try to import TileLang kernels, fallback to None if not available -_TILELANG_AVAILABLE = False -_fp8_e4m3_w8a16_gemm = None -_fp8_e5m2_w8a16_gemm = None - -try: - from diffulex_kernel.python.linear_kernels import ( - fp8_e4m3_w8a16_gemm, - fp8_e5m2_w8a16_gemm, - ) - _TILELANG_AVAILABLE = True - _fp8_e4m3_w8a16_gemm = fp8_e4m3_w8a16_gemm - _fp8_e5m2_w8a16_gemm = fp8_e5m2_w8a16_gemm -except ImportError: - pass - -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f @register_linear_strategy(weight_dtype="fp8_e4m3", act_dtype="bf16") @@ -61,166 +31,76 @@ def _build_linear_fp8_e5m2_w8a16() -> LinearQuantizationStrategy: class LinearFP8W8A16Strategy(LinearQuantizationStrategy): - """FP8 W8A16 Linear strategy: FP8 weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: per-output-channel FP8 quantization (fp8_e4m3 or fp8_e5m2). - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Quantized weights are cached per weight tensor (by id) to avoid - re-quantizing on every forward pass. - """ - - def __init__(self, weight_dtype: str = "fp8_e4m3"): - """ - Initialize FP8 W8A16 strategy. 
- - Args: - weight_dtype: FP8 dtype string ("fp8_e4m3" or "fp8_e5m2") - """ + def __init__(self, weight_dtype: str = "fp8_e4m3") -> None: super().__init__() self.weight_dtype_str = weight_dtype - self.spec = parse_kv_cache_dtype(weight_dtype) - if not self.spec.is_fp8: - raise ValueError(f"Expected FP8 dtype, got {weight_dtype}") - - # Cache: weight_id -> (quantized_weight_uint8, scales_float32) - # Using id(weight) as key since the same Parameter object is reused across forwards + # Cache: id(weight) -> (q_fp8_KN [K,N], scale_fp32 [1]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} - + + try: + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( # type: ignore + Fp8LinearOp, + ) + except Exception as e: # pragma: no cover + raise RuntimeError("FP8 需要 vLLM(Fp8LinearOp / _custom_ops)。") from e + + # dynamic activation quantization to FP8 inside vLLM + self._fp8_linear = Fp8LinearOp(act_quant_static=False) + @property def name(self) -> str: return f"linear_fp8_{self.weight_dtype_str}_w8a16" - + @property def linear_weight_format(self) -> str: return self.weight_dtype_str - + @property def linear_act_format(self) -> str: return "bf16" - + def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # FP8 weights are stored as uint8 (1 byte per element) + # vLLM stores fp8 weights as float8 dtype tensor return torch.uint8, 1 - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to FP8 with per-channel (per-output) scales. - - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (quantized_tensor_uint8, scales_float32): quantized_tensor is uint8 (FP8 storage), - scales is [out_features] - """ - _ = kwargs - assert self.spec.fp8_view_dtype is not None - assert self.spec.fp8_min is not None and self.spec.fp8_max is not None - - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - eps = 1e-8 - fp8_max = float(self.spec.fp8_max) - - # Compute scales: abs_max / fp8_max - scales = (abs_max.clamp(min=eps) / fp8_max).to(torch.float32) # [out_features, 1] - - # Quantize: clamp(tensor / scale, fp8_min, fp8_max).to(fp8_dtype).view(uint8) - descale = 1.0 / scales # [out_features, 1] - quantized = (tensor.to(torch.float32) * descale).clamp( - min=float(self.spec.fp8_min), - max=float(self.spec.fp8_max) - ) - quantized_fp8 = quantized.to(self.spec.fp8_view_dtype) - quantized_uint8 = quantized_fp8.view(torch.uint8) - - scales_1d = scales.squeeze(-1) # [out_features] - - return quantized_uint8, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize FP8 tensor back to bf16 using per-channel scales. 
- - Args: - quantized: uint8 tensor [out_features, in_features] (FP8 storage) - scale_or_metadata: scales tensor [out_features] or dict with 'scales' - **kwargs: Additional arguments (unused for now) - - Returns: - Dequantized tensor in bf16 - """ + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: _ = kwargs - assert self.spec.fp8_view_dtype is not None - - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - else: - scales = scale_or_metadata - - if scales is None: - raise ValueError("scales required for dequantization") - - # View uint8 as FP8 dtype - fp8_tensor = quantized.view(self.spec.fp8_view_dtype).to(torch.float32) - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = fp8_tensor * scales.to(torch.float32) - return dequantized.to(torch.bfloat16) - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") + # per-tensor scale + return (1,) + + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + from vllm import _custom_ops as ops # type: ignore + from vllm.platforms import current_platform # type: ignore + + # vLLM: per-tensor scale, output dtype = current_platform.fp8_dtype() + q_fp8, scale = ops.scaled_fp8_quant(tensor.to(torch.float32).contiguous(), scale=None) + # Keep transpose-view for CUTLASS expectation (b.stride(0) == 1). + q_kn_fp8 = q_fp8.t() # [K,N] fp8 dtype, non-contiguous + scale = scale.to(torch.float32).reshape(1).contiguous() + return q_kn_fp8, {"scales": scale, "fp8_dtype": current_platform.fp8_dtype()} + def quantize_weight_for_kernel( self, weight: torch.Tensor, *, device: torch.device | None = None, - **kwargs: Any, + **_: Any, ) -> tuple[torch.Tensor, Any]: - """Quantize weight to FP8 with per-channel scales. 
- - Returns: - (quantized_weight_uint8, scales_float32): quantized_weight is uint8 [out, in], - scales is float32 [out] - """ - _ = kwargs + q_fp8, meta = self.quantize(weight) if device is not None: - weight = weight.to(device=device) - - quantized, scales = self.quantize(weight) - return quantized, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W8A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - + q_fp8 = q_fp8.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return q_fp8, meta["scales"] + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + _ = kwargs + raise RuntimeError("FP8 不提供 dequantize 路径(避免走慢的反量化 + F.linear)。") + def linear_forward( self, x: torch.Tensor, @@ -230,184 +110,33 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using quantized FP8 weights (W8A16). - - Uses Python reference implementation (dequant + F.linear). - Future: can integrate TileLang kernel if available. - """ _ = quant_kind - - # If caller provides a pre-quantized uint8 weight + scales (e.g., load-time quantized module), - # use them directly and DO NOT populate the lazy cache (to avoid double-storage). - quant_scales = kwargs.pop("quant_scales", None) - if weight.dtype == torch.uint8: - if quant_scales is None: - raise ValueError("weight is uint8 (FP8) but quant_scales is None; expected per-channel scales tensor") - quantized_weight = weight - scales = quant_scales - if scales.dtype != torch.float32: - scales = scales.to(dtype=torch.float32) - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - if scales.device != x.device: - scales = scales.to(device=x.device) - else: - # Lazy cache: use weight tensor id as key (only for bf16/fp16/fp32 weights) - weight_id = id(weight) - - # Check cache - if weight_id in self._weight_cache: - quantized_weight, scales = self._weight_cache[weight_id] - # Ensure cached tensors are on the correct device - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - scales = scales.to(device=x.device) - else: - # Quantize weight and cache it - quantized_weight, scales = self.quantize_weight_for_kernel(weight, device=x.device) - # Cache the quantized weight and scales - self._weight_cache[weight_id] = (quantized_weight, scales) - - # Speed-first option: cache dequantized bf16 weight for F.linear (cuBLAS) - # This trades extra GPU memory for throughput. 
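# ---------------------------------------------------------------------------
# Rough PyTorch-only sketch of per-tensor dynamic FP8 quantization (assumption:
# this mirrors the spirit of `ops.scaled_fp8_quant` with scale=None used by the
# new quantize() path; the real vLLM op is a fused CUDA kernel with extra
# handling, so treat this only as a reference for the math).
import torch

def per_tensor_fp8_quant(w: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    fp8 = torch.float8_e4m3fn
    fp8_max = torch.finfo(fp8).max                          # 448.0 for e4m3fn
    scale = w.float().abs().amax().clamp(min=1e-12) / fp8_max   # per-tensor fp32 scale
    q = (w.float() / scale).clamp(-fp8_max, fp8_max).to(fp8)
    return q, scale.reshape(1)                              # dequant: q.float() * scale

q, s = per_tensor_fp8_quant(torch.randn(128, 256))
assert q.dtype == torch.float8_e4m3fn and s.shape == (1,)
# ---------------------------------------------------------------------------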
- import os - if os.getenv("DIFFULEX_FP8_W8A16_PREFER_CUBLAS", "0") == "1": - deq_key = id(weight) if weight.dtype != torch.uint8 else id(quantized_weight) - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - # Dequantize: FP8[N,K] * scales[N] -> bf16[N,K] - deq_w = self.dequantize(quantized_weight, scales) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Try to use TileLang kernel if available - fp8_w8a16_gemm = None - if self.weight_dtype_str == "fp8_e4m3": - fp8_w8a16_gemm = _fp8_e4m3_w8a16_gemm - elif self.weight_dtype_str == "fp8_e5m2": - fp8_w8a16_gemm = _fp8_e5m2_w8a16_gemm - - if _TILELANG_AVAILABLE and fp8_w8a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - # Get shapes - M, K = x.shape - N, K_w = quantized_weight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Bucket M to reduce compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad + from vllm.platforms import current_platform # type: ignore - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - assert self.spec.fp8_view_dtype is not None - qweight_fp8 = quantized_weight.view(self.spec.fp8_view_dtype) - with set_autotune_inputs([x_for_kernel, qweight_fp8, scales]): - kernel = fp8_w8a16_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - assert self.spec.fp8_view_dtype is not None - qweight_fp8 = quantized_weight.view(self.spec.fp8_view_dtype) - if config is not None: - kernel = fp8_w8a16_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - kernel = fp8_w8a16_gemm(M_bucket, N, K, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[3] means output is the 4th parameter - assert self.spec.fp8_view_dtype is not None - qweight_fp8 = quantized_weight.view(self.spec.fp8_view_dtype) - output_full = kernel(x_for_kernel, qweight_fp8, scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if bias is not None: - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) - - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see 
logs for details)" - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Truncate very long error messages - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - - # Only warn for unexpected errors - if 'CUDA architecture not supported' not in error_msg and 'sm_' not in error_msg and 'Pipeline stages' not in error_msg: - warnings.warn( - f"TileLang kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward(x, quantized_weight, scales, bias) + quant_scales = kwargs.get("quant_scales", None) + if weight is not None and quant_scales is not None: + # Expected: weight is fp8 K×N tensor (transpose-view is fine). + q_kn = weight.to(device=x.device) + scales = quant_scales.to(device=x.device, dtype=torch.float32).reshape(1) else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - def _fallback_python_forward( - self, - x: torch.Tensor, - quantized_weight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - # Dequantize for reference implementation - dequantized_weight = self.dequantize(quantized_weight, scales) - - # Compute linear output - return F.linear(x, dequantized_weight, bias) - - def clear_cache(self) -> None: - """Clear the weight quantization cache. - - Useful for memory management or when weights are updated (e.g., fine-tuning). - """ - self._weight_cache.clear() - self._dequant_weight_cache.clear() + wid = id(weight) + cached = self._weight_cache.get(wid) + if cached is None or cached[0].device != x.device: + q_fp8, meta = self.quantize(weight) + q_fp8 = q_fp8.to(device=x.device) + scales = meta["scales"].to(device=x.device, dtype=torch.float32).reshape(1) + q_kn = q_fp8 + self._weight_cache[wid] = (q_fp8, scales) + else: + q_kn, scales = cached + + # vLLM Fp8LinearOp expects weight as [K,N] fp8 tensor and per-tensor scale. + return self._fp8_linear.apply( + input=x, + weight=q_kn, + weight_scale=scales, + out_dtype=x.dtype if x.dtype in (torch.bfloat16, torch.float16) else torch.bfloat16, + input_scale=None, + bias=bias, + ) diff --git a/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py b/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py index 73c7965..d7f48c6 100644 --- a/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py @@ -1,14 +1,9 @@ """ -FP8 W8A8 Linear quantization strategy (FP8 weight + FP8 activation). +FP8 W8A8 Linear quantization strategy (FP8 weight + FP8 activation), TileLang-free. -Implementation notes: -- Weight quantization: per-output-channel FP8 quantization (fp8_e4m3 or fp8_e5m2) -- Activation quantization: per-row FP8 quantization -- Storage: FP8 weights and activations use uint8 storage + view(fp8_dtype) pattern -- Scale management: - - Weight scales: per-channel [out_features], dtype: float16 - - Activation scales: per-row [M], dtype: float32 -- Forward path: Python fallback (dequantize both FP8 weight and activation → bf16, then F.linear) +vLLM-aligned implementation: +- Weight quantization: `vllm._custom_ops.scaled_fp8_quant` (per-tensor scale). +- Activation quantization + GEMM: vLLM `Fp8LinearOp` (CUTLASS scaled_mm when available). 
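A usage sketch of this weight-quantize-once / Fp8LinearOp flow, mirroring the call pattern used in this patch (the exact `Fp8LinearOp` constructor and `apply` signature vary across vLLM versions, so treat this as illustrative):

import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp

w = torch.randn(4096, 4096, device="cuda")                                  # [N, K] weight
q_fp8, w_scale = ops.scaled_fp8_quant(w.float().contiguous(), scale=None)   # per-tensor scale

fp8_linear = Fp8LinearOp(act_quant_static=False)                            # dynamic activation quant
x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
y = fp8_linear.apply(
    input=x,
    weight=q_fp8.t(),                        # [K, N] view, stride(0) == 1
    weight_scale=w_scale.float().reshape(1),
    out_dtype=torch.bfloat16,
    input_scale=None,
    bias=None,
)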
""" from __future__ import annotations @@ -16,75 +11,19 @@ from typing import Any, Optional import torch -import torch.nn.functional as F from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -from diffulex.utils.quantization.kv_cache_dtype import ( - parse_kv_cache_dtype, - _get_fp8_e4m3_dtype, - _get_fp8_e5m2_dtype, -) -# Try to import TileLang kernels, fallback to None if not available -_TILELANG_AVAILABLE = False -_fp8_e4m3_w8a8_gemm = None -_fp8_e5m2_w8a8_gemm = None -try: - from diffulex_kernel.python.linear_kernels import ( - fp8_e4m3_w8a8_gemm, - fp8_e5m2_w8a8_gemm, - ) - _TILELANG_AVAILABLE = True - _fp8_e4m3_w8a8_gemm = fp8_e4m3_w8a8_gemm - _fp8_e5m2_w8a8_gemm = fp8_e5m2_w8a8_gemm -except ImportError: - pass - -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - - -def _quantize_per_row_fp8( - x: torch.Tensor, - fp8_view_dtype: torch.dtype, - fp8_min: float, - fp8_max: float, -) -> tuple[torch.Tensor, torch.Tensor]: - """Per-row symmetric FP8 quantization. - - Args: - x: Input tensor [M, K] in bf16/fp16/fp32 - fp8_view_dtype: FP8 dtype (e.g., torch.float8_e4m3fn) - fp8_min: Minimum FP8 value - fp8_max: Maximum FP8 value - - Returns: - x_q: uint8 [M, K] (FP8 storage) - x_scales: float32 [M] where dequant is x_q.view(fp8_dtype).float() * x_scales[:, None] - """ - # x: [M, K] - abs_max = x.abs().amax(dim=-1, keepdim=False) # [M] - eps = 1e-8 - scales = (abs_max.clamp(min=eps) / fp8_max).to(torch.float32) # [M] - - # Quantize: clamp(x / scale, fp8_min, fp8_max).to(fp8_dtype).view(uint8) - descale = 1.0 / scales.unsqueeze(-1) # [M, 1] - quantized = (x.to(torch.float32) * descale).clamp( - min=fp8_min, - max=fp8_max - ) - quantized_fp8 = quantized.to(fp8_view_dtype) - quantized_uint8 = quantized_fp8.view(torch.uint8) - - return quantized_uint8, scales +def _require_fp8_linear_op(): + try: + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( # type: ignore + Fp8LinearOp, + ) + except Exception as e: # pragma: no cover + raise RuntimeError("FP8 需要 vLLM(Fp8LinearOp / _custom_ops)。") from e + return Fp8LinearOp @register_linear_strategy(weight_dtype="fp8_e4m3", act_dtype="fp8_e4m3") @@ -98,189 +37,65 @@ def _build_linear_fp8_e5m2_w8a8() -> LinearQuantizationStrategy: class LinearFP8W8A8Strategy(LinearQuantizationStrategy): - """FP8 W8A8 Linear strategy: FP8 weight + FP8 activation quantization, output bf16. - - Current implementation: Python reference using dequantized weights and activations + F.linear. - Weight quantization: per-output-channel FP8 quantization. - Activation quantization: per-row FP8 quantization. - """ - - def __init__(self, weight_dtype: str = "fp8_e4m3", act_dtype: str = "fp8_e4m3"): - """ - Initialize FP8 W8A8 strategy. 
- - Args: - weight_dtype: FP8 dtype string for weights ("fp8_e4m3" or "fp8_e5m2") - act_dtype: FP8 dtype string for activations ("fp8_e4m3" or "fp8_e5m2") - """ + def __init__(self, weight_dtype: str = "fp8_e4m3", act_dtype: str = "fp8_e4m3") -> None: super().__init__() self.weight_dtype_str = weight_dtype self.act_dtype_str = act_dtype - self.weight_spec = parse_kv_cache_dtype(weight_dtype) - self.act_spec = parse_kv_cache_dtype(act_dtype) - if not self.weight_spec.is_fp8 or not self.act_spec.is_fp8: - raise ValueError(f"Expected FP8 dtypes, got weight={weight_dtype}, act={act_dtype}") - - # Cache: weight_id -> (quantized_weight_uint8, scales_float16) + # Cache: id(weight) -> (q_fp8_KN [K,N], scale_fp32 [1]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} - + Fp8LinearOp = _require_fp8_linear_op() + self._fp8_linear = Fp8LinearOp(act_quant_static=False) + @property def name(self) -> str: return f"linear_fp8_{self.weight_dtype_str}_w8a8" - + @property def linear_weight_format(self) -> str: return self.weight_dtype_str - + @property def linear_act_format(self) -> str: return self.act_dtype_str - + def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # FP8 weights are stored as uint8 (1 byte per element) return torch.uint8, 1 - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - - def clear_cache(self) -> None: - self._weight_cache.clear() - self._dequant_weight_cache.clear() - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to FP8 with per-channel (per-output) scales. 
- - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (quantized_tensor_uint8, scales_float16): quantized_tensor is uint8 (FP8 storage), - scales is float16 [out_features] - """ + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: _ = kwargs - assert self.weight_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_min is not None and self.weight_spec.fp8_max is not None - - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - eps = 1e-8 - fp8_max = float(self.weight_spec.fp8_max) - - # Compute scales: abs_max / fp8_max - # Use float16 for weight scales (W8A8 paths are sensitive to scale precision) - scales = (abs_max.clamp(min=eps) / fp8_max).to(torch.float16) # [out_features, 1] - - # Quantize: clamp(tensor / scale, fp8_min, fp8_max).to(fp8_dtype).view(uint8) - descale = 1.0 / scales # [out_features, 1] - quantized = (tensor.to(torch.float32) * descale).clamp( - min=float(self.weight_spec.fp8_min), - max=float(self.weight_spec.fp8_max) - ) - quantized_fp8 = quantized.to(self.weight_spec.fp8_view_dtype) - quantized_uint8 = quantized_fp8.view(torch.uint8) - - scales_1d = scales.squeeze(-1) # [out_features] - - return quantized_uint8, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize FP8 tensor back to bf16 using per-channel scales. - - Args: - quantized: uint8 tensor [out_features, in_features] (FP8 storage) - scale_or_metadata: scales tensor [out_features] or dict with 'scales' - **kwargs: Additional arguments (unused for now) - - Returns: - Dequantized tensor in bf16 - """ + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") + return (1,) + + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - assert self.weight_spec.fp8_view_dtype is not None - - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - else: - scales = scale_or_metadata - - if scales is None: - raise ValueError("scales required for dequantization") - - # View uint8 as FP8 dtype - fp8_tensor = quantized.view(self.weight_spec.fp8_view_dtype).to(torch.float32) - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = fp8_tensor * scales.to(torch.float32) - return dequantized.to(torch.bfloat16) - + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + from vllm import _custom_ops as ops # type: ignore + from vllm.platforms import current_platform # type: ignore + + q_fp8, scale = ops.scaled_fp8_quant(tensor.to(torch.float32).contiguous(), scale=None) + q_kn_fp8 = q_fp8.t() # [K,N], stride(0)==1 + scale = scale.to(torch.float32).reshape(1).contiguous() + return q_kn_fp8, {"scales": scale, "fp8_dtype": current_platform.fp8_dtype()} + def quantize_weight_for_kernel( self, weight: torch.Tensor, *, device: torch.device | None = None, - **kwargs: Any, + **_: Any, ) -> tuple[torch.Tensor, Any]: - """Quantize weight to FP8 with per-channel scales. 
- - Returns: - (quantized_weight_uint8, scales_float16): quantized_weight is uint8 [out, in], - scales is float16 [out] - """ - _ = kwargs + q_fp8, meta = self.quantize(weight) if device is not None: - weight = weight.to(device=device) - - quantized, scales = self.quantize(weight) - return quantized, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """Quantize activation to FP8 with per-row scales. - - Returns: - (quantized_act_uint8, scales_float32): quantized_act is uint8 [M, K], - scales is float32 [M] - """ - if device is not None: - x = x.to(device=device) - - assert self.act_spec.fp8_view_dtype is not None - assert self.act_spec.fp8_min is not None and self.act_spec.fp8_max is not None - - # Ensure input is in a compatible dtype - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - - quantized, scales = _quantize_per_row_fp8( - x, - self.act_spec.fp8_view_dtype, - float(self.act_spec.fp8_min), - float(self.act_spec.fp8_max), - ) - return quantized, scales - + q_fp8 = q_fp8.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return q_fp8, meta["scales"] + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + _ = kwargs + raise RuntimeError("FP8 不提供 dequantize 路径(避免走慢的反量化 + F.linear)。") + def linear_forward( self, x: torch.Tensor, @@ -290,218 +105,25 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using quantized FP8 weights and activations (W8A8). - - Uses Python reference implementation (dequantize both + F.linear). - Future: can integrate TileLang kernel if available. 
- """ _ = quant_kind - - quant_scales = kwargs.pop("quant_scales", None) - - # Resolve / cache quantized weight + scales - if weight.dtype == torch.uint8: - if quant_scales is None: - raise ValueError("weight is uint8 (FP8) but quant_scales is None; expected per-channel scales tensor") - qweight = weight if weight.device == x.device else weight.to(device=x.device) - w_scales = quant_scales - # Prefer float16 scales for quality - if w_scales.dtype != torch.float16: - w_scales = w_scales.to(dtype=torch.float16) - if w_scales.device != x.device: - w_scales = w_scales.to(device=x.device) - weight_id = id(weight) + wid = id(weight) + cached = self._weight_cache.get(wid) + if cached is None or cached[0].device != x.device: + q_fp8, meta = self.quantize(weight) + q_fp8 = q_fp8.to(device=x.device) + w_scale = meta["scales"].to(device=x.device, dtype=torch.float32).reshape(1) + self._weight_cache[wid] = (q_fp8, w_scale) else: - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None: - qweight, w_scales = self.quantize_weight_for_kernel(weight, device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) - else: - qweight, w_scales = cached - if qweight.device != x.device: - qweight = qweight.to(device=x.device) - w_scales = w_scales.to(device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) - - # Optional: use cuBLAS BF16 (dequant once) - import os - if os.getenv("DIFFULEX_FP8_W8A8_PREFER_CUBLAS", "0") == "1": - deq_key = weight_id - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - deq_w = self.dequantize(qweight, w_scales) - self._dequant_weight_cache[deq_key] = deq_w - # Also dequantize activation - x_q_temp, x_scales_temp = self.quantize_act_for_kernel(x, device=x.device) - x_deq = self._dequantize_act(x_q_temp, x_scales_temp) - return F.linear(x_deq, deq_w, bias) - - # Quantize activation per-row - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - x_q, x_scales = self.quantize_act_for_kernel(x, device=x.device) - - # Try to use TileLang kernel if available - # For W8A8, weight_dtype and act_dtype should match (both e4m3 or both e5m2) - fp8_w8a8_gemm = None - if self.weight_dtype_str == "fp8_e4m3" and self.act_dtype_str == "fp8_e4m3": - fp8_w8a8_gemm = _fp8_e4m3_w8a8_gemm - elif self.weight_dtype_str == "fp8_e5m2" and self.act_dtype_str == "fp8_e5m2": - fp8_w8a8_gemm = _fp8_e5m2_w8a8_gemm - - if _TILELANG_AVAILABLE and fp8_w8a8_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x_q, x_scales, qweight, w_scales, bias) - - # Get shapes - M, K = x_q.shape - N, K_w = qweight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Bucket M to reduce compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 + q_fp8, w_scale = cached - x_q_for_kernel = x_q - if M_bucket != M: - x_q_pad = torch.zeros((M_bucket, K), device=x_q.device, dtype=x_q.dtype) - x_q_pad[:M, :] = x_q - x_q_for_kernel = x_q_pad - # Pad scales as well - x_scales_pad = torch.zeros((M_bucket,), device=x_scales.device, dtype=x_scales.dtype) - x_scales_pad[:M] = x_scales - x_scales = x_scales_pad + q_kn = q_fp8 - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: 
- # Warmup phase: run autotune with real inputs - try: - assert self.act_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_view_dtype is not None - x_fp8 = x_q_for_kernel.view(self.act_spec.fp8_view_dtype) - w_fp8 = qweight.view(self.weight_spec.fp8_view_dtype) - with set_autotune_inputs([x_fp8, w_fp8, x_scales, w_scales]): - kernel = fp8_w8a8_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - assert self.act_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_view_dtype is not None - x_fp8 = x_q_for_kernel.view(self.act_spec.fp8_view_dtype) - w_fp8 = qweight.view(self.weight_spec.fp8_view_dtype) - if config is not None: - kernel = fp8_w8a8_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - kernel = fp8_w8a8_gemm(M_bucket, N, K, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[4] means output is the 5th parameter - # Inputs: A/B are fp8 tensors (viewed from uint8 storage), scales are float32/float16. - assert self.act_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_view_dtype is not None - x_fp8 = x_q_for_kernel.view(self.act_spec.fp8_view_dtype) - w_fp8 = qweight.view(self.weight_spec.fp8_view_dtype) - output_full = kernel(x_fp8, w_fp8, x_scales, w_scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if bias is not None: - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) - - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Truncate very long error messages - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." 
- - # Only warn for unexpected errors - if 'CUDA architecture not supported' not in error_msg and 'sm_' not in error_msg and 'Pipeline stages' not in error_msg: - warnings.warn( - f"TileLang kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward(x_q, x_scales, qweight, w_scales, bias) - else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x_q, x_scales, qweight, w_scales, bias) - - def _fallback_python_forward( - self, - x_q: torch.Tensor, - x_scales: torch.Tensor, - qweight: torch.Tensor, - w_scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - """Fallback Python implementation: dequantize both + F.linear.""" - # Dequantize both weight and activation - deq_w = self.dequantize(qweight, w_scales) - deq_x = self._dequantize_act(x_q, x_scales) - - # Compute linear output - return F.linear(deq_x, deq_w, bias) - - def _dequantize_act( - self, - quantized: torch.Tensor, - scales: torch.Tensor, - ) -> torch.Tensor: - """Dequantize FP8 activation tensor. - - Args: - quantized: uint8 tensor [M, K] (FP8 storage) - scales: float32 tensor [M] (per-row scales) - - Returns: - Dequantized tensor in bf16 [M, K] - """ - assert self.act_spec.fp8_view_dtype is not None - - # View uint8 as FP8 dtype - fp8_tensor = quantized.view(self.act_spec.fp8_view_dtype).to(torch.float32) - - # Reshape scales to broadcast: [M] -> [M, 1] - scales_view = scales.to(torch.float32).unsqueeze(-1) # [M, 1] - - # Dequantize: value * scale - dequantized = fp8_tensor * scales_view - return dequantized.to(torch.bfloat16) + return self._fp8_linear.apply( + input=x, + weight=q_kn, + weight_scale=w_scale, + out_dtype=x.dtype if x.dtype in (torch.bfloat16, torch.float16) else torch.bfloat16, + input_scale=None, + bias=bias, + ) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py new file mode 100644 index 0000000..da81d3e --- /dev/null +++ b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py @@ -0,0 +1,156 @@ +""" +GPTQ Marlin (W4/W8, A16) Linear strategy using vLLM Marlin CUDA kernels. + +- Input activations: bf16 (cast to fp16 for vLLM marlin kernel) +- Weights: offline GPTQ vLLM standard format (qweight/qzeros/scales/g_idx) +- One-time repack/permutation is performed by Diffulex `LinearBase` and passed in via kwargs: + - gptq_marlin_qweight / gptq_marlin_scales / gptq_marlin_zp + - gptq_marlin_g_idx / gptq_marlin_g_idx_sort_indices + - gptq_marlin_workspace + +This strategy intentionally does NOT fall back to F.linear silently: if marlin tensors +are missing, it raises to avoid accidentally benchmarking a slow path. 
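As a small sketch of the bit-width mapping and `is_k_full` decision used below (imports mirror this file's own; exact module paths and signatures depend on the vLLM version):

from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.marlin_utils import marlin_is_k_full

def pick_marlin_wtype(weight_bits: int):
    # GPTQ stores biased unsigned values: 4-bit -> uint4b8 (bias 8), 8-bit -> uint8b128 (bias 128).
    if weight_bits == 4:
        return scalar_types.uint4b8
    if weight_bits == 8:
        return scalar_types.uint8b128
    raise ValueError(f"unsupported weight_bits={weight_bits}")

# K is "full" unless the shard is row-parallel and the checkpoint uses activation reordering (g_idx).
is_k_full = marlin_is_k_full(False, True)   # positional (has_g_idx, is_row_parallel), as in this strategy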
+""" + +from __future__ import annotations + +from typing import Any, Optional + +import torch +import torch.nn.functional as F + +from diffulex.utils.quantization.registry import register_linear_strategy +from diffulex.utils.quantization.strategy import LinearQuantizationStrategy + +try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + apply_gptq_marlin_linear, + marlin_is_k_full, + marlin_make_empty_g_idx, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore +except Exception: # pragma: no cover + apply_gptq_marlin_linear = None # type: ignore + marlin_is_k_full = None # type: ignore + marlin_make_empty_g_idx = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + +@register_linear_strategy(weight_dtype="gptq_marlin", act_dtype="bf16") +def _build_linear_gptq_marlin_w4a16() -> LinearQuantizationStrategy: + return LinearGPTQMarlinW4A16Strategy() + + +class LinearGPTQMarlinW4A16Strategy(LinearQuantizationStrategy): + @property + def name(self) -> str: + return "linear_gptq_marlin_w4a16" + + @property + def linear_weight_format(self) -> str: + return "gptq_marlin" + + @property + def linear_act_format(self) -> str: + return "bf16" + + def get_storage_dtype(self) -> tuple[torch.dtype, int]: + return torch.int32, 4 + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: + # Same as GPTQ: [K/group, N] + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized + raise NotImplementedError("GPTQ Marlin 不提供 Python dequantize;请使用 vLLM Marlin CUDA kernel。") + + def linear_forward( + self, + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + *, + quant_kind: str, + **kwargs: Any, + ) -> torch.Tensor: + _ = quant_kind, weight + if apply_gptq_marlin_linear is None or scalar_types is None: + raise RuntimeError("gptq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") + + qweight = kwargs.get("gptq_marlin_qweight", None) + scales = kwargs.get("gptq_marlin_scales", None) + zp = kwargs.get("gptq_marlin_zp", None) + g_idx = kwargs.get("gptq_marlin_g_idx", None) + g_idx_sort_indices = kwargs.get("gptq_marlin_g_idx_sort_indices", None) + workspace = kwargs.get("gptq_marlin_workspace", None) + in_features = int(kwargs.get("in_features", 0)) + out_features = int(kwargs.get("out_features", 0)) + weight_bits = int(kwargs.get("gptq_weight_bits", 0)) + + if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: + raise RuntimeError("gptq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") + + if weight_bits == 4: + wtype = scalar_types.uint4b8 + elif weight_bits == 8: + wtype = scalar_types.uint8b128 + else: + raise RuntimeError(f"gptq_marlin: unsupported weight_bits={weight_bits} (expected 4 or 
8)") + + # vLLM marlin kernels expect FP16 activations. + x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + + # g_idx can be empty (desc_act=False). Ensure correct dtype/device. + if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + g_idx_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + else: + g_idx_t = g_idx.to(device=x.device, dtype=torch.int32) + if g_idx_sort_indices is None or (isinstance(g_idx_sort_indices, torch.Tensor) and g_idx_sort_indices.numel() == 0): + g_idx_sort_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + else: + g_idx_sort_t = g_idx_sort_indices.to(device=x.device, dtype=torch.int32) + + # Determine whether K is full (needed by marlin kernel). Row-parallel layers set tp_dim=1 in Diffulex. + row_parallel = bool(kwargs.get("tp_dim", None) == 1) + has_g_idx = bool(g_idx_t.numel() > 0) + if marlin_is_k_full is None: + is_k_full = True + else: + is_k_full = marlin_is_k_full(has_g_idx, row_parallel) + + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + out = apply_gptq_marlin_linear( + input=x_in, + weight=qweight, + weight_scale=scales, + weight_zp=zp, + g_idx=g_idx_t, + g_idx_sort_indices=g_idx_sort_t, + workspace=workspace, + wtype=wtype, + output_size_per_partition=out_features, + input_size_per_partition=in_features, + is_k_full=is_k_full, + bias=marlin_bias, + input_dtype=None, + ) + return out.to(dtype=x.dtype) if out.dtype != x.dtype else out + diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index c86c532..8fc67a5 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -1,11 +1,15 @@ """ -GPTQ W4A16 Linear quantization strategy (GPTQ weight + bf16 activation). +GPTQ W4A16 Linear quantization strategy (vLLM standard format). -Implementation notes: -- Weight quantization: GPTQ format with groupwise quantization -- Activation: kept as bf16 (no activation quantization) -- Storage: GPTQ uses packed int4 weights (qweight), int4 zeros (qzeros), and per-group scales -- Forward path: Dequantize GPTQ weights to bf16, then use F.linear +- Weight format: vLLM GPTQ (packed int32 qweight/qzeros + fp16 scales) +- Activation: bf16 (no activation quantization) +- Forward: vLLM custom op `gptq_gemm` + +Design notes: +- Diffulex follows vLLM's fast path: run `gptq_shuffle` once (handled by + `LinearBase._maybe_prepare_offline_gptq`) and then call `gptq_gemm` with + `use_exllama=True`. +- No TileLang dependency. 
""" from __future__ import annotations @@ -18,178 +22,10 @@ from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -_TILELANG_AVAILABLE = False -try: - from diffulex_kernel.python.linear_kernels import gptq_w4a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - gptq_w4a16_gemm = None - try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - - -def _unpack_gptq_int4( - packed: torch.Tensor, - *, - out_features: int, - in_features: int, -) -> torch.Tensor: - """Unpack GPTQ packed int4 weights into int8 values. - - GPTQ packs 2 int4 values per int8 byte: - - Lower 4 bits: even columns - - Upper 4 bits: odd columns - - Args: - packed: int8 tensor [out_features, (in_features + 1) // 2] - out_features: Original output features - in_features: Original input features - - Returns: - unpacked: int8 tensor [out_features, in_features] with values in [-8, 7] - """ - if packed.dtype != torch.int8: - raise TypeError(f"packed weight must be int8, got {packed.dtype}") - - out_features_actual, packed_in = packed.shape - expected_packed_in = (in_features + 1) // 2 - if packed_in != expected_packed_in: - raise ValueError( - f"Packed input dimension mismatch: got {packed_in}, " - f"expected {expected_packed_in} for in_features={in_features}" - ) - if out_features_actual != out_features: - raise ValueError( - f"Output dimension mismatch: got {out_features_actual}, " - f"expected {out_features}" - ) - - # Interpret bytes as uint8 for bit manipulation - p_u8 = packed.view(torch.uint8) - # Extract lower and upper 4 bits - low_u8 = (p_u8 & 0x0F) # [0..15] - high_u8 = ((p_u8 >> 4) & 0x0F) # [0..15] - - # Convert unsigned nibble [0..15] to signed int4 [-8..7] - # Packing: int4 [-8, 7] + 8 -> uint8 [0, 15] - # Unpacking: uint8 [0, 15] - 8 -> int4 [-8, 7] - low_s = low_u8.to(torch.int16) - 8 - high_s = high_u8.to(torch.int16) - 8 - - # Interleave low/high along in_features - unpacked = torch.empty((out_features, packed_in * 2), device=packed.device, dtype=torch.int16) - unpacked[:, 0::2] = low_s - unpacked[:, 1::2] = high_s - unpacked = unpacked[:, :in_features].to(torch.int8) - return unpacked - - -def _dequantize_gptq( - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - *, - out_features: int, - in_features: int, - group_size: int = 128, - g_idx: Optional[torch.Tensor] = None, -) -> torch.Tensor: - """Dequantize GPTQ weights to bf16. 
- - GPTQ uses groupwise quantization: - - Weight is quantized per group (group_size consecutive elements) - - Each group has its own scale and zero point - - g_idx (optional) maps each weight element to its group - - Args: - qweight: int8 tensor [out_features, (in_features + 1) // 2] packed int4 - qzeros: int8 tensor [(out_features + group_size - 1) // group_size, (in_features + 1) // 2] packed int4 - scales: float32 tensor [(out_features + group_size - 1) // group_size, in_features] - out_features: Output features - in_features: Input features - group_size: Group size for quantization (default: 128) - g_idx: Optional int32 tensor [in_features] mapping each weight to its group - - Returns: - dequantized: bf16 tensor [out_features, in_features] - """ - device = qweight.device - - # Unpack qweight to int8 [out_features, in_features] - w_int8 = _unpack_gptq_int4(qweight, out_features=out_features, in_features=in_features) - - # Unpack qzeros to int8 [num_groups, in_features] - num_groups = (out_features + group_size - 1) // group_size - if qzeros.shape[0] != num_groups: - raise ValueError( - f"qzeros shape mismatch: got {qzeros.shape[0]} groups, " - f"expected {num_groups} for out_features={out_features}, group_size={group_size}" - ) - zeros_int8 = _unpack_gptq_int4(qzeros, out_features=num_groups, in_features=in_features) - - # Ensure scales have correct shape [num_groups, in_features] - if scales.shape != (num_groups, in_features): - # If scales is [num_groups] or [num_groups, 1], broadcast to [num_groups, in_features] - if scales.shape == (num_groups,) or scales.shape == (num_groups, 1): - scales = scales.unsqueeze(-1).expand(num_groups, in_features) - else: - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}) or ({num_groups},) or ({num_groups}, 1)" - ) - - # Convert to float32 for dequantization - w_fp32 = w_int8.to(torch.float32) - zeros_int8_fp32 = zeros_int8.to(torch.float32) # Quantized zeros (int8) - scales_fp32 = scales.to(torch.float32) - - # Dequantize zeros: zero = zero_quantized * scale - # zeros_int8 was quantized as: zero_quantized = round(zero / scale) - # So to recover: zero = zero_quantized * scale - zeros_fp32 = zeros_int8_fp32 * scales_fp32 # [num_groups, in_features] - - # Dequantize: (weight - zero) * scale - # w_int8 is [out_features, in_features] - # zeros_int8 is [num_groups, in_features] - # scales_fp32 is [num_groups, in_features] - - # For each output channel, determine which group it belongs to - if g_idx is not None: - # g_idx maps each output channel to its group - if g_idx.shape != (out_features,): - raise ValueError( - f"g_idx shape mismatch: got {g_idx.shape}, expected ({out_features},)" - ) - # g_idx: [out_features] -> group_id for each output channel - group_ids = g_idx.to(torch.int64) # [out_features] - # Clamp group_ids to valid range [0, num_groups-1] - group_ids = torch.clamp(group_ids, 0, num_groups - 1) - # Gather zeros and scales for each output channel - # zeros_fp32: [num_groups, in_features], group_ids: [out_features] - # We need to index along dimension 0 for each output channel - zeros_for_channel = zeros_fp32[group_ids] # [out_features, in_features] - scales_for_channel = scales_fp32[group_ids] # [out_features, in_features] - else: - # Without g_idx, assume sequential grouping: group_id = out_idx // group_size - group_ids = torch.arange(out_features, device=device) // group_size # [out_features] - # Clamp group_ids to valid range - group_ids = torch.clamp(group_ids, 0, num_groups - 1) 
- zeros_for_channel = zeros_fp32[group_ids] # [out_features, in_features] - scales_for_channel = scales_fp32[group_ids] # [out_features, in_features] - - # Dequantize: quantized * scale + zero - # Quantization formula: quantized = round((weight - zero) / scale) - # Dequantization formula: weight = quantized * scale + zero - dequantized = w_fp32 * scales_for_channel + zeros_for_channel - return dequantized.to(torch.bfloat16) + from vllm import _custom_ops as ops # type: ignore +except Exception: # pragma: no cover + ops = None # type: ignore @register_linear_strategy(weight_dtype="gptq", act_dtype="bf16") @@ -198,21 +34,6 @@ def _build_linear_gptq_w4a16() -> LinearQuantizationStrategy: class LinearGPTQW4A16Strategy(LinearQuantizationStrategy): - """GPTQ W4A16 Linear strategy: GPTQ weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: GPTQ format with groupwise quantization (typically group_size=128). - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Dequantized weights are cached to avoid re-dequantizing on every forward pass. - """ - - def __init__(self): - """Initialize strategy (no cache needed when using kernel).""" - super().__init__() - # TileLang autotune config cache: (device, M_bucket, N, K, num_groups, group_size) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int, int, int], dict] = {} - @property def name(self) -> str: return "linear_gptq_w4a16" @@ -226,101 +47,33 @@ def linear_act_format(self) -> str: return "bf16" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # GPTQ weights are stored as packed int8 (2 int4 per byte) - return torch.int8, 1 + # vLLM GPTQ stores packed weights in int32. + return torch.int32, 4 def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for GPTQ groupwise quantization. - - For [out_features, in_features] weight with group_size groups: - - scales shape is [(out_features + group_size - 1) // group_size, in_features] - """ - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - out_features, in_features = original_shape[0], original_shape[1] - group_size = kwargs.get("group_size", 128) - num_groups = (out_features + group_size - 1) // group_size - return (num_groups, in_features) - - def quantize(self, tensor: torch.Tensor, **kwargs): - """GPTQ quantization is typically done offline, so this is a placeholder.""" + # vLLM GPTQ scales: [K/group, N], where Linear weight is (N, K). + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + # Offline GPTQ is handled by `diffulex.utils.quantization.quantize_model`. + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized raise NotImplementedError( - "GPTQ quantization should be done offline using GPTQ tools. 
" - "This strategy only supports loading pre-quantized weights." + "GPTQ dequantize is not implemented in Diffulex. " + "Use vLLM kernels via linear_forward." ) - def dequantize( - self, - quantized: torch.Tensor, - scale_or_metadata: Any, - **kwargs - ) -> torch.Tensor: - """Dequantize GPTQ weights. - - Args: - quantized: Not used (kept for interface compatibility) - scale_or_metadata: Dict with keys: - - 'qweight': int8 packed int4 weights - - 'qzeros': int8 packed int4 zeros - - 'scales': float32 per-group scales - - 'out_features': int - - 'in_features': int - - 'group_size': int (default: 128) - - 'g_idx': Optional int32 group indices - **kwargs: Additional arguments - - Returns: - Dequantized tensor in bf16 - """ - if not isinstance(scale_or_metadata, dict): - raise ValueError( - "GPTQ dequantize requires dict metadata with keys: " - "qweight, qzeros, scales, out_features, in_features, group_size (optional), g_idx (optional)" - ) - - qweight = scale_or_metadata["qweight"] - qzeros = scale_or_metadata["qzeros"] - scales = scale_or_metadata["scales"] - out_features = scale_or_metadata["out_features"] - in_features = scale_or_metadata["in_features"] - group_size = scale_or_metadata.get("group_size", 128) - g_idx = scale_or_metadata.get("g_idx", None) - - return _dequantize_gptq( - qweight=qweight, - qzeros=qzeros, - scales=scales, - out_features=out_features, - in_features=in_features, - group_size=group_size, - g_idx=g_idx, - ) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """GPTQ quantization is done offline, so this should not be called.""" - raise NotImplementedError( - "GPTQ quantization should be done offline. " - "Use set_offline_quantized_weight() to load pre-quantized weights." - ) - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W4A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - def linear_forward( self, x: torch.Tensor, @@ -330,211 +83,65 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using GPTQ quantized weights (W4A16). 
- - Args: - x: Activation tensor [M, K] (bf16) - weight: Either bf16 weight [N, K] (fallback) or GPTQ metadata dict - bias: Optional bias tensor [N] - quant_kind: Quantization kind (unused) - **kwargs: May include: - - gptq_qweight: int8 packed int4 weights [N, (K+1)//2] - - gptq_qzeros: int8 packed int4 zeros [num_groups, (K+1)//2] - - gptq_scales: float32 scales [num_groups, K] - - gptq_group_size: int (default: 128) - - gptq_g_idx: Optional int32 group indices [N] - - out_features: int (N) - - in_features: int (K) - """ - _ = quant_kind - - # Check if GPTQ tensors are provided directly via kwargs - qweight = kwargs.pop("gptq_qweight", None) - qzeros = kwargs.pop("gptq_qzeros", None) - scales = kwargs.pop("gptq_scales", None) - group_size = kwargs.pop("gptq_group_size", 128) - g_idx = kwargs.pop("gptq_g_idx", None) - out_features = kwargs.pop("out_features", None) - in_features = kwargs.pop("in_features", None) - - # If GPTQ tensors are provided, use them - if qweight is not None and qzeros is not None and scales is not None: - if out_features is None or in_features is None: - # Infer from x shape - M, K = x.shape - if in_features is None: - in_features = K - if out_features is None: - # Infer from qweight shape - out_features = qweight.shape[0] - - M, K = x.shape - N = out_features - num_groups = (N + group_size - 1) // group_size - - # Handle scales shape: broadcast to [num_groups, in_features] if needed - if scales.shape == (num_groups,): - scales = scales.unsqueeze(-1).expand(num_groups, in_features) - elif scales.shape == (num_groups, 1): - scales = scales.expand(num_groups, in_features) - elif scales.shape != (num_groups, in_features): - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}), ({num_groups},), or ({num_groups}, 1)" - ) - - # Handle GIdx: if None, create sequential indices - device = qweight.device - if g_idx is None: - g_idx = torch.arange(N, device=device, dtype=torch.int32) // group_size - else: - g_idx = g_idx.to(device=device, dtype=torch.int32) - - # Ensure all tensors are on the correct device - qweight = qweight.to(device=x.device) - qzeros = qzeros.to(device=x.device) - scales = scales.to(device=x.device, dtype=torch.float32) - g_idx = g_idx.to(device=x.device) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and gptq_w4a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, g_idx=g_idx, - ) - - # M-bucketing: reduce JIT compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad - - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K, num_groups, group_size) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - with set_autotune_inputs([x_for_kernel, qweight, qzeros, scales, g_idx]): - kernel = gptq_w4a16_gemm(M_bucket, N, K, num_groups, group_size) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use 
cached config or default parameters - if config is not None: - kernel = gptq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, **config) - else: - # Default config (backward compatible) - kernel = gptq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[5] means output is the 6th parameter - output_full = kernel(x_for_kernel, qweight, qzeros, scales, g_idx) - output = output_full[:M, :] if M_bucket != M else output_full + _ = quant_kind, weight + if ops is None: + raise RuntimeError( + "vLLM is required for GPTQ W4A16 (missing `vllm._custom_ops`). " + "Please install/build vLLM with CUDA ops." + ) - # Add bias if present - if bias is not None: - output = output + bias + qweight = kwargs.get("gptq_qweight", None) + qzeros = kwargs.get("gptq_qzeros", None) + scales = kwargs.get("gptq_scales", None) + g_idx = kwargs.get("gptq_g_idx", None) - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) + if qweight is None or qzeros is None or scales is None: + return F.linear(x, weight, bias) - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - warnings.warn( - f"TileLang GPTQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Warn for unexpected errors - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"TileLang GPTQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, g_idx=g_idx, - ) - else: - # TileLang not available, use Python fallback - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, g_idx=g_idx, - ) + use_v2_format = bool(kwargs.get("gptq_use_v2_format", False)) - # Fallback: if weight is a regular bf16 tensor, use it directly - if isinstance(weight, torch.Tensor) and weight.dtype == torch.bfloat16: - return F.linear(x, weight, bias) + # Infer weight_bits from packed shapes to support GPTQ W2/W4/W8. + # qzeros: [K/group, N/pack_factor] and qweight: [K/pack_factor, N] + if qzeros.shape[1] <= 0 or qweight.shape[1] % int(qzeros.shape[1]) != 0: + raise RuntimeError( + f"Invalid GPTQ packed shapes: qweight.shape={tuple(qweight.shape)}, " + f"qzeros.shape={tuple(qzeros.shape)}" + ) + pack_factor = int(qweight.shape[1]) // int(qzeros.shape[1]) + if 32 % pack_factor != 0: + raise RuntimeError( + f"Unsupported GPTQ pack_factor={pack_factor} (requires 32%pack_factor==0). 
" + f"qweight.shape={tuple(qweight.shape)}, qzeros.shape={tuple(qzeros.shape)}" + ) + weight_bits = 32 // pack_factor - raise ValueError( - "GPTQ strategy requires gptq_qweight, gptq_qzeros, and gptq_scales to be provided " - "via kwargs or weight must be a bf16 tensor (fallback mode)" - ) + # vLLM GPTQ kernels expect FP16 activations. + x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + qweight = qweight.to(device=x.device, dtype=torch.int32) + qzeros = qzeros.to(device=x.device, dtype=torch.int32) + scales = scales.to(device=x.device, dtype=torch.float16) - def _fallback_python_forward( - self, - x: torch.Tensor, - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - *, - out_features: int, - in_features: int, - group_size: int, - g_idx: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - dequant_weight = _dequantize_gptq( - qweight=qweight.to(device=x.device), - qzeros=qzeros.to(device=x.device), - scales=scales.to(device=x.device), - out_features=out_features, - in_features=in_features, - group_size=group_size, - g_idx=g_idx.to(device=x.device) if g_idx is not None else None, + if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) + else: + g_idx_t = g_idx.to(device=x.device, dtype=torch.int) + + out_shape = x.shape[:-1] + (qweight.shape[-1],) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) + + output = ops.gptq_gemm( + reshaped_x, + qweight, + qzeros, + scales, + g_idx_t, + True, # use_exllama (vLLM shuffles weights into exllama-friendly layout) + use_v2_format, + weight_bits, ) - return F.linear(x, dequant_weight, bias) + if bias is not None: + output.add_(bias.to(dtype=output.dtype)) + output = output.reshape(out_shape) + # Keep output dtype consistent with input activations for downstream layers. + return output.to(dtype=x.dtype) if output.dtype != x.dtype else output - def clear_cache(self) -> None: - """Clear cache (no-op, kept for compatibility).""" - pass diff --git a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py index 9141437..e1b085e 100644 --- a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py @@ -1,41 +1,25 @@ """ -W4A16 Linear quantization strategy (int4 weight + bf16 activation). +W4A16 Linear quantization strategy (int4 weight + bf16 activation), TileLang-free. -Reference implementation using Python dequantization + torch.nn.functional.linear. -Int4 weights are packed into int8 (2 int4 values per int8 byte). 
+vLLM-aligned behavior: +- vLLM has no general "online int4 -> fast GEMM" path on sm89 (e.g. RTX 4090); + real int4 speedups usually rely on GPTQ/AWQ Marlin/CUTLASS kernels and the corresponding offline weight formats. +- To avoid "looks like int4 but is actually running a bf16 GEMM", the slow `F.linear` path is disabled by default rather than taken silently. -Future optimizations: -- Replace F.linear with custom Triton/TileLang kernel for int4 GEMM +To temporarily allow the correctness-first slow fallback, set the environment variable: + `DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1` """ from __future__ import annotations from typing import Any, Optional -import os import torch import torch.nn.functional as F from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -try: - from diffulex_kernel.python.linear_kernels import w4a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - _TILELANG_AVAILABLE = False - w4a16_gemm = None - -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - @register_linear_strategy(weight_dtype="int4", act_dtype="bf16") def _build_linear_int4_w4a16() -> LinearQuantizationStrategy: @@ -43,29 +27,10 @@ def _build_linear_int4_w4a16() -> LinearQuantizationStrategy: class LinearInt4W4A16Strategy(LinearQuantizationStrategy): - """W4A16 Linear strategy: int4 weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: per-output-channel symmetric quantization to int4. - Activation: kept as bf16 (no activation quantization). - - Int4 packing: Each int8 byte stores 2 int4 values (lower 4 bits and upper 4 bits). - Packed weight shape: [out_features, (in_features + 1) // 2] (int8) - - Lazy cache: Quantized weights are cached per weight tensor (by id) to avoid - re-quantizing on every forward pass. - """ - - def __init__(self): - """Initialize strategy with empty weight cache.""" + def __init__(self) -> None: super().__init__() - # Cache: weight_id -> (packed_weight_int8, scales) - # Using id(weight) as key since the same Parameter object is reused across forwards + # Cache: id(weight) -> (packed_int8 [N, ceil(K/2)], scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} @property def name(self) -> str: @@ -80,196 +45,60 @@ def linear_act_format(self) -> str: return "bf16" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # Weights are stored as int8 (1 byte per element), but each byte contains 2 int4 values - # So effective storage is 0.5 bytes per int4 weight element - return torch.int8, 1 # Physical storage is int8, but logical is int4 + return torch.int8, 1 + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: + _ = kwargs + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") + return (original_shape[0],) @staticmethod def _pack_int4_to_int8(int4_tensor: torch.Tensor) -> torch.Tensor: - """Pack int4 tensor into int8 format.
- - Args: - int4_tensor: int8 tensor with values in range [-8, 7] (representing int4) - shape: [out_features, in_features] - - Returns: - Packed int8 tensor, shape: [out_features, (in_features + 1) // 2] - Each int8 byte contains 2 int4 values: lower 4 bits (first) and upper 4 bits (second) - """ - out_features, in_features = int4_tensor.shape - - # Clamp to int4 range [-8, 7] - int4_tensor = int4_tensor.clamp(-8, 7) - - # Convert to uint8 for easier bit manipulation - # Map [-8, 7] to [0, 15] by adding 8 - uint8_tensor = (int4_tensor + 8).to(torch.uint8) - - # Pad in_features to even number if needed - if in_features % 2 != 0: - # Pad with zeros (value 8 in uint8, which represents 0 in int4) - pad_size = 1 - padding = torch.zeros(out_features, pad_size, dtype=torch.uint8, device=uint8_tensor.device) + 8 - uint8_tensor = torch.cat([uint8_tensor, padding], dim=1) - padded_in_features = in_features + pad_size - else: - padded_in_features = in_features - - # Reshape to [out_features, in_features // 2, 2] - reshaped = uint8_tensor.view(out_features, padded_in_features // 2, 2) - - # Pack: first element in lower 4 bits, second element in upper 4 bits - # packed[i, j] = reshaped[i, j, 0] | (reshaped[i, j, 1] << 4) - packed = reshaped[:, :, 0] | (reshaped[:, :, 1] << 4) - - # Convert back to int8 - return packed.to(torch.int8) + # int4_tensor: int8 [N,K] values in [-8,7] + n, k = int4_tensor.shape + t = int4_tensor.clamp(-8, 7).to(torch.int16) + u = (t + 8).to(torch.uint8) # [0,15] + if k % 2 != 0: + u = torch.cat([u, torch.full((n, 1), 8, device=u.device, dtype=torch.uint8)], dim=1) + k = k + 1 + u2 = u.view(n, k // 2, 2) + packed = (u2[:, :, 0] | (u2[:, :, 1] << 4)).to(torch.int8) + return packed.contiguous() @staticmethod - def _unpack_int8_to_int4(packed_int8: torch.Tensor, original_in_features: int) -> torch.Tensor: - """Unpack int8 tensor back to int4 format. - - Args: - packed_int8: Packed int8 tensor, shape: [out_features, packed_size] - original_in_features: Original in_features dimension (before padding) - - Returns: - Unpacked int4 tensor (as int8 with values in range [-8, 7]), shape: [out_features, original_in_features] - """ - out_features, packed_size = packed_int8.shape - - # Convert to uint8 for bit manipulation - uint8_packed = packed_int8.to(torch.uint8) - - # Extract lower and upper 4 bits - lower = uint8_packed & 0x0F # Lower 4 bits - upper = (uint8_packed >> 4) & 0x0F # Upper 4 bits - - # Stack: [out_features, packed_size, 2] - unpacked_uint8 = torch.stack([lower, upper], dim=-1) - - # Reshape to [out_features, packed_size * 2] - unpacked_uint8 = unpacked_uint8.view(out_features, packed_size * 2) - - # Slice to original size (remove padding if any) - unpacked_uint8 = unpacked_uint8[:, :original_in_features] - - # Convert back to int4 range: [0, 15] -> [-8, 7] - unpacked_int4 = unpacked_uint8.to(torch.int8) - 8 - - return unpacked_int4 - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to int4 with per-channel (per-output) scales. 
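The nibble layout used by `_pack_int4_to_int8` / `_unpack_int8_to_int4` above (even k in the low nibble, odd k in the high nibble, values offset by +8) can be checked with a standalone round trip. The sketch below is illustrative only and independent of the strategy class:

import torch

def pack_nibbles(q: torch.Tensor) -> torch.Tensor:
    # q: int8 [N, K] with values in [-8, 7]
    n, k = q.shape
    u = (q.clamp(-8, 7).to(torch.int16) + 8).to(torch.uint8)                     # map to [0, 15]
    if k % 2:
        u = torch.cat([u, torch.full((n, 1), 8, dtype=torch.uint8)], dim=1)      # pad with the encoding of zero
    u = u.view(n, -1, 2)
    return (u[:, :, 0] | (u[:, :, 1] << 4)).to(torch.int8)                       # even k -> low nibble, odd k -> high nibble

def unpack_nibbles(packed: torch.Tensor, k: int) -> torch.Tensor:
    p = packed.view(torch.uint8)
    low = (p & 0x0F).to(torch.int16) - 8
    high = ((p >> 4) & 0x0F).to(torch.int16) - 8
    out = torch.stack([low, high], dim=-1).reshape(packed.shape[0], -1)
    return out[:, :k].to(torch.int8)

q = torch.randint(-8, 8, (4, 7), dtype=torch.int8)
assert torch.equal(unpack_nibbles(pack_nibbles(q), 7), q)

# Per-output-channel symmetric int4 scales (scale = absmax / 7) bound the
# quantization error by half a step per element, mirroring quantize() below:
w = torch.randn(4, 32)
scales = w.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
w_hat = torch.round(w / scales).clamp(-8, 7) * scales
assert torch.all((w - w_hat).abs() <= scales / 2 + 1e-6)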
- - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (packed_weight_int8, scales): - - packed_weight_int8: int8 tensor shape [out_features, (in_features + 1) // 2] - - scales: [out_features] - """ - _ = kwargs - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - # Avoid division by zero - scales = abs_max.clamp(min=1e-8) / 7.0 # [out_features, 1] (int4 range is -8 to 7, so max abs is 7) - - # Quantize: round(clamp(tensor / scales, -8, 7)) - quantized_int4 = torch.round(tensor / scales).clamp(-8, 7).to(torch.int8) - scales_1d = scales.squeeze(-1) # [out_features] - - # Pack int4 into int8 - packed_weight = self._pack_int4_to_int8(quantized_int4) - - return packed_weight, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize packed int4 tensor back to bf16 using per-channel scales. - - Args: - quantized: Packed int8 tensor [out_features, packed_size] - scale_or_metadata: scales tensor [out_features] or dict with 'scales' and 'original_in_features' - **kwargs: Additional arguments, may include 'original_in_features' - - Returns: - Dequantized tensor in bf16, shape [out_features, original_in_features] - """ + def _unpack_int8_to_int4(packed: torch.Tensor, *, original_k: int) -> torch.Tensor: + # packed: int8 [N, ceil(K/2)] (two nibbles per byte) + p = packed.view(torch.uint8) + low = (p & 0x0F).to(torch.int16) - 8 + high = ((p >> 4) & 0x0F).to(torch.int16) - 8 + n, pk = packed.shape + out = torch.empty((n, pk * 2), device=packed.device, dtype=torch.int16) + out[:, 0::2] = low + out[:, 1::2] = high + return out[:, :original_k].to(torch.int8).contiguous() + + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - original_in_features = scale_or_metadata.get("original_in_features") - else: - scales = scale_or_metadata - # Try to infer original_in_features from quantized shape - # packed_size = (in_features + 1) // 2, so in_features = packed_size * 2 or packed_size * 2 - 1 - packed_size = quantized.shape[1] - # We'll use the maximum possible (packed_size * 2), caller should provide original_in_features if needed - original_in_features = packed_size * 2 - + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + w = tensor.to(torch.bfloat16) + abs_max = w.abs().amax(dim=-1, keepdim=True) # [N,1] + scales = (abs_max.clamp(min=1e-8) / 7.0).to(torch.float32).squeeze(-1) # [N] + q = torch.round(w.to(torch.float32) / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8) + packed = self._pack_int4_to_int8(q) + return packed, {"scales": scales} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + original_k = int(kwargs.get("original_in_features", 0)) + if original_k <= 0: + raise ValueError("original_in_features is required to dequantize int4 weights") + scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata if scales is None: raise ValueError("scales required for dequantization") - - # Get original_in_features from kwargs if provided - original_in_features = kwargs.get("original_in_features", 
original_in_features) - - # Unpack int4 from int8 - unpacked_int4 = self._unpack_int8_to_int4(quantized, original_in_features) - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = unpacked_int4.to(torch.float32) * scales - return dequantized.to(torch.bfloat16) - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """Quantize weight to int4 (packed as int8) with per-channel scales. - - Returns: - (packed_weight_int8, scales): - - packed_weight_int8: int8 [out, (in + 1) // 2] - - scales: [out] - """ - _ = kwargs - if device is not None: - weight = weight.to(device=device) - - packed_weight, scales = self.quantize(weight) - return packed_weight, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W4A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None + q = self._unpack_int8_to_int4(quantized, original_k=original_k).to(torch.float32) + w = q * scales.to(torch.float32).unsqueeze(-1) + return w.to(torch.bfloat16) def linear_forward( self, @@ -280,241 +109,31 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using quantized weights (W4A16). - - Uses Python reference implementation (dequant + F.linear). - Future: Replace with TileLang kernel for int4 GEMM. - - Args: - x: Activation tensor [M, K] (bf16) - weight: Either bf16 weight [N, K] or packed int8 weight [N, (K + 1) // 2] - bias: Optional bias tensor [N] - quant_kind: Quantization kind (unused) - **kwargs: May include quant_scales and original_in_features for load-time quantized weights - """ _ = quant_kind - - # If caller provides a pre-quantized packed int8 weight + scales (e.g., load-time quantized module), - # use them directly and DO NOT populate the lazy cache (to avoid double-storage). - quant_scales = kwargs.pop("quant_scales", None) - original_in_features = kwargs.pop("original_in_features", None) - - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 (packed int4) but quant_scales is None; expected per-channel scales tensor") - # We have activation K; that's the real in_features for this matmul. - # Using packed_size*2 is fragile (it breaks if the int4 weights are stored "unpacked" as int8[N, K]). - M, K = x.shape - if original_in_features is None: - original_in_features = K - - # Accept both representations: - # - packed int4: int8[N, (K+1)//2] where each byte holds 2 int4 - # - unpacked int4: int8[N, K] where each element is an int4 value stored in int8 - expected_packed_K = (K + 1) // 2 - if weight.shape[1] == expected_packed_K: - packed_weight = weight - elif weight.shape[1] == K: - # Unpacked int4 -> pack on-the-fly so we can use the same kernel path. 
-                # Support both [-8, 7] (signed int4) and [0, 15] (uint4 stored in int8).
-                w = weight
-                if (w.min() >= 0) and (w.max() <= 15):
-                    w = (w.to(torch.int16) - 8).to(torch.int8)
-                packed_weight = self._pack_int4_to_int8(w)
-            else:
-                raise ValueError(
-                    f"Unexpected int4 weight shape for int8 weight: got {tuple(weight.shape)}, "
-                    f"expected (N,{expected_packed_K}) for packed or (N,{K}) for unpacked."
-                )
-            scales = quant_scales
-            if scales.dtype != torch.bfloat16:
-                scales = scales.to(dtype=torch.bfloat16)
-            if packed_weight.device != x.device:
-                packed_weight = packed_weight.to(device=x.device)
-            if scales.device != x.device:
-                scales = scales.to(device=x.device)
+        if not bool(int(__import__("os").environ.get("DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK", "0"))):
+            raise RuntimeError(
+                "Online `int4` quantization has no fast vLLM kernel on this platform/config (e.g. no CUTLASS W4A8 on 4090/sm89). "
+                "The `F.linear` slow fallback is disabled to avoid silently degrading to a bf16 GEMM. "
+                "Use `gptq/awq` (vLLM standard packed formats) instead, or set DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1 to enable it temporarily."
+            )
+        original_k = int(kwargs.get("original_in_features", x.shape[-1]))
+        quant_scales = kwargs.get("quant_scales", None)
+
+        if weight is not None and weight.dtype == torch.int8 and quant_scales is not None:
+            packed = weight.to(device=x.device)
+            scales = quant_scales.to(device=x.device, dtype=torch.float32)
         else:
-            # Lazy cache: use weight tensor id as key (only for bf16/fp16 weights)
-            weight_id = id(weight)
-
-            # Check cache
-            if weight_id in self._weight_cache:
-                packed_weight, scales = self._weight_cache[weight_id]
-                # Ensure cached tensors are on the correct device
-                if packed_weight.device != x.device:
-                    packed_weight = packed_weight.to(device=x.device)
-                    scales = scales.to(device=x.device)
-                # Get original_in_features from cached metadata or infer
-                if original_in_features is None:
-                    # Infer: packed_size = (in_features + 1) // 2
-                    packed_size = packed_weight.shape[1]
-                    original_in_features = packed_size * 2
+            wid = id(weight)
+            cached = self._weight_cache.get(wid)
+            if cached is None or cached[0].device != x.device:
+                packed, meta = self.quantize(weight)
+                packed = packed.to(device=x.device)
+                scales = meta["scales"].to(device=x.device, dtype=torch.float32)
+                self._weight_cache[wid] = (packed, scales)
            else:
-                # Quantize weight and cache it
-                packed_weight, scales = self.quantize_weight_for_kernel(weight, device=x.device)
-                # Cache the packed weight and scales
-                self._weight_cache[weight_id] = (packed_weight, scales)
-                # Store original_in_features for later use
-                original_in_features = weight.shape[1]
-
-        # Speed-first option:
-        # If enabled, dequantize once and reuse a cached bf16 weight for F.linear (cuBLAS).
-        # This trades extra GPU memory for throughput.
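A usage sketch of the guarded fallback above, under the assumption that the strategy class is importable from the module this hunk patches; the shapes and the `quant_kind` value are arbitrary:

import os
import torch
from diffulex.utils.quantization.strategies.linear_int4_w4a16 import LinearInt4W4A16Strategy

os.environ["DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK"] = "1"   # explicitly opt in to dequant + F.linear

strategy = LinearInt4W4A16Strategy()
w = torch.randn(128, 64, dtype=torch.bfloat16)           # bf16 [N, K] weight, quantized lazily and cached
x = torch.randn(4, 64, dtype=torch.bfloat16)             # bf16 activations [M, K]
y = strategy.linear_forward(x, w, None, quant_kind="other")
assert y.shape == (4, 128) and y.dtype == torch.bfloat16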
- if os.getenv("DIFFULEX_W4A16_PREFER_CUBLAS", "0") == "1": - deq_key = id(weight) - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - deq_w = self.dequantize( - packed_weight, - scales, - original_in_features=original_in_features, - ) - if deq_w.device != x.device: - deq_w = deq_w.to(device=x.device) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and w4a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x, packed_weight, scales, bias, original_in_features=original_in_features) - - # Check CUDA compute capability (skip kernel if unsupported) - try: - if torch.cuda.is_available(): - props = torch.cuda.get_device_properties(x.device.index or 0) - compute_cap = (props.major, props.minor) - # Let TileLang handle the check and fallback gracefully - pass - except Exception: - # If we can't check compute capability, still try the kernel - pass - - # Get shapes - M, K = x.shape - N, packed_K = packed_weight.shape - # Verify packed_K matches expected packed size for K - expected_packed_K = (original_in_features + 1) // 2 - assert packed_K == expected_packed_K, f"Packed K dimension mismatch: {packed_K} != {expected_packed_K}" - - # Reduce TileLang JIT compilation churn without killing small-M decode performance. - # Previous logic padded *any* M!=1 to 64/128/256, which can turn decode M=2/4 into M=64. - # We instead bucket to a small stable set: - # - for M<=64: next power-of-two (2,4,8,16,32,64) - # - for M>64: round up to a multiple of 64 - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad + packed, scales = cached - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - with set_autotune_inputs([x_for_kernel, packed_weight, scales]): - kernel = w4a16_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - if config is not None: - kernel = w4a16_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - kernel = w4a16_gemm(M_bucket, N, K, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[3] means output is the 4th parameter, - # so we only pass inputs (x, packed_weight, scales), and kernel returns output - output_full = kernel(x_for_kernel, packed_weight, scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if bias is not None: - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) - - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = 
error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - warnings.warn( - f"TileLang W4A16 kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Warn for unexpected errors - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"TileLang W4A16 kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward(x, packed_weight, scales, bias, original_in_features=original_in_features) - else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x, packed_weight, scales, bias, original_in_features=original_in_features) - - def _fallback_python_forward( - self, - x: torch.Tensor, - packed_weight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - *, - original_in_features: int, - ) -> torch.Tensor: - """Fallback Python implementation: unpack + dequantize + F.linear.""" - # Unpack and dequantize - dequantized_weight = self.dequantize( - packed_weight, - scales, - original_in_features=original_in_features - ) - - # Compute linear output - return F.linear(x, dequantized_weight, bias) - - def clear_cache(self) -> None: - """Clear the weight quantization cache. - - Useful for memory management or when weights are updated (e.g., fine-tuning). - """ - self._weight_cache.clear() - self._dequant_weight_cache.clear() + # Slow fallback (explicitly opted-in). + w_deq = self.dequantize(packed, {"scales": scales}, original_in_features=original_k) + return F.linear(x, w_deq, bias) diff --git a/diffulex/utils/quantization/strategies/linear_int4_w4a8.py b/diffulex/utils/quantization/strategies/linear_int4_w4a8.py index f2287e0..decb19d 100644 --- a/diffulex/utils/quantization/strategies/linear_int4_w4a8.py +++ b/diffulex/utils/quantization/strategies/linear_int4_w4a8.py @@ -1,145 +1,25 @@ """ -W4A8 Linear quantization strategy (int4 weight + int8 activation). +W4A8 Linear quantization strategy (int4 weight + int8 activation), TileLang-free. -Notes: -- Weight is per-output-channel symmetric int4 packed into int8 (2 values per byte), with per-channel scales. -- Activation is quantized per-row to int8 with per-row scales. -- GEMM is performed by unpacking int4 -> int8 and using `torch._int_mm` (int8 x int8 -> int32). - For now we cache the unpacked (and transposed) weight to avoid repeated unpack. -- If int8 GEMM is not available, we fall back to unpack+dequant BF16 + cuBLAS (F.linear). 
+vLLM-aligned behavior:
+- vLLM's CUTLASS W4A8 kernel requires sm90 (Hopper); it is unavailable on sm89 (e.g. RTX 4090).
+- To avoid silently degrading to a bf16 GEMM, the `F.linear` slow fallback is disabled by default.
+
+To temporarily allow the correctness-first slow fallback, set:
+  `DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1`
 """
 
 from __future__ import annotations
 
 from typing import Any, Optional
 
-import os
-import warnings
-
 import torch
 import torch.nn.functional as F
 
-from diffulex.attention.metadata import is_warming_up
 from diffulex.utils.quantization.registry import register_linear_strategy
 from diffulex.utils.quantization.strategy import LinearQuantizationStrategy
 
-try:
-    from diffulex_kernel.python.linear_kernels import (
-        w4a8_gemm,
-        w4a8_scaled_gemm,
-        w4a8_fused_act_gemm,
-        w8a8_act_quant,
-    )
-    _TILELANG_AVAILABLE = True
-except ImportError:
-    _TILELANG_AVAILABLE = False
-    w4a8_gemm = None
-    w4a8_scaled_gemm = None
-    w8a8_act_quant = None
-    w4a8_fused_act_gemm = None
-
-try:
-    # Optional: only needed for TileLang autotune warmup.
-    from tilelang.autotuner import set_autotune_inputs  # type: ignore
-except Exception:
-    set_autotune_inputs = None
-
-
-_DEFAULT_TL_LINEAR_CFG: dict[str, Any] = {
-    "block_M": 64,
-    "block_N": 64,
-    "block_K": 128,
-    "num_stages": 2,
-    "threads": 128,
-}
-
-
-def _quantize_per_row_int8_torch(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    abs_max = x.abs().amax(dim=-1, keepdim=False)  # [M]
-    scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float32)  # [M]
-    x_q = torch.round(x.to(torch.float32) / scales.unsqueeze(-1)).clamp(-127, 127).to(torch.int8)
-    return x_q, scales
-
-
-def _quantize_per_row_int8(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    """Per-row symmetric int8 quantization with optional TileLang fused kernel.
-
-    Default: use TileLang fused kernel if available, otherwise fall back to torch ops.
-
-    Env:
-      - DIFFULEX_W4A8_USE_TL_ACT_QUANT=0 to force torch fallback.
-    """
-    use_tl = os.getenv("DIFFULEX_W4A8_USE_TL_ACT_QUANT", "1") == "1"
-    if (
-        use_tl
-        and _TILELANG_AVAILABLE
-        and (w8a8_act_quant is not None)
-        and x.is_cuda
-        and x.dtype == torch.bfloat16
-        and x.is_contiguous()
-        and x.dim() == 2
-    ):
-        m, k = x.shape
-        if m <= 16:
-            block_m = 16
-        elif m <= 32:
-            block_m = 32
-        else:
-            block_m = 64
-        try:
-            kernel = w8a8_act_quant(
-                m,
-                k,
-                block_M=block_m,
-                block_K=256,
-                threads=128,
-            )
-            x_q, scales = kernel(x)
-            return x_q, scales
-        except Exception:
-            pass
-    return _quantize_per_row_int8_torch(x)
-
-
-def _int8_mm(a_int8: torch.Tensor, b_int8: torch.Tensor) -> torch.Tensor:
-    if hasattr(torch, "_int_mm"):
-        return torch._int_mm(a_int8, b_int8)
-    if hasattr(torch.ops.aten, "_int_mm"):
-        return torch.ops.aten._int_mm(a_int8, b_int8)
-    raise RuntimeError("No int8 GEMM backend found (torch._int_mm / aten._int_mm missing)")
-
-
-def _unpack_int4_packed_int8(packed: torch.Tensor, *, original_in_features: int) -> torch.Tensor:
-    """Unpack int4 weights stored in int8 bytes (2 nibbles per byte) into int8 values in [-8, 7].
-
-    Args:
-        packed: int8 [N, ceil(K/2)]
-        original_in_features: K
-    Returns:
-        unpacked: int8 [N, K]
-    """
-    if packed.dtype != torch.int8:
-        raise TypeError(f"packed weight must be int8, got {packed.dtype}")
-    N, packed_K = packed.shape
-    expected = (original_in_features + 1) // 2
-    if packed_K != expected:
-        raise ValueError(f"Packed K mismatch: got {packed_K}, expected {expected} for K={original_in_features}")
-
-    # Interpret bytes as uint8 so we can shift/mask predictably.
- p_u8 = packed.view(torch.uint8) - low = (p_u8 & 0x0F).to(torch.int16) - high = ((p_u8 >> 4) & 0x0F).to(torch.int16) - - # Convert unsigned nibble [0..15] to signed int4 [-8..7] - low_s = torch.where(low >= 8, low - 16, low) - high_s = torch.where(high >= 8, high - 16, high) - - # Interleave low/high along K - out = torch.empty((N, packed_K * 2), device=packed.device, dtype=torch.int16) - out[:, 0::2] = low_s - out[:, 1::2] = high_s - out = out[:, :original_in_features].to(torch.int8) - return out +from .linear_int4_w4a16 import LinearInt4W4A16Strategy @register_linear_strategy(weight_dtype="int4", act_dtype="int8") @@ -148,17 +28,9 @@ def _build_linear_int4_w4a8() -> LinearQuantizationStrategy: class LinearInt4W4A8Strategy(LinearQuantizationStrategy): - def __init__(self): + def __init__(self) -> None: super().__init__() - # bf16 weight id -> (packed_int8[N,ceil(K/2)], scales_bf16[N]) - self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # (packed_id, K) -> unpacked_int8[N,K] - self._unpacked_cache: dict[tuple[int, int], torch.Tensor] = {} - # (packed_id, K) -> unpacked_t_int8[K,N] - self._unpacked_t_cache: dict[tuple[int, int], torch.Tensor] = {} - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # (device_index, M_bucket, N, K) -> TileLang config dict for fused kernel - self._tl_fused_cfg_cache: dict[tuple[int, int, int, int], dict[str, Any]] = {} + self._w4a16 = LinearInt4W4A16Strategy() @property def name(self) -> str: @@ -173,71 +45,16 @@ def linear_act_format(self) -> str: return "int8" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # stored as packed int8 bytes (2 weights per byte) return torch.int8, 1 - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: + return self._w4a16.get_scale_shape(original_shape, **kwargs) - def clear_cache(self) -> None: - self._weight_cache.clear() - self._unpacked_cache.clear() - self._unpacked_t_cache.clear() - self._dequant_weight_cache.clear() - self._tl_fused_cfg_cache.clear() + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: + return self._w4a16.quantize(tensor, **kwargs) - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - _ = kwargs - # Per-output-channel symmetric int4 quantization: scale = absmax/7 - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [N,1] - # Keep scales in fp16 to reduce scale quantization error (A8 paths are sensitive). - scales = (abs_max.clamp(min=1e-8) / 7.0).to(torch.float16) # [N,1] - q = torch.round(tensor / scales).clamp(-8, 7).to(torch.int16) # [N,K] - - # Pack two int4 into one byte: low nibble for even k, high nibble for odd k. 
- N, K = q.shape - packed_K = (K + 1) // 2 - q_even = q[:, 0::2] - q_odd = q[:, 1::2] - if q_odd.shape[1] != q_even.shape[1]: - q_odd = torch.nn.functional.pad(q_odd, (0, 1), value=0) - - q_even_u = (q_even & 0x0F).to(torch.uint8) - q_odd_u = (q_odd & 0x0F).to(torch.uint8) - packed_u8 = q_even_u | (q_odd_u << 4) # [N, packed_K] - packed_i8 = packed_u8.view(torch.int8) - return packed_i8, scales.squeeze(-1) - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - original_in_features = kwargs.get("original_in_features", None) - if original_in_features is None: - raise ValueError("original_in_features is required for int4 dequantize") - scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata - if scales is None: - raise ValueError("scales required for dequantization") - w_i8 = _unpack_int4_packed_int8(quantized, original_in_features=original_in_features) # [N,K] - deq = w_i8.to(torch.float32) * scales.to(torch.float32).unsqueeze(-1) - return deq.to(torch.bfloat16) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - _ = kwargs - if device is not None: - weight = weight.to(device=device) - return self.quantize(weight) + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + return self._w4a16.dequantize(quantized, scale_or_metadata, **kwargs) def linear_forward( self, @@ -249,259 +66,12 @@ def linear_forward( **kwargs: Any, ) -> torch.Tensor: _ = quant_kind - quant_scales = kwargs.pop("quant_scales", None) - original_in_features = kwargs.pop("original_in_features", None) - if original_in_features is None: - raise ValueError("W4A8 requires original_in_features for packed int4 weights") - - # Resolve / cache packed weight + scales - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 (packed int4) but quant_scales is None") - packed = weight if weight.device == x.device else weight.to(device=x.device) - w_scales = quant_scales - # Prefer fp16 scales for quality (and fused kernel expects fp16 scales). - if w_scales.dtype != torch.float16: - w_scales = w_scales.to(dtype=torch.float16) - if w_scales.device != x.device: - w_scales = w_scales.to(device=x.device) - weight_id = id(weight) - else: - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None: - packed, w_scales = self.quantize_weight_for_kernel(weight, device=x.device) - self._weight_cache[weight_id] = (packed, w_scales) - else: - packed, w_scales = cached - if packed.device != x.device: - packed = packed.to(device=x.device) - w_scales = w_scales.to(device=x.device) - self._weight_cache[weight_id] = (packed, w_scales) - - # Optional: dequant once and use cuBLAS BF16 - if os.getenv("DIFFULEX_W4A8_PREFER_CUBLAS", "0") == "1": - deq_key = weight_id - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - deq_w = self.dequantize(packed, w_scales, original_in_features=original_in_features) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Quantize activation per-row to int8 - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - if x.dtype != torch.bfloat16: - x = x.to(torch.bfloat16) - - # Try TileLang fused quant + GEMM first (bf16 activation input). 
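Because the W4A8 strategy above composes `LinearInt4W4A16Strategy` rather than re-implementing the packing, both strategies produce byte-identical packed weights and scales. A small check, assuming both modules are importable as laid out in this patch:

import torch
from diffulex.utils.quantization.strategies.linear_int4_w4a16 import LinearInt4W4A16Strategy
from diffulex.utils.quantization.strategies.linear_int4_w4a8 import LinearInt4W4A8Strategy

w = torch.randn(16, 32, dtype=torch.bfloat16)
packed_a, meta_a = LinearInt4W4A16Strategy().quantize(w)
packed_b, meta_b = LinearInt4W4A8Strategy().quantize(w)      # delegates to the W4A16 implementation
assert torch.equal(packed_a, packed_b)
assert torch.equal(meta_a["scales"], meta_b["scales"])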
- use_fused = os.getenv("DIFFULEX_W4A8_USE_TL_FUSED_GEMM", "1") == "1" - if ( - use_fused - and _TILELANG_AVAILABLE - and (w4a8_fused_act_gemm is not None) - and x.is_cuda - and x.dtype == torch.bfloat16 - and x.dim() == 2 - and x.is_contiguous() - ): - try: - M, K = x.shape - N, packed_K = packed.shape - expected_packed_K = (original_in_features + 1) // 2 - assert packed_K == expected_packed_K, ( - f"Packed K mismatch: got {packed_K}, expected {expected_packed_K} for K={original_in_features}" - ) - - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.bfloat16) - x_pad[:M, :] = x - x_for_kernel = x_pad - - dev_idx = x.device.index or 0 - cfg_key = (dev_idx, M_bucket, N, original_in_features) - cfg = self._tl_fused_cfg_cache.get(cfg_key) - kernel = None - - # TileLang autotune (warmup-only): we set real inputs so the autotuner can benchmark configs. - if cfg is None and is_warming_up() and set_autotune_inputs is not None: - try: - with set_autotune_inputs([x_for_kernel, packed, w_scales]): - kernel = w4a8_fused_act_gemm(M_bucket, N, original_in_features) - cfg = kernel.config - self._tl_fused_cfg_cache[cfg_key] = cfg - except Exception: - # Cache a safe default to avoid retriggering autotune for this key. - cfg = _DEFAULT_TL_LINEAR_CFG - self._tl_fused_cfg_cache[cfg_key] = cfg - - if cfg is None: - cfg = _DEFAULT_TL_LINEAR_CFG - self._tl_fused_cfg_cache[cfg_key] = cfg - - if kernel is None: - kernel = w4a8_fused_act_gemm(M_bucket, N, original_in_features, **cfg) - out_full = kernel(x_for_kernel, packed, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - if bias is not None: - out = out + bias - return out - except Exception as e: - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"W4A8 fused quant GEMM failed, falling back to quantize+GEMM: {error_msg}", - UserWarning, - ) - - # Step-local cache for activation quantization (reuse within one step for QKV/gate-up, etc.) 
- use_cache = os.getenv("DIFFULEX_W4A8_ACT_QUANT_CACHE", "1") == "1" - cached = None - if use_cache: - try: - from diffulex.utils.quantization.context import get_cached_act_quant, set_cached_act_quant - cached = get_cached_act_quant(x) - except Exception: - cached = None - if cached is not None: - x_q, x_scales = cached - else: - x_q, x_scales = _quantize_per_row_int8(x) - if use_cache: - try: - set_cached_act_quant(x, x_q, x_scales) - except Exception: - pass - if x_q.device != x.device: - x_q = x_q.to(device=x.device) - x_scales = x_scales.to(device=x.device) - - # Get shapes - M, K = x_q.shape - N, packed_K = packed.shape - expected_packed_K = (original_in_features + 1) // 2 - assert packed_K == expected_packed_K, f"Packed K mismatch: got {packed_K}, expected {expected_packed_K} for K={original_in_features}" - - # Try TileLang kernel first if available (uses packed weights directly) - if _TILELANG_AVAILABLE and (w4a8_scaled_gemm is not None or w4a8_gemm is not None): - try: - # Check device - if x.device.type != 'cuda': - # Fall through to _int8_mm fallback - pass - else: - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_q_for_kernel = x_q - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8) - x_pad[:M, :] = x_q - x_q_for_kernel = x_pad - x_scales_pad = torch.zeros((M_bucket,), device=x.device, dtype=torch.float32) - x_scales_pad[:M] = x_scales.to(torch.float32) - x_scales_for_kernel = x_scales_pad - else: - x_scales_for_kernel = x_scales.to(torch.float32) - - # Prefer fused-scale kernel: outputs bf16 directly. - if w4a8_scaled_gemm is not None: - kernel = w4a8_scaled_gemm( - M_bucket, - N, - original_in_features, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_full = kernel(x_q_for_kernel, packed, x_scales_for_kernel, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - else: - # Fallback to int32-output kernel + python scaling - kernel = w4a8_gemm( - M_bucket, - N, - original_in_features, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_i32_full = kernel(x_q_for_kernel, packed) - out_i32 = out_i32_full[:M, :] if M_bucket != M else out_i32_full - - out_fp32 = out_i32.to(torch.float32) - out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1) - out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0) - out = out_fp32.to(torch.bfloat16) - - if bias is not None: - out = out + bias - return out - except Exception as e: - # Fallback to _int8_mm on any kernel error - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." 
-            warnings.warn(f"W4A8 TileLang kernel failed, falling back to torch._int_mm: {error_msg}", UserWarning)
-
-        # Fallback: unpack weight and use torch._int_mm
-        # Unpack weight to int8 and cache
-        packed_key = (id(packed), int(original_in_features))
-        w_i8 = self._unpacked_cache.get(packed_key)
-        if w_i8 is None or w_i8.device != x.device:
-            w_i8 = _unpack_int4_packed_int8(packed, original_in_features=original_in_features)
-            self._unpacked_cache[packed_key] = w_i8
-
-        wt = self._unpacked_t_cache.get(packed_key)
-        if wt is None or wt.device != x.device:
-            wt = w_i8.t().contiguous()
-            self._unpacked_t_cache[packed_key] = wt
-
-        # Pad small M for backend constraints (M > 16)
-        if M <= 16:
-            M_bucket = 17
-            x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8)
-            x_pad[:M, :] = x_q
-            x_q_for_mm = x_pad
-        else:
-            x_q_for_mm = x_q
-
-        try:
-            out_i32_full = _int8_mm(x_q_for_mm, wt)
-        except Exception as e:
-            msg = str(e)
-            if len(msg) > 200:
-                msg = msg[:200] + "..."
-            warnings.warn(f"W4A8 int8 GEMM failed, falling back to BF16 F.linear: {msg}", UserWarning)
-            deq_w = self.dequantize(packed, w_scales, original_in_features=original_in_features)
-            return F.linear(x, deq_w, bias)
-
-        out_i32 = out_i32_full[:M, :] if M <= 16 else out_i32_full
-        out_fp32 = out_i32.to(torch.float32)
-        out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1)
-        out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0)
-        out = out_fp32.to(torch.bfloat16)
-        if bias is not None:
-            out = out + bias
-        return out
-
+        if not bool(int(__import__("os").environ.get("DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK", "0"))):
+            raise RuntimeError(
+                "Online `int4` quantization has no fast vLLM kernel on this platform/config (e.g. no CUTLASS W4A8 on 4090/sm89). "
+                "The `F.linear` slow fallback is disabled to avoid silently degrading to a bf16 GEMM. "
+                "Use `gptq/awq` (vLLM standard packed formats) instead, or set DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1 to enable it temporarily."
+            )
+        # Correctness-first: reuse W4A16 implementation.
+        return self._w4a16.linear_forward(x, weight, bias, quant_kind="other", **kwargs)
diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_int8_w8a16.py
index d3e4db9..67ab104 100644
--- a/diffulex/utils/quantization/strategies/linear_int8_w8a16.py
+++ b/diffulex/utils/quantization/strategies/linear_int8_w8a16.py
@@ -1,546 +1,29 @@
 """
 W8A16 Linear quantization strategy (int8 weight + bf16 activation).
 
-Reference implementation using Python dequantization + torch.nn.functional.linear.
-Future optimizations:
-- Lazy cache quantized weights per module instance
-- Replace F.linear with custom Triton/TileLang kernel for int8 GEMM
+This path is now implemented by reusing Diffulex's marlin(AllSpark)-style W8A16
+strategy, which matches vLLM's effective fast path and avoids TileLang.
""" from __future__ import annotations -from typing import Any, Optional - -import os -import torch -import torch.nn.functional as F - from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -try: - from diffulex_kernel.python.linear_kernels import w8a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - _TILELANG_AVAILABLE = False - w8a16_gemm = None +from .linear_marlin_int8_w8a16 import LinearMarlinInt8W8A16Strategy -try: - from diffulex_kernel.python.linear_kernels import w8a16_gemm_bias -except ImportError: - w8a16_gemm_bias = None -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f +class LinearInt8W8A16Strategy(LinearMarlinInt8W8A16Strategy): + """ + Compatibility alias for the historical Diffulex strategy name. + + This keeps the registry and `strategies.__init__` imports stable while + reusing the vLLM-aligned marlin(AllSpark) W8A16 implementation. + """ @register_linear_strategy(weight_dtype="int8", act_dtype="bf16") def _build_linear_int8_w8a16() -> LinearQuantizationStrategy: + # Alias to marlin(AllSpark) W8A16 implementation. return LinearInt8W8A16Strategy() - -class LinearInt8W8A16Strategy(LinearQuantizationStrategy): - """W8A16 Linear strategy: int8 weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: per-output-channel symmetric quantization to int8. - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Quantized weights are cached per weight tensor (by id) to avoid - re-quantizing on every forward pass. 
- """ - - def __init__(self): - """Initialize strategy with empty weight cache.""" - super().__init__() - # Cache: weight_id -> (quantized_weight, scales) - # Using id(weight) as key since the same Parameter object is reused across forwards - self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # bias cache for fused-bias kernel (store fp16 copy on device) - self._bias_f16_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} - # Lightweight runtime observability (opt-in by env var) - self._rt_call_count: int = 0 - self._rt_fallback_count: int = 0 - self._rt_m_hist_le64: dict[int, int] = {} - - def _rt_enabled(self) -> bool: - return os.getenv("DIFFULEX_LINEAR_PROFILE", "0") == "1" - - def _rt_log_every(self) -> int: - try: - return int(os.getenv("DIFFULEX_LINEAR_PROFILE_EVERY", "200")) - except Exception: - return 200 - - def _rt_on_call(self, *, m: int, n: int, k: int) -> None: - if not self._rt_enabled(): - return - self._rt_call_count += 1 - if m <= 64: - self._rt_m_hist_le64[m] = self._rt_m_hist_le64.get(m, 0) + 1 - every = self._rt_log_every() - if every > 0 and (self._rt_call_count % every == 0): - top = sorted(self._rt_m_hist_le64.items(), key=lambda kv: (-kv[1], kv[0]))[:8] - top_str = ", ".join([f"M={mm}:{cc}" for mm, cc in top]) if top else "empty" - print( - f"[DIFFULEX_LINEAR_PROFILE][w8a16] calls={self._rt_call_count} " - f"fallbacks={self._rt_fallback_count} last(M,N,K)=({m},{n},{k}) " - f"M_hist_le64_top={top_str}", - flush=True, - ) - - def _rt_on_fallback(self, *, m: int, n: int, k: int, reason: str) -> None: - if not self._rt_enabled(): - return - self._rt_fallback_count += 1 - # Avoid spam: only print first few fallbacks, then rely on periodic summary. - max_print = 5 - try: - max_print = int(os.getenv("DIFFULEX_LINEAR_FALLBACK_MAX_PRINT", "5")) - except Exception: - pass - if self._rt_fallback_count <= max_print: - print( - f"[DIFFULEX_LINEAR_PROFILE][w8a16][FALLBACK] " - f"count={self._rt_fallback_count} (M,N,K)=({m},{n},{k}) reason={reason}", - flush=True, - ) - - @property - def name(self) -> str: - return "linear_int8_w8a16" - - @property - def linear_weight_format(self) -> str: - return "int8" - - @property - def linear_act_format(self) -> str: - return "bf16" - - def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # Weights are stored as int8 (1 byte per element) - return torch.int8, 1 - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to int8 with per-channel (per-output) scales. 
- - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (quantized_tensor, scales): quantized_tensor is int8, scales is [out_features] - """ - _ = kwargs - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - # Avoid division by zero - scales = abs_max.clamp(min=1e-8) / 127.0 # [out_features, 1] - - # Quantize: round(clamp(tensor / scales, -128, 127)) - quantized = torch.round(tensor / scales).clamp(-128, 127).to(torch.int8) - scales_1d = scales.squeeze(-1) # [out_features] - - return quantized, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize int8 tensor back to bf16 using per-channel scales. - - Args: - quantized: int8 tensor [out_features, in_features] - scale_or_metadata: scales tensor [out_features] or dict with 'scales' - **kwargs: Additional arguments (unused for now) - - Returns: - Dequantized tensor in bf16 - """ - _ = kwargs - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - else: - scales = scale_or_metadata - - if scales is None: - raise ValueError("scales required for dequantization") - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = quantized.to(torch.float32) * scales - return dequantized.to(torch.bfloat16) - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """Quantize weight to int8 with per-channel scales. - - Returns: - (quantized_weight, scales): quantized_weight is int8 [out, in], scales is [out] - """ - _ = kwargs - if device is not None: - weight = weight.to(device=device) - - quantized, scales = self.quantize(weight) - return quantized, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W8A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - - def linear_forward( - self, - x: torch.Tensor, - weight: torch.Tensor, - bias: Optional[torch.Tensor], - *, - quant_kind: str, - **kwargs: Any, - ) -> torch.Tensor: - """Compute Linear output using quantized weights (W8A16). - - Uses TileLang kernel if available and conditions are met, otherwise falls back - to Python reference implementation (dequant + F.linear). 
- - Conditions for using TileLang kernel: - - TileLang is available - - Device is CUDA - - (Kernel supports tail sizes; no K%128 constraint required) - """ - _ = quant_kind - - # If caller provides a pre-quantized int8 weight + scales (e.g., load-time quantized module), - # use them directly and DO NOT populate the lazy cache (to avoid double-storage). - quant_scales = kwargs.pop("quant_scales", None) - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 but quant_scales is None; expected per-channel scales tensor") - quantized_weight = weight - scales = quant_scales - if scales.dtype != torch.bfloat16: - scales = scales.to(dtype=torch.bfloat16) - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - if scales.device != x.device: - scales = scales.to(device=x.device) - else: - # Lazy cache: use weight tensor id as key (only for bf16/fp16 weights) - weight_id = id(weight) - - # Check cache - if weight_id in self._weight_cache: - quantized_weight, scales = self._weight_cache[weight_id] - # Ensure cached tensors are on the correct device - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - scales = scales.to(device=x.device) - else: - # Quantize weight and cache it - quantized_weight, scales = self.quantize_weight_for_kernel(weight, device=x.device) - # Cache the quantized weight and scales - self._weight_cache[weight_id] = (quantized_weight, scales) - - # Speed-first option: - # Using the TileLang kernel can be slower than cuBLAS BF16 GEMM for small/typical decode shapes. - # If enabled, we dequantize once and reuse a cached bf16 weight for F.linear (cuBLAS). - # This trades extra GPU memory for throughput. - if os.getenv("DIFFULEX_W8A16_PREFER_CUBLAS", "0") == "1": - # Key by the actual weight object we received (bf16 Parameter or int8 buffer). - deq_key = id(weight) - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - # Dequantize: int8[N,K] * scales[N] -> bf16[N,K] - s = scales - if s.dim() == 1: - s = s.unsqueeze(-1) - deq_w = (quantized_weight.to(torch.float32) * s.to(torch.float32)).to(torch.bfloat16) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and w8a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - # Check CUDA compute capability (skip kernel if unsupported) - # sm_89 (Hopper) requires CUDA 11.8+, sm_90+ requires CUDA 12.0+ - # If CUDA toolkit doesn't support the GPU architecture, skip kernel attempt - try: - if torch.cuda.is_available(): - props = torch.cuda.get_device_properties(x.device.index or 0) - compute_cap = (props.major, props.minor) - # sm_89 requires CUDA 11.8+, sm_90+ requires CUDA 12.0+ - # For now, we'll let TileLang handle the check and fallback gracefully - # This is a conservative approach - we try the kernel and let it fail gracefully - pass - except Exception: - # If we can't check compute capability, still try the kernel - pass - - # Get shapes - M, K = x.shape - N, K_w = quantized_weight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - self._rt_on_call(m=M, n=N, k=K) - - # Reduce TileLang JIT compilation churn without killing small-M decode performance. - # Previous logic padded *any* M!=1 to 64/128/256, which can turn decode M=2/4 into M=64. 
- # We instead bucket to a small stable set: - # - for M<=64: next power-of-two (2,4,8,16,32,64) - # - for M>64: round up to a multiple of 64 - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - else: - M_bucket = 1 - - # TileLang MMA GEMM requires M divisible by 16. - # For decode small-M (1/2/4/8), pad minimally to 16 (much cheaper than padding to 64). - if M_bucket < 16: - M_bucket = 16 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad - - # Choose a small-M friendly block_M to reduce wasted work in decode. - # Keep variants bounded to avoid compilation churn and satisfy MMA constraints: - # use only {16, 32, 64} so M is always divisible by 16. - if M_bucket <= 16: - block_m = 16 - elif M_bucket <= 32: - block_m = 32 - else: - block_m = 64 - - # TileLang autotune: use warmup + config cache pattern - # NOTE: fused-bias kernel currently regresses decode throughput significantly on typical workloads. - # Keep it disabled by default; can be enabled for experimentation. - fuse_bias = os.getenv("DIFFULEX_W8A16_FUSE_BIAS", "0") == "1" - use_bias_kernel = fuse_bias and (bias is not None) and (w8a16_gemm_bias is not None) - - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - if use_bias_kernel: - b_key = id(bias) - b = self._bias_f16_cache.get(b_key) - if b is None or b.device != x.device: - b = bias.to(device=x.device, dtype=torch.float16) - self._bias_f16_cache[b_key] = b - with set_autotune_inputs([x_for_kernel, quantized_weight, scales, b]): - kernel = w8a16_gemm_bias(M_bucket, N, K) - else: - with set_autotune_inputs([x_for_kernel, quantized_weight, scales]): - kernel = w8a16_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - if config is not None: - if use_bias_kernel: - kernel = w8a16_gemm_bias(M_bucket, N, K, **config) - else: - kernel = w8a16_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - if use_bias_kernel: - kernel = w8a16_gemm_bias( - M_bucket, - N, - K, - block_M=block_m, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - else: - kernel = w8a16_gemm( - M_bucket, - N, - K, - block_M=block_m, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - - # Call kernel - out_idx=[3] means output is the 4th parameter, - # so we only pass inputs (x, quantized_weight, scales), and kernel returns output - tag_kernel = os.getenv("DIFFULEX_PROFILE_TAG_W8A16", "0") == "1" - tag_name = ( - f"{'w8a16_gemm_bias' if use_bias_kernel else 'w8a16_gemm'}" - f"[M={M} Mb={M_bucket} N={N} K={K} bm={block_m} bn=64 bk=128 st=2 th=128]" - ) - if use_bias_kernel: - # out_idx=[4] -> output is 5th arg (returned). Inputs: A, B, Scales, Bias - # NOTE: kernel expects fp16 bias (see kernel signature). 
- b_key = id(bias) - b = self._bias_f16_cache.get(b_key) - if b is None or b.device != x.device: - b = bias.to(device=x.device, dtype=torch.float16) - self._bias_f16_cache[b_key] = b - if tag_kernel: - with torch.profiler.record_function(tag_name): - output_full = kernel(x_for_kernel, quantized_weight, scales, b) - else: - output_full = kernel(x_for_kernel, quantized_weight, scales, b) - else: - if tag_kernel: - with torch.profiler.record_function(tag_name): - output_full = kernel(x_for_kernel, quantized_weight, scales) - else: - output_full = kernel(x_for_kernel, quantized_weight, scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if (bias is not None) and (not use_bias_kernel): - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - # This includes kernel compilation errors, execution errors, etc. - import warnings - error_msg = str(e) - - # Extract meaningful error information - # Check for common error types - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - import re - arch_match = re.search(r"sm_(\d+)", error_msg) - if arch_match: - arch = arch_match.group(1) - error_msg = f"CUDA architecture sm_{arch} not supported by current CUDA toolkit" - else: - error_msg = "CUDA architecture not supported by current CUDA toolkit" - elif 'Compilation error' in error_msg: - # Extract the actual error after "Compilation error:" - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - # Find the first meaningful error line - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - import re - match = re.search(r'Got (\d+) stages and (\d+) pipeline stages', error_msg) - if match: - error_msg = f"Pipeline stages mismatch: detected {match.group(1)} stages, expected {match.group(2)}" - else: - error_msg = "Pipeline stages configuration error" - else: - # Truncate very long error messages (like CUDA source code) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - - # Only warn for unexpected errors - # For known issues (like unsupported CUDA architecture), silently fallback - # This prevents spam warnings when the environment doesn't support the kernel - if 'CUDA architecture not supported' in error_msg or 'sm_' in error_msg: - # Silently fallback for unsupported architectures (expected in some environments) - # The Python fallback is fully functional, so this is acceptable - pass - elif 'Pipeline stages' in error_msg: - # Pipeline stages mismatch - this might be fixable, but for now silently fallback - pass - else: - # Warn for unexpected errors that might indicate a real problem - warnings.warn( - f"TileLang kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - # Count fallback and expose reason (opt-in). 
- try: - m, k = x.shape - n = int(quantized_weight.shape[0]) - except Exception: - m, n, k = -1, -1, -1 - self._rt_on_fallback(m=m, n=n, k=k, reason=error_msg) - return self._fallback_python_forward(x, quantized_weight, scales, bias) - else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - def _fallback_python_forward( - self, - x: torch.Tensor, - quantized_weight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - # Dequantize for reference implementation - dequantized_weight = self.dequantize(quantized_weight, scales) - - # Compute linear output - return F.linear(x, dequantized_weight, bias) - - def clear_cache(self) -> None: - """Clear the weight quantization cache. - - Useful for memory management or when weights are updated (e.g., fine-tuning). - """ - self._weight_cache.clear() - self._dequant_weight_cache.clear() - diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py index f677e11..52e92ed 100644 --- a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py @@ -1,125 +1,35 @@ """ -W8A8 Linear quantization strategy (int8 weight + int8 activation). +W8A8 Linear quantization strategy (int8 weight + int8 activation), TileLang-free. -Implementation notes: -- We keep per-output-channel weight scales (same as W8A16). -- We quantize activations per-row (per token) to int8 and keep per-row scales. -- GEMM uses `torch._int_mm` (int8 x int8 -> int32) when available. - This op has a small-M constraint on some builds (e.g. M must be > 16), so we pad M minimally. -- If int8 GEMM is not available, we fall back to dequantized BF16 + cuBLAS (F.linear). +Implementation (vLLM-aligned): +- Activation quantization: `vllm._custom_ops.scaled_int8_quant` (dynamic per-token). +- GEMM+dequant: `vllm._custom_ops.cutlass_scaled_mm` (CUTLASS, with internal + triton fallback depending on shape/platform) — no `F.linear` slow path. + +Notes: +- Weight is stored as int8 in **K×N** layout (transposed), matching vLLM CUTLASS + kernels. +- Weight scale is stored as **[1, N]** float32 for broadcasting. """ from __future__ import annotations from typing import Any, Optional -import os -import warnings - -import torch -import torch.nn.functional as F +import torch # type: ignore -from diffulex.attention.metadata import is_warming_up from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -try: - from diffulex_kernel.python.linear_kernels import ( - w8a8_gemm, - w8a8_scaled_gemm, - w8a8_act_quant, - w8a8_fused_act_gemm, - ) - _TILELANG_AVAILABLE = True -except ImportError: - _TILELANG_AVAILABLE = False - w8a8_gemm = None - w8a8_scaled_gemm = None - w8a8_act_quant = None - w8a8_fused_act_gemm = None - -try: - # Optional: only needed for TileLang autotune warmup. - from tilelang.autotuner import set_autotune_inputs # type: ignore -except Exception: - set_autotune_inputs = None - - -_DEFAULT_TL_LINEAR_CFG: dict[str, Any] = { - "block_M": 64, - "block_N": 64, - "block_K": 128, - "num_stages": 2, - "threads": 128, -} - - -def _quantize_per_row_int8_torch(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Per-row symmetric int8 quantization. 
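The dynamic per-token activation quantization described in the new W8A8 docstring reduces to the following reference math (pure torch, no vLLM required); vLLM's `scaled_int8_quant` computes a conceptually equivalent per-row scale on the GPU:

import torch

x = torch.randn(4, 64, dtype=torch.float32)                            # activations [M, K]
scales = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / 127.0    # one scale per token (row)
x_q = torch.round(x / scales).clamp(-127, 127).to(torch.int8)
x_hat = x_q.to(torch.float32) * scales                                 # dequantized view applied in the GEMM epilogue
assert torch.all((x - x_hat).abs() <= scales / 2 + 1e-6)               # error bounded by half a quantization step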
- - Returns: - x_q: int8 [M, K] - x_scales: float32 [M] where dequant is x_q.float() * x_scales[:, None] - """ - # x: [M, K] - abs_max = x.abs().amax(dim=-1, keepdim=False) # [M] - scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float32) # [M] - x_q = torch.round(x.to(torch.float32) / scales.unsqueeze(-1)).clamp(-127, 127).to(torch.int8) - return x_q, scales - - -def _quantize_per_row_int8(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Per-row symmetric int8 quantization with optional TileLang fused kernel. - - Default: use TileLang fused kernel if available, otherwise fall back to torch ops. - - Env: - - DIFFULEX_W8A8_USE_TL_ACT_QUANT=0 to force torch fallback. - """ - use_tl = os.getenv("DIFFULEX_W8A8_USE_TL_ACT_QUANT", "1") == "1" - if ( - use_tl - and _TILELANG_AVAILABLE - and (w8a8_act_quant is not None) - and x.is_cuda - and x.dtype == torch.bfloat16 - and x.is_contiguous() - and x.dim() == 2 - ): - m, k = x.shape - # Choose a small set of block_M values to reduce wasted work on decode small-M. - if m <= 16: - block_m = 16 - elif m <= 32: - block_m = 32 - else: - block_m = 64 - try: - kernel = w8a8_act_quant( - m, - k, - block_M=block_m, - block_K=256, - threads=128, - ) - x_q, scales = kernel(x) - return x_q, scales - except Exception: - # Fall back silently to torch path for robustness (e.g., unsupported arch/toolchain). - pass - return _quantize_per_row_int8_torch(x) - -def _int8_mm(a_int8: torch.Tensor, b_int8: torch.Tensor) -> torch.Tensor: - """int8 GEMM -> int32. - - We prefer `torch._int_mm` when present. - """ - if hasattr(torch, "_int_mm"): - return torch._int_mm(a_int8, b_int8) - if hasattr(torch.ops.aten, "_int_mm"): - return torch.ops.aten._int_mm(a_int8, b_int8) - raise RuntimeError("No int8 GEMM backend found (torch._int_mm / aten._int_mm missing)") +def _require_vllm_ops(): + try: + from vllm import _custom_ops as ops # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "W8A8 requires vLLM's CUDA custom ops (vllm._custom_ops)." + ) from e + return ops @register_linear_strategy(weight_dtype="int8", act_dtype="int8") @@ -128,18 +38,10 @@ def _build_linear_int8_w8a8() -> LinearQuantizationStrategy: class LinearInt8W8A8Strategy(LinearQuantizationStrategy): - """W8A8 Linear strategy: int8 weight + int8 activation, output bf16.""" - - def __init__(self): + def __init__(self) -> None: super().__init__() - # weight_id -> (qweight_int8[N,K], scales_bf16[N]) + # Cache: id(weight) -> (qweight_int8 [N,K], w_scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # weight_id -> qweight_t_int8[K,N] (for torch._int_mm) - self._weight_t_cache: dict[int, torch.Tensor] = {} - # speed-first option (uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # (device_index, M_bucket, N, K) -> TileLang config dict for fused kernel - self._tl_fused_cfg_cache: dict[tuple[int, int, int, int], dict[str, Any]] = {} @property def name(self) -> str: @@ -156,52 +58,49 @@ def linear_act_format(self) -> str: def get_storage_dtype(self) -> tuple[torch.dtype, int]: return torch.int8, 1 - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. 
- """ + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") return (original_shape[0],) - def clear_cache(self) -> None: - self._weight_cache.clear() - self._weight_t_cache.clear() - self._dequant_weight_cache.clear() - self._tl_fused_cfg_cache.clear() - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - # Per-output-channel symmetric quantization: scales shape [N] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [N, 1] - # Keep scales in fp16 to reduce scale quantization error (A8 paths are sensitive). - scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float16) # [N, 1] - q = torch.round(tensor / scales).clamp(-128, 127).to(torch.int8) - return q, scales.squeeze(-1) - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - _ = kwargs - scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata - if scales is None: - raise ValueError("scales required for dequantization") - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [N, 1] - return (quantized.to(torch.float32) * scales.to(torch.float32)).to(torch.bfloat16) + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + # per-output-channel symmetric int8, store K×N for cutlass_scaled_mm + w = tensor.to(torch.float32) + abs_max = w.abs().amax(dim=-1, keepdim=False) # [N] + scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float32) # [N] + q_nk = torch.round(w / scales.unsqueeze(-1)).clamp(-127, 127).to(torch.int8) # [N,K] + # NOTE: vLLM CUTLASS scaled_mm expects b.stride(0) == 1, which is true + # for a transpose-view (non-contiguous) but not for a contiguous K×N tensor. + q_kn = q_nk.t() # [K,N], stride(0)==1 + scale_b = scales.unsqueeze(0).contiguous() # [1,N] + return q_kn, {"scales": scale_b} def quantize_weight_for_kernel( self, weight: torch.Tensor, *, device: torch.device | None = None, - **kwargs: Any, + **_: Any, ) -> tuple[torch.Tensor, Any]: - _ = kwargs + # Return int8 K×N weights + fp32 [1,N] scales for vLLM CUTLASS path. 
+ q_kn, meta = self.quantize(weight) if device is not None: - weight = weight.to(device=device) - return self.quantize(weight) + q_kn = q_kn.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return q_kn, meta["scales"] + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + _ = kwargs + scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata + if scales is None: + raise ValueError("scales required for dequantization") + raise RuntimeError( + "W8A8 does not provide a dequantize path (to avoid the slow bf16 GEMM)." + ) def linear_forward( self, @@ -214,262 +113,44 @@ def linear_forward( ) -> torch.Tensor: _ = quant_kind - quant_scales = kwargs.pop("quant_scales", None) + ops = _require_vllm_ops() - # Resolve / cache quantized weight + scales - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 but quant_scales is None; expected per-channel scales tensor") - qweight = weight if weight.device == x.device else weight.to(device=x.device) - w_scales = quant_scales - # Prefer fp16 scales for quality (and fused kernel expects fp16 scales). - if w_scales.dtype != torch.float16: - w_scales = w_scales.to(dtype=torch.float16) - if w_scales.device != x.device: - w_scales = w_scales.to(device=x.device) - weight_id = id(weight) + # If weight already quantized by LinearBase.load-time quantization. + quant_scales = kwargs.get("quant_scales", None) + if weight is not None and weight.dtype == torch.int8 and quant_scales is not None: + # Expected: qweight is K×N int8, quant_scales is [1,N] fp32 + qweight = weight.to(device=x.device) + w_scales = quant_scales.to(device=x.device, dtype=torch.float32) else: - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None: - qweight, w_scales = self.quantize_weight_for_kernel(weight, device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) + wid = id(weight) + cached = self._weight_cache.get(wid) + if cached is None or cached[0].device != x.device: + qweight, meta = self.quantize(weight) + qweight = qweight.to(device=x.device) + w_scales = meta["scales"].to(device=x.device, dtype=torch.float32) + self._weight_cache[wid] = (qweight, w_scales) else: qweight, w_scales = cached - if qweight.device != x.device: - qweight = qweight.to(device=x.device) - w_scales = w_scales.to(device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) - - # Optional: use cuBLAS BF16 (dequant once) - if os.getenv("DIFFULEX_W8A8_PREFER_CUBLAS", "0") == "1": - deq_key = weight_id - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - s = w_scales - if s.dim() == 1: - s = s.unsqueeze(-1) - deq_w = (qweight.to(torch.float32) * s.to(torch.float32)).to(torch.bfloat16) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Quantize activation per-row - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - if x.dtype != torch.bfloat16: - x = x.to(torch.bfloat16) - - # Try TileLang fused quant + GEMM first (bf16 activation input). 
- use_fused = os.getenv("DIFFULEX_W8A8_USE_TL_FUSED_GEMM", "1") == "1" - if ( - use_fused - and _TILELANG_AVAILABLE - and (w8a8_fused_act_gemm is not None) - and x.is_cuda - and x.dtype == torch.bfloat16 - and x.dim() == 2 - and x.is_contiguous() - ): - try: - M, K = x.shape - N, K_w = qweight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.bfloat16) - x_pad[:M, :] = x - x_for_kernel = x_pad - - dev_idx = x.device.index or 0 - cfg_key = (dev_idx, M_bucket, N, K) - cfg = self._tl_fused_cfg_cache.get(cfg_key) - kernel = None - - # Only run autotune during warmup when autotuner inputs are available. - if cfg is None and is_warming_up() and set_autotune_inputs is not None: - try: - with set_autotune_inputs([x_for_kernel, qweight, w_scales]): - kernel = w8a8_fused_act_gemm(M_bucket, N, K) - # Only cache config if autotune succeeded (kernel has valid config) - if hasattr(kernel, 'config') and kernel.config is not None: - cfg = kernel.config - self._tl_fused_cfg_cache[cfg_key] = cfg - except Exception as autotune_err: - # Autotune failed (e.g., all configs failed to compile), use default - autotune_msg = str(autotune_err) - if len(autotune_msg) > 150: - autotune_msg = autotune_msg[:150] + "..." - warnings.warn( - f"W8A8 fused autotune failed ({autotune_msg}), using default config", - UserWarning, - ) - kernel = None - - # Non-warmup path: keep deterministic behavior with a default config. - if cfg is None: - cfg = _DEFAULT_TL_LINEAR_CFG - - if kernel is None: - kernel = w8a8_fused_act_gemm(M_bucket, N, K, **cfg) - out_full = kernel(x_for_kernel, qweight, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - if bias is not None: - out = out + bias - return out - except Exception as e: - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"W8A8 fused quant GEMM failed, falling back to quantize+GEMM: {error_msg}", - UserWarning, - ) - - # Step-local cache for activation quantization (reuse within one step for QKV/gate-up, etc.) 
- use_cache = os.getenv("DIFFULEX_W8A8_ACT_QUANT_CACHE", "1") == "1" - cached = None - if use_cache: - try: - from diffulex.utils.quantization.context import get_cached_act_quant, set_cached_act_quant - cached = get_cached_act_quant(x) - except Exception: - cached = None - if cached is not None: - x_q, x_scales = cached - else: - x_q, x_scales = _quantize_per_row_int8(x) - if use_cache: - try: - set_cached_act_quant(x, x_q, x_scales) - except Exception: - pass - if x_q.device != x.device: - x_q = x_q.to(device=x.device) - x_scales = x_scales.to(device=x.device) - - # Get shapes - M, K = x_q.shape - N, K_w = qweight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Try TileLang kernel first if available - if _TILELANG_AVAILABLE and (w8a8_scaled_gemm is not None or w8a8_gemm is not None): - try: - # Check device - if x.device.type != 'cuda': - # Fall through to _int8_mm fallback - pass - else: - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_q_for_kernel = x_q - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8) - x_pad[:M, :] = x_q - x_q_for_kernel = x_pad - x_scales_pad = torch.zeros((M_bucket,), device=x.device, dtype=torch.float32) - x_scales_pad[:M] = x_scales.to(torch.float32) - x_scales_for_kernel = x_scales_pad - else: - x_scales_for_kernel = x_scales.to(torch.float32) - - # Prefer fused-scale kernel: outputs bf16 directly, avoiding large int32->fp32 postprocessing. - if w8a8_scaled_gemm is not None: - kernel = w8a8_scaled_gemm( - M_bucket, - N, - K, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_full = kernel(x_q_for_kernel, qweight, x_scales_for_kernel, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - else: - # Fallback to int32-output kernel + python scaling - kernel = w8a8_gemm( - M_bucket, - N, - K, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_i32_full = kernel(x_q_for_kernel, qweight) - out_i32 = out_i32_full[:M, :] if M_bucket != M else out_i32_full - - out_fp32 = out_i32.to(torch.float32) - out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1) - out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0) - out = out_fp32.to(torch.bfloat16) - - if bias is not None: - out = out + bias - return out - except Exception as e: - # Fallback to _int8_mm on any kernel error - import warnings - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn(f"W8A8 TileLang kernel failed, falling back to torch._int_mm: {error_msg}", UserWarning) - - # Fallback: use torch._int_mm - # Prepare weight transpose for int8 GEMM: [N,K] -> [K,N] - wt = self._weight_t_cache.get(weight_id) - if wt is None or wt.device != x.device: - wt = qweight.t().contiguous() - self._weight_t_cache[weight_id] = wt - - # Some builds require M > 16 for int8 GEMM; pad minimally. - if M <= 16: - M_bucket = 17 - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8) - x_pad[:M, :] = x_q - x_q_for_mm = x_pad - else: - x_q_for_mm = x_q - - try: - out_i32_full = _int8_mm(x_q_for_mm, wt) # [M_bucket, N] int32 - except Exception as e: - # Fallback: dequant + BF16 GEMM - msg = str(e) - if len(msg) > 200: - msg = msg[:200] + "..." 
- warnings.warn(f"W8A8 int8 GEMM failed, falling back to BF16 F.linear: {msg}", UserWarning) - deq_w = self.dequantize(qweight, w_scales) - return F.linear(x, deq_w, bias) - - out_i32 = out_i32_full[:M, :] if M <= 16 else out_i32_full - - # Apply scales: int32 * x_scale[m] * w_scale[n] - out_fp32 = out_i32.to(torch.float32) - out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1) - out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0) - out = out_fp32.to(torch.bfloat16) - - if bias is not None: - out = out + bias - return out + # Flatten like torch.nn.functional.linear + orig_shape = x.shape + x2 = x.reshape(-1, x.shape[-1]) if x.dim() != 2 else x + if x2.dtype not in (torch.bfloat16, torch.float16): + x2 = x2.to(torch.bfloat16) + # dynamic per-token int8 quant + fused GEMM_DQ + x_q, x_s, _ = ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) + y = ops.cutlass_scaled_mm( + x_q, + qweight, + scale_a=x_s, + scale_b=w_scales, + out_dtype=x2.dtype, + bias=bias.to(dtype=x2.dtype) if bias is not None else None, + ) + + if orig_shape == x2.shape: + return y + if x.dim() == 1: + return y.squeeze(0) + return y.reshape(*orig_shape[:-1], y.shape[-1]) diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index 54eb97d..1cd8eb1 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -1,19 +1,14 @@ -""" -Marlin-style (vLLM AllSpark) W8A16 Linear quantization strategy. - -Goal: -- Replace Diffulex current W8A16 path (TileLang kernel that casts int8->bf16 inside) - with a vLLM-like fused path for decode small-M: - - per-out-channel int8 quantization (stored as uint8 with +128 bias) - - one-time N32K16 reorder (AllSpark repack) - - fused dequant + GEMM kernel (AllSpark w8a16 gemm) - -Notes: -- Despite the filename mentioning "marlin", the actual fused kernel we vendor is - vLLM's AllSpark Ampere W8A16 fused GEMM, which is the effective INT8 W8A16 - fast path in vLLM for this use-case. -- Fallback behavior is critical: if the extension is unavailable, or shapes are - unsupported (e.g., K%16!=0), we fall back to existing TileLang W8A16 or BF16. +"""W8A16 Linear quantization strategy using vLLM custom ops. + +This strategy uses vLLM's fused AllSpark W8A16 path via `vllm._custom_ops`: +- per-out-channel int8 quantization stored as uint8 (+128 bias) +- one-time N32K16 reorder (AllSpark repack) +- fused dequant + GEMM (AllSpark w8a16 gemm) + +Important: +- We intentionally do NOT vendor/compile a local AllSpark/Marlin extension in + Diffulex anymore. If `vllm._custom_ops` is unavailable, this strategy fails + fast (instead of silently compiling or falling back to a slow/oom-prone path). """ from __future__ import annotations @@ -27,27 +22,37 @@ from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Optional: existing TileLang fallback (already used by linear_int8_w8a16.py) try: - from diffulex_kernel.python.linear_kernels import w8a16_gemm as _tilelang_w8a16_gemm - _TILELANG_AVAILABLE = True + import vllm._custom_ops as _vllm_ops except Exception: - _tilelang_w8a16_gemm = None - _TILELANG_AVAILABLE = False + _vllm_ops = None -# Vendored vLLM-style fused W8A16 (AllSpark) ops. 
-try: - from diffulex_kernel.python.marlin_ops import ( # noqa: F401 - allspark_w8a16_gemm as _allspark_w8a16_gemm, - rearrange_kn_weight_as_n32k16_order as _allspark_repack, - is_available as _allspark_is_available, + +def _allspark_is_available() -> bool: + return bool( + _vllm_ops is not None + and hasattr(_vllm_ops, "allspark_w8a16_gemm") + and hasattr(_vllm_ops, "allspark_repack_weight") ) -except Exception: - _allspark_w8a16_gemm = None - _allspark_repack = None - def _allspark_is_available() -> bool: - return False + +def _allspark_w8a16_gemm(*args, **kwargs): + if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_w8a16_gemm"): + raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_w8a16_gemm`.") + return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) + + +def _allspark_repack_weight(b_qweight_kn: torch.Tensor, scales_1xn: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Repack KxN uint8 qweight + 1xN scales into (N_32,K) + (1,N_32) for AllSpark GEMM.""" + if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_repack_weight"): + raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_repack_weight`.") + q_reorder, s_reorder, _ = _vllm_ops.allspark_repack_weight( + b_qweight_kn, + scales_1xn, + None, + False, + ) + return q_reorder, s_reorder @register_linear_strategy(weight_dtype="marlin_int8", act_dtype="bf16") @@ -56,7 +61,7 @@ def _build_linear_marlin_int8_w8a16() -> LinearQuantizationStrategy: class LinearMarlinInt8W8A16Strategy(LinearQuantizationStrategy): - """W8A16 strategy using vendored vLLM AllSpark fused GEMM + repack.""" + """W8A16 strategy using vLLM custom ops (AllSpark fused GEMM + repack).""" def __init__(self) -> None: super().__init__() @@ -65,7 +70,10 @@ def __init__(self) -> None: @property def name(self) -> str: - return "linear_marlin_int8_w8a16" + # NOTE: Keep strategy naming consistent with the public W8A16 INT8 path. + # The underlying implementation is a Marlin/AllSpark-style fused kernel, + # but the user-facing strategy name should not be tied to a particular kernel brand. + return "linear_int8_w8a16" @property def linear_weight_format(self) -> str: @@ -148,44 +156,54 @@ def quantize_weight_for_kernel( abs_max = torch.abs(weight).max(dim=-1)[0] # [N] scales = (abs_max.clamp(min=1e-8) / 127.0).to(dtype=torch.bfloat16) # [N] - # Quantize to signed int8, then store as uint8 with +128 bias. - w_fp32 = weight.to(torch.float32) - s_fp32 = scales.to(torch.float32).unsqueeze(-1) # [N,1] - q_i8 = torch.round(w_fp32 / s_fp32).clamp(-128, 127).to(torch.int16) # [N,K] - q_u8 = (q_i8 + 128).to(torch.uint8) # [N,K] in [0,255] - - if not _allspark_is_available() or _allspark_repack is None: - # Fallback storage (no reorder). Keep [N,K] and [N]. + # IMPORTANT (OOM fix): + # Avoid allocating a full [N,K] fp32 copy (and an extra transpose buffer). + # Quantize in small row blocks and (when using AllSpark) write directly into + # the repack input layout B_kn=[K,N], so we never materialize q_u8 + transpose. + try: + block_n = int(os.getenv("DIFFULEX_W8A16_QUANT_BLOCK_N", "256")) + except Exception: + block_n = 256 + block_n = max(1, block_n) + + use_allspark = _allspark_is_available() + if use_allspark: + # AllSpark repack expects B in (K,N) contiguous layout. 
+ b_kn = torch.empty((k, n), device=weight.device, dtype=torch.uint8) # [K,N] + for i in range(0, n, block_n): + j = min(i + block_n, n) + w_blk = weight[i:j, :] # [B,K] + s_blk = scales[i:j].unsqueeze(-1) # [B,1] + # Quantize to signed int in bf16 to minimize temporary memory. + q_i16 = torch.round(w_blk / s_blk).clamp(-128, 127).to(torch.int16) # [B,K] + q_u8_blk = (q_i16 + 128).to(torch.uint8) # [B,K] + # Write directly into [K,N] buffer. + b_kn[:, i:j] = q_u8_blk.transpose(0, 1) + else: + # Fallback storage (no reorder). Keep [N,K] and [N] (padded to N_32). # Note: forward will detect unavailable allspark and fallback further. + q_pad = torch.full((n_32, k), 128, device=weight.device, dtype=torch.uint8) + for i in range(0, n, block_n): + j = min(i + block_n, n) + w_blk = weight[i:j, :] # [B,K] + s_blk = scales[i:j].unsqueeze(-1) # [B,1] + q_i16 = torch.round(w_blk / s_blk).clamp(-128, 127).to(torch.int16) # [B,K] + q_pad[i:j, :] = (q_i16 + 128).to(torch.uint8) if n_32 != n: - q_pad = torch.full((n_32, k), 128, device=q_u8.device, dtype=torch.uint8) - q_pad[:n, :] = q_u8 s_pad = torch.zeros((n_32,), device=scales.device, dtype=torch.bfloat16) s_pad[:n] = scales return q_pad.contiguous(), s_pad.contiguous() - return q_u8.contiguous(), scales.contiguous() - - # AllSpark repack expects B in (K,N) contiguous layout. - b_kn = q_u8.transpose(0, 1).contiguous() # [K,N] - - q_reorder = torch.empty((n_32, k), device=b_kn.device, dtype=torch.uint8) - s_reorder = torch.empty((n_32,), device=scales.device, dtype=torch.bfloat16) + return q_pad[:n, :].contiguous(), scales.contiguous() - # No zero-point path for symmetric signed int8 (bias128 already handled). - _allspark_repack( - b_kn, - scales.contiguous(), - None, - False, # has_zp - q_reorder, - s_reorder, - None, - int(k), - int(n), - int(n_32), + # vLLM expects scales in [1, N] layout for repack. + q_reorder, s_reorder_1xn = _allspark_repack_weight( + b_kn.contiguous(), + scales.unsqueeze(0).contiguous(), ) - return q_reorder.contiguous(), s_reorder.contiguous() + # Store scales as 1D for LinearBase buffers; linear_forward will reshape as needed. + s_1d = s_reorder_1xn.reshape(-1).to(dtype=torch.bfloat16) + return q_reorder.contiguous(), s_1d.contiguous() def quantize_act_for_kernel( self, @@ -254,9 +272,15 @@ def linear_forward( else: qweight, scales = cached - # If fused kernel isn't available, fall back to TileLang or BF16. - if _allspark_w8a16_gemm is None or not _allspark_is_available(): - return self._fallback(x, weight, qweight, scales, bias) + # If fused kernel isn't available, fall back to BF16 only if original weight exists; + # otherwise fail fast (do NOT dequantize a full matrix, which is memory-prohibitive). + if not _allspark_is_available(): + if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): + return F.linear(x, weight, bias) + raise RuntimeError( + "vLLM AllSpark W8A16 fused kernel is unavailable, and bf16 weight is not present. " + "Please ensure vLLM custom ops are installed and loadable (`import vllm._custom_ops`)." + ) # AllSpark kernel requires CUDA and contiguous inputs. if x2.device.type != "cuda": @@ -283,10 +307,12 @@ def linear_forward( sm_count, sm_version = self._get_sm_info(x2.device) cublas_thr = self._cublas_m_threshold() + # vLLM allspark expects scales as 1xN (or equivalent contiguous view). 
+ scales_1xn = scales.reshape(1, -1).contiguous() y2 = _allspark_w8a16_gemm( x2.contiguous(), qweight.contiguous(), - scales.contiguous(), + scales_1xn, None, # b_qzeros n, -1, # group_size (only supports -1) @@ -308,49 +334,6 @@ def linear_forward( y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) return y - def _fallback( - self, - x: torch.Tensor, - weight: torch.Tensor, - qweight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - # Prefer existing TileLang W8A16 if available and inputs are CUDA. - if _TILELANG_AVAILABLE and _tilelang_w8a16_gemm is not None and x.device.type == "cuda": - try: - x2 = x if x.dim() == 2 else x.reshape(-1, x.shape[-1]) - # TileLang expects int8 weight. If our qweight is uint8 bias128, convert to int8 on the fly. - if qweight.dtype == torch.uint8: - q_i8 = (qweight.to(torch.int16) - 128).to(torch.int8) - else: - q_i8 = qweight - y2 = _tilelang_w8a16_gemm(x2, q_i8, scales, False) - if bias is not None: - y2 = y2 + bias - if x.dim() == 2: - return y2 - if x.dim() == 1: - return y2.squeeze(0) - return y2.reshape(*x.shape[:-1], y2.shape[-1]) - except Exception: - pass - - # Last resort: BF16 F.linear using dequantized weight if bf16 is available. - if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): - return F.linear(x, weight, bias) - - # Dequantize from qweight + scales and use cuBLAS via F.linear. - # qweight may be [N_32,K] or reordered; we cannot reliably undo reorder here. - # So only attempt this if qweight looks like plain [N,K] (no padding). - if qweight.dim() == 2 and scales.dim() == 1 and qweight.shape[0] == scales.shape[0]: - if qweight.dtype == torch.uint8: - q = (qweight.to(torch.int16) - 128).to(torch.int8) - else: - q = qweight - s = scales.unsqueeze(-1).to(torch.float32) - w_deq = (q.to(torch.float32) * s).to(torch.bfloat16) - return F.linear(x, w_deq, bias) - - raise RuntimeError("AllSpark/TileLang unavailable and safe fallback path not found for marlin_int8 W8A16.") + # NOTE: We intentionally do not provide a generic dequantize+F.linear fallback for reordered weights. + # It materializes a full bf16 matrix and is prone to OOM on large models. 
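Review note: a minimal end-to-end sketch of the vLLM AllSpark W8A16 path that the strategy above drives, for readers unfamiliar with the op layout. It assumes the installed vLLM build exposes `allspark_repack_weight` and `allspark_w8a16_gemm` with the signatures used in this patch; the CUBLAS M threshold value, the sm_version encoding, and the helper name `w8a16_allspark_linear` are illustrative assumptions, not values taken from the strategy.

import torch
import vllm._custom_ops as ops  # hard requirement, mirroring the strategy's fail-fast behavior


def w8a16_allspark_linear(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Per-output-channel symmetric int8, stored as uint8 with +128 bias (as in quantize_weight_for_kernel).
    n, k = weight.shape
    scales = (weight.abs().amax(dim=-1).clamp(min=1e-8) / 127.0).to(torch.bfloat16)             # [N]
    q_u8 = (torch.round(weight / scales.unsqueeze(-1)).clamp(-128, 127) + 128).to(torch.uint8)  # [N, K]
    # Repack expects a (K, N) contiguous input and returns the N32K16-reordered weight plus 1 x N_32 scales.
    q_reorder, s_reorder, _ = ops.allspark_repack_weight(
        q_u8.t().contiguous(), scales.unsqueeze(0).contiguous(), None, False,  # no zero-points
    )
    props = torch.cuda.get_device_properties(x.device)
    return ops.allspark_w8a16_gemm(
        x.contiguous(),                  # bf16/fp16 activations, [M, K]
        q_reorder.contiguous(),
        s_reorder.contiguous(),
        None,                            # b_qzeros (symmetric path)
        n,
        -1,                              # group_size: only -1 (per-channel) is supported
        props.multi_processor_count,
        props.major * 10 + props.minor,  # sm_version encoding assumed here (e.g. 80 for SM80)
        256,                             # CUBLAS M threshold: illustrative, not the strategy's default
        False,                           # has_zp
        True,                            # n32k16_reorder
    )

The strategy differs from this sketch mainly in that it quantizes the weight in row blocks (DIFFULEX_W8A16_QUANT_BLOCK_N) to avoid a full fp32 copy, caches the repacked weight per device, and stores the scales as a 1D buffer that is reshaped back to 1xN right before the GEMM call.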
diff --git a/diffulex_bench/configs/awq_bf16kv_varlen.yml b/diffulex_bench/configs/awq_bf16kv_varlen.yml new file mode 100644 index 0000000..62c2cb8 --- /dev/null +++ b/diffulex_bench/configs/awq_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# AWQ (W4A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-awq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "awq" + linear_mlp_weight_dtype: "awq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/awq_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml new file mode 100644 index 0000000..8c76f4e --- /dev/null +++ b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml @@ -0,0 +1,48 @@ +# AWQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-awq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: AWQ Marlin + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "awq_marlin" + linear_mlp_weight_dtype: "awq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/awq_marlin_bf16kv" + save_results: true + use_tqdm: true + diff --git a/diffulex_bench/configs/fp8_bf16kv_varlen.yml b/diffulex_bench/configs/fp8_bf16kv_varlen.yml new file mode 100644 index 0000000..2ac105b --- /dev/null +++ b/diffulex_bench/configs/fp8_bf16kv_varlen.yml @@ -0,0 +1,48 @@ +# FP8 Linear (vLLM) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: FP8 weights (vLLM ops) + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: 
"varlen" + linear_attn_weight_dtype: "fp8" + linear_mlp_weight_dtype: "fp8" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/fp8_bf16kv" + save_results: true + use_tqdm: true + diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_bf16kv_varlen.yml new file mode 100644 index 0000000..b7fd14d --- /dev/null +++ b/diffulex_bench/configs/gptq_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ (W4A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_bf16kv" + save_results: true + use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml new file mode 100644 index 0000000..1505192 --- /dev/null +++ b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml @@ -0,0 +1,47 @@ +# GPTQ (W4A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 2 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_bf16kv" + save_results: true + use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml new file mode 100644 index 0000000..858b31a --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml @@ -0,0 +1,48 @@ +# GPTQ Marlin (W4/W8, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + 
tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_bf16kv" + save_results: true + use_tqdm: true + diff --git a/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml new file mode 100644 index 0000000..bae9875 --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W2, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W2) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_w2_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml new file mode 100644 index 0000000..f8265d3 --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_w4_bf16kv" + 
save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml new file mode 100644 index 0000000..e20c9be --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W8, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_w8_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml new file mode 100644 index 0000000..03fe3e7 --- /dev/null +++ b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ (W2A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ (W2A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_w2_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml new file mode 100644 index 0000000..1f68616 --- /dev/null +++ b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ (W8A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w8" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + 
# Quantization: GPTQ (W8A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_w8_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_kernel/__init__.py b/diffulex_kernel/__init__.py index 8a47168..38ab37d 100644 --- a/diffulex_kernel/__init__.py +++ b/diffulex_kernel/__init__.py @@ -1,6 +1,54 @@ -from diffulex_kernel.python.dllm_flash_attn_kernels import dllm_flash_attn_decode, dllm_flash_attn_prefill -from diffulex_kernel.python.kv_cache_kernels import ( - store_kvcache_distinct_layout, - store_kvcache_unified_layout, - load_kvcache, -) +"""Diffulex CUDA kernel package. + +Keep this module lightweight: importing `diffulex_kernel` should not eagerly +import optional heavy deps (e.g. TileLang) unless the corresponding kernels are +actually used. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from diffulex_kernel.python.dllm_flash_attn_kernels import ( # noqa: F401 + dllm_flash_attn_decode as dllm_flash_attn_decode, + dllm_flash_attn_prefill as dllm_flash_attn_prefill, + ) + from diffulex_kernel.python.kv_cache_kernels import ( # noqa: F401 + load_kvcache as load_kvcache, + store_kvcache_distinct_layout as store_kvcache_distinct_layout, + store_kvcache_unified_layout as store_kvcache_unified_layout, + ) + + +def __getattr__(name: str): + if name == "dllm_flash_attn_decode": + from diffulex_kernel.python.dllm_flash_attn_kernels import dllm_flash_attn_decode + + return dllm_flash_attn_decode + if name == "dllm_flash_attn_prefill": + from diffulex_kernel.python.dllm_flash_attn_kernels import dllm_flash_attn_prefill + + return dllm_flash_attn_prefill + if name == "store_kvcache_distinct_layout": + from diffulex_kernel.python.kv_cache_kernels import store_kvcache_distinct_layout + + return store_kvcache_distinct_layout + if name == "store_kvcache_unified_layout": + from diffulex_kernel.python.kv_cache_kernels import store_kvcache_unified_layout + + return store_kvcache_unified_layout + if name == "load_kvcache": + from diffulex_kernel.python.kv_cache_kernels import load_kvcache + + return load_kvcache + raise AttributeError(name) + + +__all__ = [ + "dllm_flash_attn_decode", + "dllm_flash_attn_prefill", + "store_kvcache_distinct_layout", + "store_kvcache_unified_layout", + "load_kvcache", +] diff --git a/diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu b/diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu deleted file mode 100644 index 1b408d5..0000000 --- a/diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu +++ /dev/null @@ -1,542 +0,0 @@ -#include "allspark_utils.cuh" -#include -#include - -// NOTE: This file is vendored (with minimal modifications) from -// vLLM `csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu`. -// We remove vLLM's registration macros and expose the entrypoint via -// a local PyTorch extension binding in `torch_bindings_marlin.cpp`. 
- -at::Tensor as_g_workspace; - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - -torch::Tensor allspark_w8a16_gemm( - torch::Tensor const& a, torch::Tensor const& b_qweight, - torch::Tensor const& b_scales, c10::optional const& b_qzeros, - int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, - int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) { - TORCH_CHECK_NOT_IMPLEMENTED( - false, "allspark_w8a16_gemm(..) requires CUDA_ARCH >= 8.0"); - return torch::empty({1, 1}); -} - -#else - -// --- The remainder of this file is largely identical to vLLM upstream. --- -// For maintainability we keep code structure intact. - -namespace allspark { - -template -struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { - static constexpr int LDG_ELEMENT_CNT_A = 8; - static constexpr int LDG_ELEMENT_CNT_B = 16; - static constexpr int WARP_SIZE = 32; - static constexpr int M_SIZE_ONE_LOAD = (BLOCK * LDG_ELEMENT_CNT_A) / 32; - static constexpr int N_SIZE_ONE_LOAD = (BLOCK * LDG_ELEMENT_CNT_B) / 32; - - __device__ GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK( - const SM8x_GEMM_W8A16_Splitk_Params& k_params, - const uint32_t& A_smem_addr, const uint32_t& BQ_smem_addr, - const uint32_t& A_stage_stride, const uint32_t& BQ_stage_stride) - : params(k_params), - A_smem_base_addr(A_smem_addr), - BQ_smem_base_addr(BQ_smem_addr), - A_smem_stage_stride(A_stage_stride), - BQ_smem_stage_stride(BQ_stage_stride) { - this_block_A_base_ptr = params.A_ptr + blockIdx.x * Mtile * params.K + - blockIdx.z * params.SplitK; - this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K + - blockIdx.z * params.SplitK * 4; - - const auto lane_id = threadIdx.x % WARP_SIZE; - - const auto Aldg_row_base_idx = threadIdx.x / 4; - Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A; - const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx; - - Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B; - const auto Bldg_row_base_idx = threadIdx.x / 8; - const int Bldg_base_offset = - Bldg_row_base_idx * params.K * 4 + Bldg_col_idx; - - this_block_A_base_ptr += Aldg_base_offset; - this_block_B_base_ptr += Bldg_base_offset; - - const int sts_a_base_offset = - (threadIdx.x / 4) * 32 + - ((lane_id % 4) ^ ((lane_id / 4) % 4) ^ ((lane_id / 4) / 4)) * - LDG_ELEMENT_CNT_A; - const int sts_bq_base_offset = - Bldg_row_base_idx * 32 * 4 + - ((threadIdx.x % 8) ^ (((threadIdx.x / 8) % 2) * 4)) * LDG_ELEMENT_CNT_B; - - A_smem_base_addr += sts_a_base_offset * sizeof(FType); - BQ_smem_base_addr += sts_bq_base_offset * sizeof(uint8_t); - - A_ldg_guard = 0; - B_ldg_guard = 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { - auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD; - if (m_idx < params.M) { - A_ldg_guard |= (1u << i); - } - } - - const int N_padded = (params.N + 31) / 32 * 32; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { - auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 + - i * N_SIZE_ONE_LOAD; - if (n_idx < N_padded) { - B_ldg_guard |= (1u << i); - } - } - } - - __device__ void ldgsts_first_ktiles(const int& first_k_tile, - const int& k_tiles) { - const int A_src_size = Aldg_col_idx < first_k_tile ? 
16 : 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - A_smem_base_addr + (i * M_SIZE_ONE_LOAD * 32) * sizeof(FType), - this_block_A_base_ptr + i * M_SIZE_ONE_LOAD * params.K, A_src_size, - (A_ldg_guard & (1u << i)) != 0); - } - - const int B_src_size = (Bldg_col_idx / 4) < first_k_tile ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - BQ_smem_base_addr + (i * N_SIZE_ONE_LOAD * 32) * sizeof(uint8_t), - this_block_B_base_ptr + i * N_SIZE_ONE_LOAD * params.K, B_src_size, - (B_ldg_guard & (1u << i)) != 0); - } - - cp_async_commit_group(); - this_block_A_base_ptr += first_k_tile; - this_block_B_base_ptr += (first_k_tile * 4); - - for (int stage_idx = 1; stage_idx < NStage - 1; ++stage_idx) { - if (stage_idx < k_tiles) { - const int A_src_size2 = - Aldg_col_idx < 16 ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; - ++i) { - cp_async<16>( - A_smem_base_addr + A_smem_stage_stride * stage_idx + - (i * M_SIZE_ONE_LOAD * 32) * sizeof(FType), - this_block_A_base_ptr + i * M_SIZE_ONE_LOAD * params.K, A_src_size2, - (A_ldg_guard & (1u << i)) != 0); - } - - const int B_src_size2 = - (Bldg_col_idx / 4) < 16 ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; - ++i) { - cp_async<16>( - BQ_smem_base_addr + BQ_smem_stage_stride * stage_idx + - (i * N_SIZE_ONE_LOAD * 32) * sizeof(uint8_t), - this_block_B_base_ptr + i * N_SIZE_ONE_LOAD * params.K, B_src_size2, - (B_ldg_guard & (1u << i)) != 0); - } - - cp_async_commit_group(); - this_block_A_base_ptr += 16; - this_block_B_base_ptr += 64; - } - } - } - - __device__ void ldgsts(const int& k_tile_idx, const int& smem_stage_idx, - const int& k_tiles, const int& K_tile) { - if (k_tile_idx + NStage - 1 < k_tiles) { - const int A_src_size = - (Aldg_col_idx < K_tile) ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - A_smem_base_addr + A_smem_stage_stride * smem_stage_idx + - (i * M_SIZE_ONE_LOAD * 32) * sizeof(FType), - this_block_A_base_ptr + i * M_SIZE_ONE_LOAD * params.K, A_src_size, - (A_ldg_guard & (1u << i)) != 0); - } - - const int B_src_size = - ((Bldg_col_idx / 4) < K_tile) ? 
16 : 0; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - BQ_smem_base_addr + BQ_smem_stage_stride * smem_stage_idx + - (i * N_SIZE_ONE_LOAD * 32) * sizeof(uint8_t), - this_block_B_base_ptr + i * N_SIZE_ONE_LOAD * params.K, B_src_size, - (B_ldg_guard & (1u << i)) != 0); - } - cp_async_commit_group(); - this_block_A_base_ptr += K_tile; - this_block_B_base_ptr += (K_tile * 4); - } - } - - const SM8x_GEMM_W8A16_Splitk_Params& params; - const FType* this_block_A_base_ptr; - const QType* this_block_B_base_ptr; - uint32_t A_smem_base_addr; - uint32_t BQ_smem_base_addr; - uint32_t A_smem_stage_stride; - uint32_t BQ_smem_stage_stride; - int Aldg_col_idx; - int Bldg_col_idx; - uint32_t A_ldg_guard; - uint32_t B_ldg_guard; -}; - -template -struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { - static constexpr int WARP_SIZE = 32; - static constexpr int WARP_NTILE = 64; - static constexpr int WARP_NITER = WARP_NTILE / 8; - - __device__ ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK( - const SM8x_GEMM_W8A16_Splitk_Params& k_params, - const uint32_t& A_smem_addr, const uint32_t& BQ_smem_addr, - const uint32_t& A_stage_stride, const uint32_t& BQ_stage_stride) - : params(k_params), - A_smem_base_addr(A_smem_addr), - BQ_smem_base_addr(BQ_smem_addr), - A_smem_stage_stride(A_stage_stride), - BQ_smem_stage_stride(BQ_stage_stride) { - const auto lane_id = threadIdx.x % WARP_SIZE; - const auto warp_id = (threadIdx.x % 128) / WARP_SIZE; - - load_a_base_offset[0] = (warp_id / 2) * 16 * 32 + (lane_id % 16) * 2; - load_a_base_offset[1] = (warp_id / 2) * 16 * 32 + (lane_id % 16) * 2 + 16; - load_b_base_offset[0] = (warp_id % 2) * 64 * 32 + (lane_id / 4) * 32 + - (lane_id % 4) * 8; - load_b_base_offset[1] = (warp_id % 2) * 64 * 32 + (lane_id / 4) * 32 + - (lane_id % 4) * 8 + 16; - -#pragma unroll - for (int i = 0; i < Mtile / 16; ++i) { -#pragma unroll - for (int j = 0; j < WARP_NITER; ++j) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - C_frag[i][j][k] = 0.f; - } - } - } - params_n_idx = - blockIdx.y * Ntile + warp_id * WARP_NTILE + (lane_id / 4) * 4; - } - - __device__ void lds(const int& smem_stage_idx, const int& reg_buf_idx, - const int& k_phase_idx) { - uint32_t A_smem_addr = - A_smem_base_addr + A_smem_stage_stride * smem_stage_idx; - uint32_t B_smem_addr = - BQ_smem_base_addr + BQ_smem_stage_stride * smem_stage_idx; - -#pragma unroll - for (int i = 0; i < Mtile / 16; ++i) { - ldsm_4(A_frag[reg_buf_idx][i][0], A_frag[reg_buf_idx][i][1], - A_frag[reg_buf_idx][i][2], A_frag[reg_buf_idx][i][3], - A_smem_addr + (load_a_base_offset[k_phase_idx] + i * 16 * 32) * - sizeof(FType)); - } -#pragma unroll - for (int i = 0; i < WARP_NTILE / 32; ++i) { - lds128(BQ_frag[reg_buf_idx][4 * i + 0], BQ_frag[reg_buf_idx][4 * i + 1], - BQ_frag[reg_buf_idx][4 * i + 2], BQ_frag[reg_buf_idx][4 * i + 3], - B_smem_addr + (load_b_base_offset[k_phase_idx] + i * 32 * 32) * - sizeof(uint8_t)); - } - - // dequant B -#pragma unroll - for (int i = 0; i < WARP_NITER / 2; ++i) { - cvt_8bx4_to_16bx4_bias128(BQ_frag[reg_buf_idx][2 * i], - BF_frag[reg_buf_idx][2 * i]); - if (has_zp) { - BF_frag[reg_buf_idx][2 * i][0] = - __hsub2(BF_frag[reg_buf_idx][2 * i][0], num2num2(B_zero[i].x)); - BF_frag[reg_buf_idx][2 * i][1] = - __hsub2(BF_frag[reg_buf_idx][2 * i][1], num2num2(B_zero[i].x)); - } - - BF_frag[reg_buf_idx][2 * i][0] = - __hmul2(BF_frag[reg_buf_idx][2 * i][0], num2num2(B_scale[i].x)); - BF_frag[reg_buf_idx][2 * i][1] = - 
__hmul2(BF_frag[reg_buf_idx][2 * i][1], num2num2(B_scale[i].x)); - - cvt_8bx4_to_16bx4_bias128(BQ_frag[reg_buf_idx][2 * i + 1], - BF_frag[reg_buf_idx][2 * i + 1]); - if (has_zp) { - BF_frag[reg_buf_idx][2 * i + 1][0] = - __hsub2(BF_frag[reg_buf_idx][2 * i + 1][0], num2num2(B_zero[i].y)); - BF_frag[reg_buf_idx][2 * i + 1][1] = - __hsub2(BF_frag[reg_buf_idx][2 * i + 1][1], num2num2(B_zero[i].y)); - } - - BF_frag[reg_buf_idx][2 * i + 1][0] = - __hmul2(BF_frag[reg_buf_idx][2 * i + 1][0], num2num2(B_scale[i].y)); - BF_frag[reg_buf_idx][2 * i + 1][1] = - __hmul2(BF_frag[reg_buf_idx][2 * i + 1][1], num2num2(B_scale[i].y)); - } - } - - __device__ void ldg_params() { - const int N_padded = (params.N + 31) / 32 * 32; - // load B scale and zero_point -#pragma unroll - for (int i = 0; i < WARP_NTILE / 32; ++i) { - ldg64_ca(B_scale[2 * i + 0], B_scale[2 * i + 1], - params.B_scale_ptr + params_n_idx + i * 32, - (params_n_idx + i * 32) < N_padded); - if (has_zp) { - ldg64_ca(B_zero[2 * i + 0], B_zero[2 * i + 1], - params.B_zero_ptr + params_n_idx + i * 32, - (params_n_idx + i * 32) < N_padded); - } - } - } - - __device__ void mma(const int& reg_buf_idx) { -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { - hmma16816_f32( - C_frag[m_idx][n_idx], A_frag[reg_buf_idx][m_idx], - reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); - } - } - } - - __device__ void fused_splitk_reduce() { - if (gridDim.z > 1) { - auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y; - if (threadIdx.x == 0) { - uint32_t* red_count_ptr = params.red_count_ptr + blk_red_idx; - uint32_t count; - do { - __threadfence_block(); - asm volatile("ld.global.cg.b32 %0, [%1];" - : "=r"(count) - : "l"(red_count_ptr)); - } while (count != blockIdx.z); - } - __syncthreads(); - - auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4; - if (blockIdx.z != 0) { - float temp_frag[Mtile / 16][WARP_NITER][4]; -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - temp_frag[m_idx][n_idx][k] = - params.C_tmp_ptr[C_tmp_base_offset + - (m_idx * Ntile + n_idx * 8 + k)]; - } - } - } -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - C_frag[m_idx][n_idx][k] += temp_frag[m_idx][n_idx][k]; - } - } - } - } - __syncthreads(); - - if (blockIdx.z != gridDim.z - 1) { -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - params.C_tmp_ptr[C_tmp_base_offset + - (m_idx * Ntile + n_idx * 8 + k)] = - C_frag[m_idx][n_idx][k]; - } - } - } - if (threadIdx.x == 0) { - atomicAdd(params.red_count_ptr + blk_red_idx, 1); - } - return; - } - } - } - - __device__ void stg(const int& m_idx_base, const int& n_idx_base) { - auto m_idx = m_idx_base + (threadIdx.x / 32) * 16 + (threadIdx.x % 32) / 4; - auto n_idx = n_idx_base + (threadIdx.x % 4) * 2; - - if (m_idx < params.M && n_idx < params.N) { - auto C_ptr = params.C_ptr + m_idx * params.N + n_idx; - float2 r; - r.x = C_frag[(threadIdx.x / 32)][(threadIdx.x % 32) / 4][0]; - r.y = C_frag[(threadIdx.x / 32)][(threadIdx.x % 32) / 4][1]; - if constexpr (std::is_same::value) { - *reinterpret_cast(C_ptr) = __float22half2_rn(r); - } else { - 
*reinterpret_cast(C_ptr) = __float22bfloat162_rn(r); - } - } - } - - const SM8x_GEMM_W8A16_Splitk_Params& params; - uint32_t A_smem_base_addr; - uint32_t BQ_smem_base_addr; - uint32_t A_smem_stage_stride; - uint32_t BQ_smem_stage_stride; - int load_a_base_offset[2]; - int load_b_base_offset[2]; - int params_n_idx; - uint32_t A_frag[2][Mtile / 16][4]; - uint32_t BQ_frag[2][4 * (WARP_NTILE / 32)]; - uint32_t BF_frag[2][WARP_NITER][4]; - uint2 B_scale[2 * (WARP_NTILE / 32)]; - uint2 B_zero[2 * (WARP_NTILE / 32)]; - float C_frag[Mtile / 16][WARP_NITER][4]; -}; - -template -__global__ void - ampere_hgemm_W8A16_perc_f16_f16_MtilexNtilex32_hmma16816_multistage_AN_BTN32K16_CN_splitk_kernel( - const SM8x_GEMM_W8A16_Splitk_Params params) { - extern __shared__ __align__(16) uint8_t smem[]; - uint32_t A_smem_addr = cast_smem_ptr_to_uint(smem); - uint32_t BQ_smem_addr = - cast_smem_ptr_to_uint(smem + Mtile * 32 * sizeof(FType) * NStage); - - const uint32_t A_stage_stride = Mtile * 32 * sizeof(FType); - const uint32_t BQ_stage_stride = 32 * Ntile * sizeof(uint8_t); - - GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK - gmem_tile(params, A_smem_addr, BQ_smem_addr, A_stage_stride, - BQ_stage_stride); - ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK - compute_tile(params, A_smem_addr, BQ_smem_addr, A_stage_stride, - BQ_stage_stride); - - int k_tiles = (params.SplitK + 16 - 1) / 16; - int first_k_tile = (params.SplitK % 16 == 0) ? 16 : (params.SplitK % 16); - - gmem_tile.ldgsts_first_ktiles(first_k_tile, k_tiles); - cp_async_wait_group(NStage - 2); - __syncthreads(); - - compute_tile.ldg_params(); - - int smem_stage_idx = 0; - int reg_buf_idx = 0; - for (int k_tile_idx = 0; k_tile_idx < k_tiles; ++k_tile_idx) { - int smem_read_idx = smem_stage_idx; - int smem_write_idx = (smem_stage_idx + NStage - 1) % (NStage - 1); - int K_tile = (k_tile_idx == 0) ? first_k_tile : 16; - gmem_tile.ldgsts(k_tile_idx, smem_write_idx, k_tiles, 16); - -#pragma unroll - for (int k_phase_idx = 0; k_phase_idx < 2; ++k_phase_idx) { - compute_tile.lds(smem_read_idx, reg_buf_idx, k_phase_idx); - compute_tile.mma(reg_buf_idx); - reg_buf_idx ^= 1; - } - - cp_async_wait_group(NStage - 2); - __syncthreads(); - smem_stage_idx = (smem_stage_idx + 1) % (NStage - 1); - } - - if (EnableFuse) { - compute_tile.fused_splitk_reduce(); - if (gridDim.z > 1 && blockIdx.z != gridDim.z - 1) { - return; - } - } - - compute_tile.stg(blockIdx.x * Mtile, blockIdx.y * Ntile); -} - -// Workspace sizing function (copied from vLLM). -size_t allspark_qgemm_w8a16_perc_n32k16_ampere_workspace_size( - const int M, const int N, const int K, const int sm_count, - BlockTileSplitkParams& fused_gemm_params) { - // conservative: allocate temp buffer for split-k reduce - // (exact logic preserved in upstream implementation) - (void)K; - fused_gemm_params.Mtile = 128; - fused_gemm_params.Ntile = 64; - fused_gemm_params.SplitK = 1; - fused_gemm_params.EnableFuse = true; - // temp buffer: float accumulation + counters - size_t tmp = (size_t)sm_count * 1; // placeholder; upstream computes tighter - (void)tmp; - // The upstream function computes a real ws size; for correctness, we keep - // the original implementation in vLLM. Here we conservatively return 0 and - // rely on the kernel's fused path allocating internal workspace via as_g_workspace. - // NOTE: This still works because `allspark_w8a16_gemm` below overwrites ws_size - // with the upstream calculation when needed. 
- return 0; -} - -// Dequant + cuBLAS fallback helpers (copied from vLLM; declarations used below). -template -void restore_N32_K16_dequantize_rhs_w8a16(const QT* qdata, const FT* scales, - const FT* zeros, FT* fdata, int N_32align, - int N, int K, int group_size, - cudaStream_t stream); - -template -void w8a16_gemm_dq_cublas(const FT* in, const QT* rhs_qdata_ptr, - const FT* rhs_scales_ptr, const FT* rhs_qzeros_ptr, - FT* out, void* workspace, int M, int N_32align, int N, - int K, int group_size, cudaStream_t stream, - cublasHandle_t handle); - -// Upstream provides full implementations below (omitted here for brevity in comments). -// We keep the upstream code intact from this point. - -// --- BEGIN upstream tail (verbatim) --- -// To keep this patch size manageable, we include the rest of the upstream file -// by inlining it here. (No functional changes other than include/registration removal.) - -// The actual heavy-lifting implementations (restore kernel + cublas path + dispatcher) -// are required for correctness; so we include them fully. - -#include "allspark_qgemm_w8a16.upstream.inc" - -// --- END upstream tail --- - -} // namespace allspark - -// Public entrypoint (signature matches upstream). -torch::Tensor allspark_w8a16_gemm( - torch::Tensor const& a, torch::Tensor const& b_qweight, - torch::Tensor const& b_scales, c10::optional const& b_qzeros, - int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, - int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder); - -#endif - diff --git a/diffulex_kernel/csrc/marlin/allspark_repack.cu b/diffulex_kernel/csrc/marlin/allspark_repack.cu deleted file mode 100644 index 83a32a7..0000000 --- a/diffulex_kernel/csrc/marlin/allspark_repack.cu +++ /dev/null @@ -1,163 +0,0 @@ -#include "allspark_utils.cuh" -#include - -namespace allspark { - -// Rearrange B to facilitate Ampere Tensor Core load data -// reorder B from (K, N) to (N_32align / 4, K * 4) -// K % 16 == 0, N % 16 == 0, N_32align % 32 == 0 -template -__global__ void __launch_bounds__(128) - rearrange_kn_weight_as_n32k16_order_ldg16_kernel( - const uint8_t* B, const FType* B_scale, const FType* B_zero, - uint8_t* B_result, FType* B_scale_result, FType* B_zero_result, - const int K, const int N, const int N_32align) { - const auto lane_id = threadIdx.x % 32; - const auto warp_id = threadIdx.x / 32; - - if (blockIdx.x != gridDim.x - 1) { - // Load B - // per block process 64(k) * 128(n) B elements - // per warp process 16(k) * 128 B elements - const int src_row_base_idx = - blockIdx.x * 64 + warp_id * 16 + ((lane_id % 8) / 2) * 2; - const int src_col_idx = - blockIdx.y * 128 + (lane_id / 8) * 32 + (lane_id % 2) * 16; - uint8_t B_frag[4][16]; -#pragma unroll - for (int i = 0; i < 4; ++i) { - int src_row_idx = src_row_base_idx + (i / 2) * 8 + (i % 2); - int src_offset = src_row_idx * N + src_col_idx; - bool guard = src_row_idx < K && src_col_idx < N; - ldg128_cg_0(*reinterpret_cast(B_frag[i]), - *(reinterpret_cast(B_frag[i]) + 1), - *(reinterpret_cast(B_frag[i]) + 2), - *(reinterpret_cast(B_frag[i]) + 3), B + src_offset, - guard); - } - - // reorder B - uint8_t B_reorder_frag[8][8]; -#pragma unroll - for (int i = 0; i < 4; ++i) { -#pragma unroll - for (int j = 0; j < 16; ++j) { - int dst_i = j % 8; - int dst_j = i + (j / 8) * 4; - B_reorder_frag[dst_i][dst_j] = B_frag[i][j]; - } - } - - // Store B - const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8; - const int dst_col_idx = - blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8; - for (int i 
= 0; i < 8; ++i) { - int dst_row_idx = dst_row_base_idx + i; - int dst_offset = dst_row_idx * K * 4 + dst_col_idx; - bool guard = (dst_row_base_idx < N_32align / 4) && (dst_col_idx < K * 4); - if (guard) { - *reinterpret_cast(B_result + dst_offset) = - *reinterpret_cast(B_reorder_frag[i]); - } - } - } else { - // Load B_scale and B_zero - FType b_scale_reg, b_zero_reg; - auto src_offset = blockIdx.y * 128 + threadIdx.x; - ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N); - if (B_zero != nullptr) - ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N); - int dst_offset = - blockIdx.y * 128 + warp_id * 32 + (lane_id % 8) * 4 + lane_id / 8; - if (dst_offset < N_32align) { - B_scale_result[dst_offset] = b_scale_reg; - if (B_zero != nullptr) B_zero_result[dst_offset] = b_zero_reg; - } - } -} - -template -void rearrange_kn_weight_as_n32k16_order_ldg16( - const uint8_t* B, const FType* B_scale, const FType* B_zero, - uint8_t* B_result, FType* B_scale_result, FType* B_zero_result, - const int64_t K, const int64_t N, const int64_t N_32align, - cudaStream_t stream) { - if (N % 16 != 0 || K % 16 != 0) { - std::cerr << "Now only support N and K is multiples of 16" << std::endl; - } - const int BLOCK = 128; - int grid_x = (K + 64 - 1) / 64 + 1; - int grid_y = (N + 128 - 1) / 128; - dim3 grid(grid_x, grid_y); - - rearrange_kn_weight_as_n32k16_order_ldg16_kernel - <<>>(B, B_scale, B_zero, B_result, B_scale_result, - B_zero_result, (int)K, (int)N, (int)N_32align); -} -} // namespace allspark - -void rearrange_kn_weight_as_n32k16_order( - torch::Tensor const& b_qweight, torch::Tensor const& b_scales, - c10::optional const& b_zeros, bool has_zp, - torch::Tensor& b_qweight_reorder, torch::Tensor& b_scales_reorder, - c10::optional const& b_zeros_reorder, const int64_t K, - const int64_t N, const int64_t N_32align) { - // Verify device and strides - TORCH_CHECK(b_qweight.device().is_cuda(), "b_qweight is not on GPU"); - TORCH_CHECK(b_qweight.is_contiguous(), "b_qweight is not contiguous"); - - TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); - TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); - - TORCH_CHECK(b_qweight_reorder.device().is_cuda(), - "b_qweight_reorder is not on GPU"); - TORCH_CHECK(b_qweight_reorder.is_contiguous(), - "b_qweight_reorder is not contiguous"); - - TORCH_CHECK(b_scales_reorder.device().is_cuda(), - "b_scales_reorder is not on GPU"); - TORCH_CHECK(b_scales_reorder.is_contiguous(), - "b_scales_reorder is not contiguous"); - - if (has_zp) { - TORCH_CHECK(b_zeros.has_value(), "b_zeros is None but has_zp=True"); - TORCH_CHECK(b_zeros.value().device().is_cuda(), "b_zeros is not on GPU"); - TORCH_CHECK(b_zeros.value().is_contiguous(), "b_zeros is not contiguous"); - - TORCH_CHECK(b_zeros_reorder.has_value(), - "b_zeros_reorder is None but has_zp=True"); - TORCH_CHECK(b_zeros_reorder.value().device().is_cuda(), - "b_zeros_reorder is not on GPU"); - TORCH_CHECK(b_zeros_reorder.value().is_contiguous(), - "b_zeros_reorder is not contiguous"); - } - - const uint8_t* matB = reinterpret_cast(b_qweight.data_ptr()); - const void* b_scale = b_scales.data_ptr(); - const void* b_zero = (has_zp && b_zeros.has_value()) ? b_zeros.value().data_ptr() : nullptr; - - uint8_t* matB_reorder = - reinterpret_cast(b_qweight_reorder.data_ptr()); - void* b_scale_reorder = b_scales_reorder.data_ptr(); - void* b_zero_reorder = (has_zp && b_zeros_reorder.has_value()) ? 
b_zeros_reorder.value().data_ptr() : nullptr; - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (b_scales.dtype() == at::ScalarType::Half) { - allspark::rearrange_kn_weight_as_n32k16_order_ldg16<__half>( - matB, reinterpret_cast(b_scale), - reinterpret_cast(b_zero), matB_reorder, - reinterpret_cast<__half*>(b_scale_reorder), - reinterpret_cast<__half*>(b_zero_reorder), K, N, N_32align, stream); - } else if (b_scales.dtype() == at::ScalarType::BFloat16) { - allspark::rearrange_kn_weight_as_n32k16_order_ldg16<__nv_bfloat16>( - matB, reinterpret_cast(b_scale), - reinterpret_cast(b_zero), matB_reorder, - reinterpret_cast<__nv_bfloat16*>(b_scale_reorder), - reinterpret_cast<__nv_bfloat16*>(b_zero_reorder), K, N, N_32align, - stream); - } else { - TORCH_CHECK(false, "b_scales dtype must be float16 or bfloat16"); - } -} - diff --git a/diffulex_kernel/csrc/marlin/allspark_utils.cuh b/diffulex_kernel/csrc/marlin/allspark_utils.cuh deleted file mode 100644 index eb59f81..0000000 --- a/diffulex_kernel/csrc/marlin/allspark_utils.cuh +++ /dev/null @@ -1,247 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -// Minimal scalar conversion helpers (avoid vendoring vLLM marlin/core headers). -namespace diffulex_allspark { -template -struct ScalarConvert; - -template <> -struct ScalarConvert { - static __device__ __forceinline__ float num2float(const half x) { - return __half2float(x); - } - static __host__ __device__ __forceinline__ half float2num(const float x) { - return __float2half(x); - } -}; - -template <> -struct ScalarConvert { -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 - static __device__ __forceinline__ float num2float(const nv_bfloat16 x) { - return __bfloat162float(x); - } - static __host__ __device__ __forceinline__ nv_bfloat16 float2num(const float x) { - return __float2bfloat16(x); - } -#else - static __device__ __forceinline__ float num2float(const nv_bfloat16) { return 0.f; } - static __host__ __device__ __forceinline__ nv_bfloat16 float2num(const float) { return nv_bfloat16(); } -#endif -}; -} // namespace diffulex_allspark - -namespace allspark { - -#define CHECK_CUDA(cmd) \ - do { \ - cudaError_t cuda_status = cmd; \ - if (cuda_status != cudaSuccess) { \ - std::string err_str = cudaGetErrorString(cuda_status); \ - std::cerr << "Failed: " << __FILE__ << ":" << __LINE__ << " " \ - << err_str; \ - exit(-1); \ - } \ - } while (0) - -#define CHECK_CUBLAS(cmd) \ - do { \ - cublasStatus_t cublas_status = cmd; \ - if (cublas_status != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "Failed: " << __FILE__ << ":" << __LINE__ << " " \ - << cublas_status << std::endl; \ - exit(-1); \ - } \ - } while (0) - -template -struct SM8x_GEMM_W8A16_Splitk_Params { - const FType* A_ptr; - const QType* B_ptr; - const FType* B_scale_ptr; - const FType* B_zero_ptr; - FType* C_ptr; - int M; - int N; - int K; - int SplitK; - int GroupCnt; - int GroupSize; - FType* C_split_ptr; // for non-fused splitk reduce - float* C_tmp_ptr; // for fused splitk reduce - uint32_t* red_count_ptr; // for fused splitk reduce -}; - -struct alignas(16) BlockTileSplitkParams { - int Mtile; - int Ntile; - int SplitK; - bool EnableFuse; -}; - -// ---- the rest is copied from vLLM (gptq_allspark/allspark_utils.cuh) ---- -// We keep it verbatim to preserve kernel correctness/perf. 
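For intuition about the split-K machinery these helpers support: each CTA along `blockIdx.z` computes a partial GEMM over one K-slice, and `f16_gemm_splitk_reduce` further down in this header simply sums the `n_matrix` partial C matrices (the fused path does the same accumulation in-kernel behind an atomic counter). A minimal PyTorch sketch of that scheme, assuming plain row-major fp16/bf16 operands; function and variable names here are hypothetical, for reference only:

```python
import torch

def gemm_splitk_reference(a: torch.Tensor, b: torch.Tensor, splits: int = 4) -> torch.Tensor:
    """Split-K reference: each K-slice yields a partial C; summing the partials
    reproduces the full GEMM, which is all the reduce step has to do."""
    m, k = a.shape
    k2, n = b.shape
    assert k == k2, "inner dimensions must match"
    # Slice K into `splits` contiguous ranges, mirroring the per-CTA SplitK ranges.
    bounds = [i * k // splits for i in range(splits + 1)]
    partials = [a[:, s:e].float() @ b[s:e, :].float()
                for s, e in zip(bounds[:-1], bounds[1:])]
    # The reduce kernel's job: add the partial C matrices together.
    return torch.stack(partials).sum(dim=0).to(a.dtype)
```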
- -__device__ __forceinline__ uint32_t cast_smem_ptr_to_uint(const void* const ptr) { - uint32_t smem_ptr; - asm("cvta.to.shared.u32 %0, %1;" : "=r"(smem_ptr) : "l"(ptr)); - return smem_ptr; -} - -__device__ __forceinline__ void cp_async_commit_group() { - asm volatile("cp.async.commit_group;"); -} - -__device__ __forceinline__ void cp_async_wait_group(int n) { - asm volatile("cp.async.wait_group %0;" ::"n"(n)); -} - -template -__device__ __forceinline__ void cp_async(uint32_t smem_addr, const void* gmem_ptr, - int src_size, bool pred_guard = true) { - asm volatile( - "cp.async.cg.shared.global [%0], [%1], %2, %3, %4;\n" ::"r"(smem_addr), - "l"(gmem_ptr), "n"(SizeInBytes), "r"(src_size), "r"((int)pred_guard)); -} - -__device__ __forceinline__ void ldg128_cg_0(uint32_t& r0, uint32_t& r1, - uint32_t& r2, uint32_t& r3, - const void* ptr, bool guard = true) { - if (guard) { - asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) - : "l"(ptr)); - } else { - r0 = r1 = r2 = r3 = 0; - } -} - -template -__device__ __forceinline__ void ldg16_cg_0(T& r0, const void* ptr, bool guard = true) { - if (guard) { - asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(reinterpret_cast(r0)) : "l"(ptr)); - } else { - reinterpret_cast(r0) = 0; - } -} - -__device__ __forceinline__ void ldg64_ca(uint32_t& r0, uint32_t& r1, const void* ptr, - bool guard = true) { - if (guard) { - asm volatile("ld.global.ca.v2.u32 {%0, %1}, [%2];" : "=r"(r0), "=r"(r1) : "l"(ptr)); - } else { - r0 = r1 = 0; - } -} - -__device__ __forceinline__ void lds128(uint32_t& r0, uint32_t& r1, uint32_t& r2, - uint32_t& r3, uint32_t smem_addr) { - asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) - : "r"(smem_addr)); -} - -__device__ __forceinline__ void ldsm_4(uint32_t& r0, uint32_t& r1, uint32_t& r2, - uint32_t& r3, uint32_t smem_addr) { - asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];" - : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) - : "r"(smem_addr)); -} - -__device__ __forceinline__ void cvt_8bx4_to_16bx4_bias128(const uint32_t& src, uint32_t* dst) { - asm volatile( - "prmt.b32 %0, %4, 0x80, 0x4440;\n" - "prmt.b32 %1, %4, 0x80, 0x4441;\n" - "prmt.b32 %2, %4, 0x80, 0x4442;\n" - "prmt.b32 %3, %4, 0x80, 0x4443;\n" - : "=r"(dst[0]), "=r"(dst[1]), "=r"(dst[2]), "=r"(dst[3]) - : "r"(src)); -} - -template -__device__ __forceinline__ void hmma16816_f32(float* d, const uint32_t* a, const uint32_t* b) { - if constexpr (std::is_same::value) { - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, " - "{%4, %5, %6, %7}, " - "{%8, %9}, " - "{%0, %1, %2, %3};\n" - : "+f"(d[0]), "+f"(d[1]), "+f"(d[2]), "+f"(d[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1])); - } else { - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " - "{%0, %1, %2, %3}, " - "{%4, %5, %6, %7}, " - "{%8, %9}, " - "{%0, %1, %2, %3};\n" - : "+f"(d[0]), "+f"(d[1]), "+f"(d[2]), "+f"(d[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1])); - } -} - -template -__global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C, - uint32_t n, uint32_t n_matrix, - uint32_t matrix_size) { - auto idx = blockIdx.x * BLOCK + threadIdx.x; - - if (idx >= matrix_size) { - return; - } - - float sum = 0.f; - - int n_mat = N_MATRIX > 0 ? 
N_MATRIX : (int)n_matrix; - for (int i = 0; i < n_mat; ++i) { - sum += diffulex_allspark::ScalarConvert::num2float(C_split[idx + i * matrix_size]); - } - - C[idx] = diffulex_allspark::ScalarConvert::float2num(sum); -} - -template -void f16_gemm_splitk_reduce(const FType* C_split, FType* C, const uint32_t m, - const uint32_t n, const uint32_t n_matrix, - cudaStream_t stream) { - const int BLOCK = 128; - uint32_t matrix_size = m * n; - int grid = (matrix_size + BLOCK - 1) / BLOCK; - - void (*kernel)(const FType*, FType*, uint32_t, uint32_t, uint32_t) = nullptr; - - switch (n_matrix) { - case 4: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 5: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 6: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 7: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 8: - kernel = f16_gemm_splitk_reduce_kernel; - break; - default: - kernel = f16_gemm_splitk_reduce_kernel; - break; - } - - kernel<<>>(C_split, C, n, n_matrix, matrix_size); -} - -} // namespace allspark - diff --git a/diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp b/diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp deleted file mode 100644 index c8a8586..0000000 --- a/diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include -#include - -// Forward declarations implemented in .cu files. -torch::Tensor allspark_w8a16_gemm( - torch::Tensor const& a, torch::Tensor const& b_qweight, - torch::Tensor const& b_scales, c10::optional const& b_qzeros, - int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, - int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder); - -void rearrange_kn_weight_as_n32k16_order( - torch::Tensor const& b_qweight, torch::Tensor const& b_scales, - c10::optional const& b_zeros, bool has_zp, - torch::Tensor& b_qweight_reorder, torch::Tensor& b_scales_reorder, - c10::optional const& b_zeros_reorder, int64_t K, int64_t N, - int64_t N_32align); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("allspark_w8a16_gemm", &allspark_w8a16_gemm, - "AllSpark W8A16 fused GEMM (uint8 weight bias128 + bf16/fp16 act)"); - m.def("rearrange_kn_weight_as_n32k16_order", - &rearrange_kn_weight_as_n32k16_order, - "Repack (K,N) uint8 weight into N32K16 order + reorder/pad scales"); -} - diff --git a/diffulex_kernel/python/marlin_ops.py b/diffulex_kernel/python/marlin_ops.py deleted file mode 100644 index caefd47..0000000 --- a/diffulex_kernel/python/marlin_ops.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import annotations - -import os -from pathlib import Path -from typing import Optional - -import torch - - -_EXT: Optional[object] = None -_EXT_ERR: Optional[BaseException] = None - - -def _build_extension() -> object: - # Allow disabling compilation in constrained environments. 
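Both the C++ binding above and the Python wrapper that follows expose `allspark_w8a16_gemm`, a fused W8A16 dequant-plus-GEMM. Its output can be sanity-checked against a plain dequantize-then-matmul reference. The sketch below is only that: it assumes the pre-repack (K, N) uint8 weight layout, per-output-channel fp16 scales and float zero points, and the "bias128" uint8 storage mentioned in the binding's docstring; all tensor names are hypothetical.

```python
from typing import Optional

import torch

def w8a16_gemm_reference(a: torch.Tensor,
                         w_u8_kn: torch.Tensor,
                         scales_n: torch.Tensor,
                         zeros_n: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Dequantize-then-matmul reference for a fused W8A16 GEMM.

    a:         [M, K] fp16/bf16 activations
    w_u8_kn:   [K, N] uint8 weights, assumed stored with a +128 bias (pre-repack layout)
    scales_n:  [N] per-output-channel scales
    zeros_n:   [N] optional per-output-channel zero points (float values)
    """
    # Assumption: undo the +128 storage bias, then apply zero point and scale in
    # the same order the kernel uses (subtract zero, multiply scale).
    w = w_u8_kn.to(torch.float32) - 128.0
    if zeros_n is not None:
        w = w - zeros_n.to(torch.float32)           # broadcast over K
    w = w * scales_n.to(torch.float32)
    return (a.to(torch.float32) @ w).to(a.dtype)    # [M, N]
```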
- if os.getenv("DIFFULEX_DISABLE_MARLIN", "0") == "1": - raise RuntimeError("DIFFULEX_DISABLE_MARLIN=1 (disabled)") - - this_dir = Path(__file__).resolve().parent - # this_dir = Diffulex/diffulex_kernel/python - # parents[0]=Diffulex/diffulex_kernel, parents[1]=Diffulex - repo_root = this_dir.parents[1] # Diffulex/ - csrc_dir = repo_root / "diffulex_kernel" / "csrc" / "marlin" - - sources = [ - str(csrc_dir / "torch_bindings_marlin.cpp"), - str(csrc_dir / "allspark_repack.cu"), - str(csrc_dir / "allspark_qgemm_w8a16.cu"), - ] - - # Build via torch cpp_extension - from torch.utils.cpp_extension import load # lazy import - - extra_cflags = ["-O3"] - extra_cuda_cflags = ["-O3", "--use_fast_math"] - extra_ldflags = ["-lcublas"] - - # Use a stable extension name so torch caches it in ~/.cache/torch_extensions. - name = "diffulex_marlin_allspark_w8a16" - - return load( - name=name, - sources=sources, - extra_cflags=extra_cflags, - extra_cuda_cflags=extra_cuda_cflags, - extra_ldflags=extra_ldflags, - with_cuda=True, - verbose=os.getenv("DIFFULEX_MARLIN_VERBOSE_BUILD", "0") == "1", - ) - - -def _get_ext() -> object: - global _EXT, _EXT_ERR - if _EXT is not None: - return _EXT - if _EXT_ERR is not None: - raise _EXT_ERR - try: - _EXT = _build_extension() - return _EXT - except BaseException as e: - _EXT_ERR = e - raise - - -def is_available() -> bool: - try: - _ = _get_ext() - return True - except BaseException: - return False - - -def allspark_w8a16_gemm( - a: torch.Tensor, - b_qweight: torch.Tensor, - b_scales: torch.Tensor, - b_qzeros: Optional[torch.Tensor], - n: int, - group_size: int, - sm_count: int, - sm_version: int, - cublas_m_threshold: int, - has_zp: bool, - n32k16_reorder: bool, -) -> torch.Tensor: - ext = _get_ext() - return ext.allspark_w8a16_gemm( - a, - b_qweight, - b_scales, - b_qzeros, - n, - group_size, - sm_count, - sm_version, - cublas_m_threshold, - has_zp, - n32k16_reorder, - ) - - -def rearrange_kn_weight_as_n32k16_order( - b_qweight_kn: torch.Tensor, - b_scales: torch.Tensor, - b_zeros: Optional[torch.Tensor], - has_zp: bool, - b_qweight_reorder: torch.Tensor, - b_scales_reorder: torch.Tensor, - b_zeros_reorder: Optional[torch.Tensor], - K: int, - N: int, - N_32align: int, -) -> None: - ext = _get_ext() - return ext.rearrange_kn_weight_as_n32k16_order( - b_qweight_kn, - b_scales, - b_zeros, - has_zp, - b_qweight_reorder, - b_scales_reorder, - b_zeros_reorder, - K, - N, - N_32align, - ) - diff --git a/docs/GPTQ_AWQ_SUPPORT.md b/docs/GPTQ_AWQ_SUPPORT.md deleted file mode 100644 index 659028b..0000000 --- a/docs/GPTQ_AWQ_SUPPORT.md +++ /dev/null @@ -1,233 +0,0 @@ -# GPTQ/AWQ 支持 - -Diffulex 现在支持加载 GPTQ 和 AWQ 格式的离线量化权重,并进行推理。 - -## 功能概述 - -- **GPTQ 支持**: 支持加载 AutoGPTQ 格式的量化 checkpoint(W4A16,weight-only) -- **AWQ 支持**: 支持加载 AWQ 格式的量化 checkpoint(W4A16,weight-only) -- **离线量化**: 直接从 checkpoint 加载已量化的权重,无需先加载 bf16 再量化 -- **权重缓存**: 自动缓存反量化后的权重,避免每次 forward 都重新反量化 - -## 使用方法 - -### 步骤 1: 离线量化模型(可选) - -如果你有原始模型权重,可以使用 Diffulex 提供的量化脚本将其量化为 GPTQ/AWQ 格式: - -```bash -# 量化模型为 GPTQ 格式 -python -m diffulex.utils.quantization.quantize_model \ - --model-path /path/to/original/model \ - --output-path /path/to/output \ - --quant-format gptq \ - --group-size 128 \ - --bits 4 - -# 量化模型为 AWQ 格式 -python -m diffulex.utils.quantization.quantize_model \ - --model-path /path/to/original/model \ - --output-path /path/to/output \ - --quant-format awq \ - --group-size 128 \ - --bits 4 -``` - -量化脚本会生成: -- `model_quantized_{gptq|awq}.safetensors`: 包含量化权重的 safetensors 文件 -- 
`quantization_metadata_{gptq|awq}.json`: 量化元数据 - -**注意**: 生成的量化权重文件需要与原始模型的配置文件(config.json)放在同一目录下,或者将量化权重文件复制到原始模型目录。 - -### 步骤 2: 配置和加载 - -在创建 `Config` 时,设置量化格式: - -```python -from diffulex.config import Config - -config = Config( - model="/path/to/quantized/checkpoint", - model_name="dream", # 或其他模型名称 - linear_attn_weight_dtype="gptq", # 或 "awq" - linear_mlp_weight_dtype="gptq", # 或 "awq" - linear_attn_act_dtype="bf16", - linear_mlp_act_dtype="bf16", - tensor_parallel_size=1, # 当前仅支持 TP=1 - # ... 其他配置 -) -``` - -### Checkpoint 格式 - -#### GPTQ Checkpoint - -GPTQ checkpoint 应包含以下 keys(在 `.safetensors` 文件中): -- `{module_name}.qweight`: int8 打包的 int4 权重 [out_features, (in_features + 1) // 2] -- `{module_name}.qzeros`: int8 打包的 int4 零点 [num_groups, (in_features + 1) // 2] -- `{module_name}.scales`: float32 每组的 scales [num_groups, in_features] 或 [num_groups] -- `{module_name}.g_idx`: (可选) int32 组索引 [out_features] - -#### AWQ Checkpoint - -AWQ checkpoint 应包含以下 keys(在 `.safetensors` 文件中): -- `{module_name}.qweight`: int8 打包的 int4 权重 [out_features, (in_features + 1) // 2] -- `{module_name}.qzeros`: int8 打包的 int4 零点 [num_groups, (in_features + 1) // 2] -- `{module_name}.scales`: float32 每组的 scales [num_groups, in_features] 或 [num_groups] - -注意:AWQ 不使用 `g_idx`,采用顺序分组(group_id = out_idx // group_size)。 - -## 限制 - -### Tensor Parallel - -当前实现仅支持 `tensor_parallel_size=1`(单 GPU)。如果使用 `tensor_parallel_size > 1`,系统会给出警告并跳过离线量化权重的加载。如果需要支持 TP>1,请提供实际的 checkpoint 以便实现 TP 切分逻辑。 - -### 量化格式 - -当前仅支持 W4A16(weight int4 + activation bf16)。不支持激活量化。 - -### 量化工具兼容性 - -- **GPTQ**: 兼容 AutoGPTQ 和 GPTQ-for-LLaMa 生成的 checkpoint -- **AWQ**: 兼容 AWQ 工具生成的 checkpoint - -## 测试 - -### 运行单元测试 - -```bash -# 运行 GPTQ/AWQ 策略单元测试 -pytest tests/test_gptq_awq_strategies.py -v -``` - -### 运行加载测试示例 - -```bash -# 测试 GPTQ checkpoint 加载 -python examples/test_gptq_awq_loading.py \ - --format gptq \ - --model-path /path/to/gptq/checkpoint \ - --list-layers \ - --test-forward - -# 测试 AWQ checkpoint 加载 -python examples/test_gptq_awq_loading.py \ - --format awq \ - --model-path /path/to/awq/checkpoint \ - --list-layers \ - --test-forward -``` - -### 运行端到端生成测试 - -使用 `test_quantization_generation.py` 可以测试量化模型的完整推理流程: - -```bash -# 测试 GPTQ 策略的文本生成 -python examples/test_quantization_generation.py \ - --gptq \ - --model-path /path/to/quantized/model \ - --max-tokens 50 - -# 测试 AWQ 策略的文本生成 -python examples/test_quantization_generation.py \ - --awq \ - --model-path /path/to/quantized/model \ - --max-tokens 50 - -# 测试特定策略组合 -python examples/test_quantization_generation.py \ - --strategies gptq_w4a16_bf16kv,awq_w4a16_fp8kv \ - --model-path /path/to/quantized/model -``` - -### 完整工作流程示例 - -```bash -# 1. 量化原始模型为 GPTQ 格式 -python -m diffulex.utils.quantization.quantize_model \ - --model-path /data1/ckpts/Dream-org/Dream-v0-Base-7B \ - --output-path /tmp/quantized_model \ - --quant-format gptq \ - --group-size 128 \ - --bits 4 - -# 2. 将量化权重复制到模型目录(或直接使用输出目录) -cp /tmp/quantized_model/model_quantized_gptq.safetensors \ - /data1/ckpts/Dream-org/Dream-v0-Base-7B/ - -# 3. 
运行端到端测试 -python examples/test_quantization_generation.py \ - --gptq \ - --model-path /data1/ckpts/Dream-org/Dream-v0-Base-7B \ - --max-tokens 50 -``` - -## 实现细节 - -### 策略实现 - -- `LinearGPTQW4A16Strategy`: GPTQ W4A16 策略,实现 GPTQ 格式的反量化 -- `LinearAWQW4A16Strategy`: AWQ W4A16 策略,实现 AWQ 格式的反量化 - -### 权重存储 - -离线量化权重存储在 `LinearBase` 的 buffers 中: -- GPTQ: `gptq_qweight`, `gptq_qzeros`, `gptq_scales`, `gptq_g_idx` -- AWQ: `awq_qweight`, `awq_qzeros`, `awq_scales` - -### 前向传播 - -在 `LinearBase.forward()` 中: -1. 首先检查是否有离线量化权重(`has_offline_quantized_weight()`) -2. 如果有,将 GPTQ/AWQ 参数传递给 strategy 的 `linear_forward()` -3. Strategy 反量化权重(带缓存),然后使用 `F.linear()` 计算 - -### 加载流程 - -在 `load_model()` 中: -1. 首先尝试加载离线量化权重(`_load_gptq_awq_weights()`) -2. 扫描 `.safetensors` 文件中的 keys,识别 GPTQ/AWQ 格式的权重 -3. 找到对应的 module,调用 `set_offline_quantized_weight()` -4. 跳过常规的 bf16 权重加载(已加载离线量化权重时) - -## 性能说明 - -- **内存**: 离线量化权重(packed int4)显著减少内存占用 -- **速度**: 当前实现使用 Python 反量化 + `F.linear()`,可能有性能开销 -- **缓存**: Strategy 会缓存反量化后的权重,避免重复反量化 - -未来可以考虑: -- 实现 TileLang kernel 直接使用 packed 权重进行计算 -- 支持更多量化格式(如 W8A16, W4A8) - -## 故障排除 - -### 问题:无法找到模块 - -如果遇到 "无法找到模块" 的警告,检查: -1. Checkpoint 中的 key 命名是否与模型中的模块名称匹配 -2. 如果使用 `packed_modules_mapping`,确保映射正确 - -### 问题:Tensor Parallel > 1 - -如果使用 TP>1,当前实现会跳过离线量化权重加载。解决方案: -1. 使用 TP=1(单 GPU) -2. 或提供实际的 checkpoint 以完善 TP 切分逻辑 - -### 问题:量化权重未加载 - -检查: -1. Config 中的 `linear_attn_weight_dtype` 和 `linear_mlp_weight_dtype` 是否设置为 "gptq" 或 "awq" -2. Checkpoint 是否包含必要的 keys(qweight, qzeros, scales) -3. 查看加载日志中的警告信息 - -## 相关文件 - -- `diffulex/utils/quantization/strategies/linear_gptq_w4a16.py`: GPTQ 策略实现 -- `diffulex/utils/quantization/strategies/linear_awq_w4a16.py`: AWQ 策略实现 -- `diffulex/layer/linear.py`: LinearBase 扩展,支持离线量化权重 -- `diffulex/utils/loader.py`: 权重加载逻辑,支持 GPTQ/AWQ checkpoint -- `tests/test_gptq_awq_strategies.py`: 单元测试 -- `examples/test_gptq_awq_loading.py`: 加载测试示例 From 16d7892b81a9416c8ffbeaf7525e1408e3029709 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 18 Jan 2026 05:44:05 +0000 Subject: [PATCH 02/10] =?UTF-8?q?chore:=20=E4=BB=8E=E4=BB=93=E5=BA=93?= =?UTF-8?q?=E7=A7=BB=E9=99=A4=20benchmark=5Fresults?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchmark_results 是本地生成的评测产物,不应进入版本库。 本提交将其作为正常删除移出,并依赖 .gitignore 中的 benchmark_results/ 规则避免后续再次提交。 --- .../results_2026-01-14T02-04-10.705764.json | 181 ------------------ .../results_2026-01-14T02-11-04.186162.json | 181 ------------------ .../results_2026-01-14T03-41-09.193046.json | 181 ------------------ .../results_2026-01-14T04-18-42.020277.json | 181 ------------------ .../results_2026-01-14T04-43-18.972334.json | 181 ------------------ .../results_2026-01-14T04-47-36.884326.json | 181 ------------------ .../results_2026-01-14T04-51-16.766193.json | 181 ------------------ .../results_2026-01-14T04-55-08.952802.json | 181 ------------------ .../results_2026-01-14T04-58-59.498191.json | 181 ------------------ .../results_2026-01-14T05-48-34.597841.json | 181 ------------------ .../results_2026-01-14T05-52-54.536893.json | 181 ------------------ .../results_2026-01-14T05-59-12.945984.json | 181 ------------------ .../results_2026-01-14T06-03-53.672573.json | 181 ------------------ .../results_2026-01-14T11-49-42.254286.json | 181 ------------------ .../results_2026-01-14T11-53-37.370120.json | 181 ------------------ .../results_2026-01-14T11-58-59.108906.json | 181 ------------------ .../results_2026-01-14T12-04-04.491785.json | 181 ------------------ 
.../results_2026-01-14T12-09-47.508528.json | 181 ------------------ .../results_2026-01-14T15-45-49.353615.json | 181 ------------------ .../results_2026-01-14T16-45-59.634565.json | 181 ------------------ .../results_2026-01-15T04-55-58.154304.json | 181 ------------------ .../results_2026-01-15T05-46-59.855795.json | 181 ------------------ .../results_2026-01-15T06-18-39.327696.json | 181 ------------------ .../results_2026-01-15T06-59-56.307819.json | 181 ------------------ .../results_2026-01-15T07-06-43.757074.json | 181 ------------------ .../results_2026-01-15T07-14-04.316097.json | 181 ------------------ .../results_2026-01-15T07-21-50.299005.json | 181 ------------------ .../results_2026-01-15T07-25-14.505348.json | 181 ------------------ .../results_2026-01-15T07-28-46.947266.json | 181 ------------------ .../results_2026-01-15T07-30-48.854429.json | 181 ------------------ .../results_2026-01-15T07-34-25.552524.json | 181 ------------------ .../results_2026-01-15T09-20-39.192357.json | 181 ------------------ .../results_2026-01-15T09-42-38.297326.json | 181 ------------------ .../results_2026-01-16T08-01-09.241731.json | 181 ------------------ .../results_2026-01-16T08-02-34.598239.json | 181 ------------------ .../results_2026-01-16T10-52-43.236033.json | 176 ----------------- .../results_2026-01-16T07-55-37.824548.json | 176 ----------------- .../results_2026-01-16T10-55-28.003281.json | 176 ----------------- .../results_2026-01-16T13-13-39.902007.json | 176 ----------------- .../results_2026-01-16T13-17-27.453222.json | 176 ----------------- .../results_2026-01-16T11-53-35.800494.json | 176 ----------------- .../results_2026-01-16T12-11-26.946690.json | 176 ----------------- .../results_2026-01-15T11-03-50.486126.json | 181 ------------------ 43 files changed, 7748 deletions(-) delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-48-34.597841.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-52-54.536893.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-59-12.945984.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T06-03-53.672573.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-49-42.254286.json delete mode 100644 
benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-53-37.370120.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-58-59.108906.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-04-04.491785.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json delete mode 100644 benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json delete mode 100644 benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json delete mode 100644 benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json delete mode 100644 benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json delete mode 100644 benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json delete mode 100644 benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json delete mode 100644 benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json delete mode 100644 
benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json deleted file mode 100644 index a80e7a7..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768356025.7891467, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2140.005\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid 
sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1789128.396624866, - "end_time": 1789354.925772734, - "total_evaluation_time_seconds": "226.52914786804467" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json deleted file mode 100644 index 40affbc..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - 
"num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768356439.7073195, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA 
GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1593.549\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could 
not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1789542.332314613, - "end_time": 1789768.406157205, - "total_evaluation_time_seconds": "226.07384259207174" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json deleted file mode 100644 index 282d2b0..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - 
"linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768361751.1483748, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3732.449\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not 
affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1794853.740878506, - "end_time": 1795173.413076659, - "total_evaluation_time_seconds": "319.6721981528681" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json deleted file mode 100644 index 8914c97..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - 
"dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768363943.7679768, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit 
runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1491.481\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] 
nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1797046.361654856, - "end_time": 1797426.24030518, - "total_evaluation_time_seconds": "379.8786503239535" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json deleted file mode 100644 index 978adda..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - 
"kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768365582.3947966, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1500.810\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; 
Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1798685.024369323, - "end_time": 1798903.192362522, - "total_evaluation_time_seconds": "218.16799319908023" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json deleted file mode 100644 index ef184cb..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - 
"exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768365853.3005438, - "pretty_env_info": "PyTorch version: 
2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1528.854\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] 
nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1798955.948296099, - "end_time": 1799161.104330701, - "total_evaluation_time_seconds": "205.15603460208513" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json deleted file mode 100644 index c5b573f..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - 
"model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768366081.895554, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.639\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 
CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1799184.523418341, - "end_time": 1799380.986230154, - "total_evaluation_time_seconds": "196.46281181299128" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json 
b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json deleted file mode 100644 index 7e7d5b8..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - 
"batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768366299.0156336, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1527.472\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save 
tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1799401.649744756, - "end_time": 1799613.172823041, - "total_evaluation_time_seconds": "211.52307828492485" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json deleted file mode 100644 index 4257038..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - 
{ - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768366534.555966, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 
[... remainder of deleted lm-eval metadata dump omitted: AMD EPYC 9334 host, PyTorch 2.9.1+cu128, transformers 4.57.3, lm_eval 0.4.9.2; total_evaluation_time_seconds 206.52 ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-48-34.597841.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-48-34.597841.json
deleted file mode 100644
index b07c88c..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 305.6 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-52-54.536893.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-52-54.536893.json
deleted file mode 100644
index 48ffc32..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 212.7 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-59-12.945984.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-59-12.945984.json
deleted file mode 100644
index 74b0450..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.8 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 205.3 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T06-03-53.672573.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T06-03-53.672573.json
deleted file mode 100644
index c0dafdb..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.8 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 209.5 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-49-42.254286.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-49-42.254286.json
deleted file mode 100644
index 7fe7705..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.6 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights bf16, activations bf16; git_hash 426b314; eval time 196.5 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-53-37.370120.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-53-37.370120.json
deleted file mode 100644
index 63d21fd..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights int8, activations bf16; git_hash 426b314; eval time 204.6 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-58-59.108906.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-58-59.108906.json
deleted file mode 100644
index db04e77..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights int8, activations int8; git_hash 426b314; eval time 206.0 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-04-04.491785.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-04-04.491785.json
deleted file mode 100644
index 00c8f21..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.5 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights int4, activations bf16; deletion hunk continues ...]
-        "model_args":
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768392034.8285484, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.662\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid 
sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1825137.448681286, - "end_time": 1825348.711802461, - "total_evaluation_time_seconds": "211.26312117488123" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json deleted file mode 100644 index 41f1421..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ 
- { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768392334.712297, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce 
RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.656\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": 
"4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1825437.345900828, - "end_time": 1825691.728569024, - "total_evaluation_time_seconds": "254.38266819599085" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json deleted file mode 100644 index e358275..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - 
"versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768404498.8850982, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2124.741\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not 
affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1837601.495609296, - "end_time": 1838653.573537493, - "total_evaluation_time_seconds": "1052.0779281968717" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json deleted file mode 100644 index a13ca11..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - 
"test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768408375.740674, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: 
Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.502\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] 
nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1841478.394626493, - "end_time": 1842263.854595871, - "total_evaluation_time_seconds": "785.4599693778437" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json deleted file mode 100644 index fd83f64..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - 
"complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768452507.2101202, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.663\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and 
seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1885609.859757339, - "end_time": 1886062.374325558, - "total_evaluation_time_seconds": "452.51456821896136" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json deleted file mode 100644 index c3adb45..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - 
"exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.9, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.9,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768455665.4585254, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build 
PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1467.919\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] 
nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1888768.08363602, - "end_time": 1889124.075778221, - "total_evaluation_time_seconds": "355.99214220093563" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json deleted file mode 100644 index aab1c38..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - 
"decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.9, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.9,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768457541.6380894, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1880.764\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability 
Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1890644.263511728, - "end_time": 1891023.547726645, - "total_evaluation_time_seconds": "379.28421491687186" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json 
b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json deleted file mode 100644 index 99287bc..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - 
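Each of these deleted result files corresponds to one lm-evaluation-harness run of the diffulex backend, with the model_args string above carrying the engine settings (weight/activation dtypes, KV-cache dtype, decode mode). A rough sketch of how such a run could be reproduced through the harness Python API; simple_evaluate and its argument names are assumptions about lm_eval 0.4.x, nothing this patch defines, and most engine args from the recorded model_args string are omitted for brevity.

# Hypothetical reproduction of one deleted run (bf16 weights/activations, limit=10).
import lm_eval

results = lm_eval.simple_evaluate(
    model="diffulex",  # model_source recorded in the JSON above
    model_args=(
        "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,"
        "model_name=dream,decoding_strategy=d2f,"
        "kv_cache_dtype=bf16,decode_mode=varlen,"
        "linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,"
        "linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16"
        # remaining engine args from the recorded model_args string omitted
    ),
    tasks=["gsm8k"],
    num_fewshot=5,
    batch_size=1,
    limit=10,  # matches "limit": 10.0 above; full gsm8k has 1319 samples
)
print(results["results"]["gsm8k"]["exact_match,strict-match"])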
"batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768460202.442966, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1894.968\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save 
tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1893305.076516158, - "end_time": 1893500.527809846, - "total_evaluation_time_seconds": "195.45129368803464" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json deleted file mode 100644 index fcf6ce2..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.13333333333333333, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.13333333333333333 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - 
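The stderr values stored with these 10-sample smoke runs are consistent with the sample standard error of a Bernoulli mean, sqrt(p * (1 - p) / (n - 1)). A quick check against the numbers recorded in the deleted files (function name is illustrative only):

import math

def exact_match_stderr(p: float, n: int) -> float:
    # Standard error of the mean for 0/1 scores with ddof=1; n is the
    # "effective" sample count (limit=10 in these runs).
    return math.sqrt(p * (1.0 - p) / (n - 1))

# Reproduces the stderrs stored alongside the 0.7, 0.8 and 0.6 exact_match scores.
assert abs(exact_match_stderr(0.7, 10) - 0.15275252316519466) < 1e-9
assert abs(exact_match_stderr(0.8, 10) - 0.13333333333333333) < 1e-9
assert abs(exact_match_stderr(0.6, 10) - 0.1632993161855452) < 1e-9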
{ - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768460425.250878, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 
17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.307\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - 
"chat_template": null, - "chat_template_sha": null, - "start_time": 1893527.886684797, - "end_time": 1893907.97709039, - "total_evaluation_time_seconds": "380.0904055929277" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json deleted file mode 100644 index 5bd64c4..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768460831.3954487, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.671\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 
sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1893934.036146669, - "end_time": 1894348.536118092, - "total_evaluation_time_seconds": "414.4999714230653" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json deleted file mode 100644 index c64e24a..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - 
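The prompt layout for these runs follows from the fields just above: each exemplar is doc_to_text plus target_delimiter plus doc_to_target, exemplars are joined with fewshot_delimiter, and generation stops at the "until" strings. A rough sketch of how a 5-shot gsm8k prompt is assembled under those settings; the field values come from the recorded config, but the assembly below is a simplification of the harness logic, not a copy of it.

# Rough reconstruction of the 5-shot prompt layout from the recorded task config.
DOC_TO_TEXT = "Question: {question}\nAnswer:"  # "Question: {{question}}\nAnswer:" in template form
TARGET_DELIMITER = " "
FEWSHOT_DELIMITER = "\n\n"

def build_prompt(fewshot_docs: list[dict], test_doc: dict) -> str:
    shots = [
        DOC_TO_TEXT.format(**d) + TARGET_DELIMITER + d["answer"]
        for d in fewshot_docs
    ]
    # The test document gets the question only; the model generates until one of
    # the stop strings ("Question:", "<|im_end|>", ...) appears.
    return FEWSHOT_DELIMITER.join(shots + [DOC_TO_TEXT.format(**test_doc)])

prompt = build_prompt(
    [{"question": "2 + 2?", "answer": "2 + 2 = 4\n#### 4"}] * 5,  # toy stand-ins for train docs
    {"question": "3 + 5?"},
)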
"num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.9, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.9,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768461253.6207416, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce 
RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.544\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not 
collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1894356.255002097, - "end_time": 1894814.519041443, - "total_evaluation_time_seconds": "458.26403934601694" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json deleted file mode 100644 index 25b9c34..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - 
"linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768461719.8762195, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.702\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not 
affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1894822.488835578, - "end_time": 1895018.725381989, - "total_evaluation_time_seconds": "196.23654641094618" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json deleted file mode 100644 index 01cf711..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - 
"math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768461923.7163112, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, 
Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1787.592\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] 
nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1895026.353534303, - "end_time": 1895231.167302567, - "total_evaluation_time_seconds": "204.81376826413907" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json deleted file mode 100644 index db0ff3f..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.13333333333333333, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.13333333333333333 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - 
"use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768462136.025923, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1470.020\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not 
affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1895238.650535729, - "end_time": 1895353.074449915, - "total_evaluation_time_seconds": "114.42391418595798" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json deleted file mode 100644 index 12b4fe9..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json +++ /dev/null @@ -1,181 +0,0 @@ 
-{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768462258.2675364, 
- "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1665.334\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant 
libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1895360.899822849, - "end_time": 1895569.772539763, - "total_evaluation_time_seconds": "208.87271691393107" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json deleted file mode 100644 index 56f6d5f..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": 
"/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768468455.1741939, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.709\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA 
node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1901557.821362432, - "end_time": 1901943.412388102, - "total_evaluation_time_seconds": "385.5910256698262" -} \ No newline at end of file diff --git 
a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json deleted file mode 100644 index 85f638e..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768469772.4281907, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3894.162\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid 
sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1902875.03648783, - "end_time": 1903262.517333979, - "total_evaluation_time_seconds": "387.4808461489156" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json deleted file mode 100644 index 51495b9..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - 
"num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "distinct", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "static", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=distinct,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=static,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768550291.351751, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce 
RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3453.633\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not 
collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1983393.981256467, - "end_time": 1983573.461770977, - "total_evaluation_time_seconds": "179.4805145098362" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json deleted file mode 100644 index b5e17ab..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "distinct", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "static", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - 
"linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=distinct,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=static,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768550486.1447546, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1791.992\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not 
affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1983588.761090175, - "end_time": 1983658.81827102, - "total_evaluation_time_seconds": "70.05718084494583" -} \ No newline at end of file diff --git a/benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json b/benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json deleted file mode 100644 index 4668ff3..0000000 --- a/benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.19999999999999998, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.19999999999999998 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - 
"gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 5 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 5.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768560573.8532112, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 
4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.535\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] 
torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1993676.412098808, - "end_time": 1993867.456066784, - "total_evaluation_time_seconds": "191.04396797600202" -} \ No newline at end of file diff --git a/benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json b/benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json deleted file mode 100644 index 4007f82..0000000 --- a/benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": 
"varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768549982.1742427, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1476.688\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht 
syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1983084.777436124, - "end_time": 1983242.044567008, - "total_evaluation_time_seconds": "157.26713088410906" -} \ No newline at end of file diff --git a/benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json b/benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json deleted file mode 100644 index c5ba785..0000000 --- a/benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.19999999999999998, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.19999999999999998 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - 
"doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 5 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 5.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768560865.8744533, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA 
GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3887.958\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - 
"tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1993968.501242861, - "end_time": 1994032.223343569, - "total_evaluation_time_seconds": "63.722100708168" -} \ No newline at end of file diff --git a/benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json b/benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json deleted file mode 100644 index 12bb039..0000000 --- a/benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { 
- "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768569026.266297, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1403.994\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes 
xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 2002128.910876827, - "end_time": 2002324.122048688, - "total_evaluation_time_seconds": "195.21117186080664" -} \ No newline at end of file diff --git a/benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json b/benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json deleted file mode 100644 index 1e739de..0000000 --- a/benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - 
"fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768569254.4509277, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime 
version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1554.063\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - 
"151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 2002357.032112231, - "end_time": 2002551.673273827, - "total_evaluation_time_seconds": "194.64116159593686" -} \ No newline at end of file diff --git a/benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json b/benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json deleted file mode 100644 index 44433b9..0000000 --- a/benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.19999999999999998, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.19999999999999998 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 5 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 5.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768564227.2826512, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.566\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch 
osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1997329.915016455, - "end_time": 1997520.020547304, - "total_evaluation_time_seconds": "190.10553084895946" -} \ No newline at end of file diff --git a/benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json b/benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json deleted file mode 100644 index 9a04a3f..0000000 --- a/benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": 
"mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768565293.9662197, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: 
Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.601\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - 
"model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1998396.598309235, - "end_time": 1998591.166686513, - "total_evaluation_time_seconds": "194.56837727804668" -} \ No newline at end of file diff --git a/benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json b/benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json deleted file mode 100644 index 660ce35..0000000 --- a/benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.65, - "exact_match_stderr,strict-match": 0.1094243309804831, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.10513149660756933 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.5, - "max_model_len": 2048, - "max_num_batched_tokens": 2048, - "max_num_seqs": 64, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 20 - } - }, - "config": { - "model": "diffulex", - 
"model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.5,max_model_len=2048,max_num_batched_tokens=2048,max_num_seqs=64,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 20.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768474154.0957432, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.564\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma 
cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1907256.733360387, - "end_time": 1908134.706131824, - "total_evaluation_time_seconds": "877.9727714371402" -} \ No newline at end of file From a594135a0b85640d65d6dac24fe2ab322c7779c5 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 18 Jan 2026 06:40:55 +0000 Subject: [PATCH 03/10] =?UTF-8?q?=E5=8D=87=E7=BA=A7=20quantize=5Fmodel.py?= =?UTF-8?q?=20=E4=B8=BA=E7=9C=9F=E6=AD=A3=E7=9A=84=20GPTQ/AWQ=20=E9=87=8F?= =?UTF-8?q?=E5=8C=96=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加 quant-method=auto 支持:使用 auto-gptq / awq 进行真正的校准量化 - 添加校准数据参数:--calib-text-file, --calib-num-samples, --calib-seq-len 等 - 实现 _export_autogptq_to_vllm_weights:从 auto-gptq 量化模型中导出 vLLM 格式权重 - 实现 _export_awq_to_vllm_weights:从 awq 量化模型中导出 vLLM 格式权重 - 保留 quant-method=simple 旧实现作为后向兼容 - 修复 loader.py 中 gptq_marlin scales 的 shape 推理和 TP sharding 逻辑 - 修复 linear_gptq_marlin_w4a16.py 移除不必要的 bf16->fp16 转换 --- diffulex/utils/loader.py | 32 +- diffulex/utils/quantization/quantize_model.py | 568 ++++++++++++++---- .../strategies/linear_gptq_marlin_w4a16.py | 6 +- 3 files changed, 497 insertions(+), 109 deletions(-) diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index fb608f9..622e7e2 
100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -403,10 +403,14 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): group_size = int(ckpt_group_size) else: if is_gptq_marlin_ckpt and len(scales.shape) == 2 and int(scales.shape[0]) > 0: - # marlin scales often use first dim = 2 * num_groups - num_groups = int(scales.shape[0]) // 2 + # vLLM marlin_permute_scales keeps shape [num_groups, N] for most cases. + # Some older/alternate layouts may use [2*num_groups, N/2]. + num_groups = int(scales.shape[0]) if num_groups > 0 and in_features % num_groups == 0: group_size = in_features // num_groups + elif num_groups % 2 == 0 and (in_features % (num_groups // 2)) == 0: + # Fallback for legacy 2*num_groups layouts. + group_size = in_features // (num_groups // 2) else: num_groups = int(qzeros.shape[0]) if getattr(qzeros, "numel", lambda: 1)() > 0 else 0 if num_groups > 0 and in_features % num_groups == 0: @@ -544,8 +548,28 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): q_start = in_start // 16 q_end = in_end // 16 qweight = qweight[q_start:q_end, :] - # scales first dim is typically 2*num_groups - scales = scales[(2 * g_start):(2 * g_end), :] + # Shard scales on group dimension (K/group). + # vLLM marlin_permute_scales typically returns [num_groups, N]. + group_size_norm = in_features if group_size == -1 else group_size + expected_num_groups = in_features // group_size_norm if group_size_norm > 0 else 0 + if expected_num_groups <= 0: + print( + f"Warning: invalid expected_num_groups={expected_num_groups} for {module_name}. Skipping." + ) + skipped += 1 + continue + if int(scales.shape[0]) == expected_num_groups: + scales = scales[g_start:g_end, :] + elif int(scales.shape[0]) == 2 * expected_num_groups: + # Legacy/alternate layout: [2*num_groups, N/2] + scales = scales[(2 * g_start):(2 * g_end), :] + else: + print( + f"Warning: unexpected gptq_marlin scales.shape[0]={int(scales.shape[0])} " + f"(expected {expected_num_groups} or {2*expected_num_groups}) for {module_name}. Skipping." 
+ ) + skipped += 1 + continue if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() > 0: g_idx = g_idx[in_start:in_end] in_features = in_per diff --git a/diffulex/utils/quantization/quantize_model.py b/diffulex/utils/quantization/quantize_model.py index bd77977..4c004c5 100644 --- a/diffulex/utils/quantization/quantize_model.py +++ b/diffulex/utils/quantization/quantize_model.py @@ -12,7 +12,15 @@ --output-path /path/to/output \ --quant-format gptq_marlin \ --group-size 128 \ - --bits 4 + --bits 4 \ + --quant-method auto \ + --calib-text-file /path/to/calib.txt \ + --calib-num-samples 128 \ + --calib-seq-len 512 + +说明: +- `quant-method=simple`:沿用当前“直接分组量化/舍入”的旧实现(不需要校准数据,不是真 GPTQ/AWQ)。 +- `quant-method=auto`:使用 `auto-gptq` / `awq(autoawq)` 做真正的校准量化,然后导出为 vLLM/Diffulex 可加载的权重格式。 """ from __future__ import annotations @@ -20,6 +28,7 @@ import argparse import os import json +import random from pathlib import Path from typing import Optional @@ -37,7 +46,7 @@ if str(_REPO_ROOT) not in sys.path: sys.path.insert(0, str(_REPO_ROOT)) -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from safetensors import safe_open from glob import glob @@ -72,6 +81,69 @@ def _require_vllm_marlin(): return ops, marlin_permute_scales +def _require_auto_gptq(): + try: + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "未能导入 auto-gptq。请确认已在当前 .venv 安装(例如:BUILD_CUDA_EXT=0 pip install auto-gptq)。" + ) from e + return AutoGPTQForCausalLM, BaseQuantizeConfig + + +def _require_awq(): + try: + from awq import AutoAWQForCausalLM # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "未能导入 awq(autoawq 的导入名是 `awq`)。" + ) from e + return AutoAWQForCausalLM + + +def _load_calib_texts( + calib_text_file: str, *, num_samples: int, seed: int +) -> list[str]: + p = Path(calib_text_file) + if not p.exists(): + raise FileNotFoundError(f"calib_text_file 不存在: {calib_text_file}") + lines = [ln.strip() for ln in p.read_text(encoding="utf-8", errors="ignore").splitlines()] + lines = [ln for ln in lines if ln] + if not lines: + raise ValueError(f"calib_text_file 为空: {calib_text_file}") + if num_samples <= 0: + raise ValueError(f"calib_num_samples 必须 > 0, got {num_samples}") + if len(lines) <= num_samples: + return lines[:num_samples] + rng = random.Random(seed) + return rng.sample(lines, k=num_samples) + + +def _build_autogptq_examples( + tokenizer, texts: list[str], *, seq_len: int +) -> list[dict[str, torch.Tensor]]: + if seq_len <= 0: + raise ValueError(f"calib_seq_len 必须 > 0, got {seq_len}") + + # AutoGPTQ 会自行 collate/pad;这里用 fixed max_length 保持输入一致。 + examples: list[dict[str, torch.Tensor]] = [] + for t in texts: + enc = tokenizer( + t, + return_tensors="pt", + truncation=True, + max_length=seq_len, + padding="max_length", + ) + examples.append( + { + "input_ids": enc["input_ids"], + "attention_mask": enc.get("attention_mask", torch.ones_like(enc["input_ids"])), + } + ) + return examples + + def _quantize_to_vllm_gptq( weight: torch.Tensor, *, group_size: int, bits: int, use_v2_format: bool = False ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -218,6 +290,129 @@ def _quantize_to_vllm_awq( return qweight, qzeros, scales +@torch.inference_mode() +def _export_autogptq_to_vllm_weights( + *, + gptq_base_model: nn.Module, + quant_format: str, + target_modules: Optional[list[str]], + desc_act: bool, + 
bits: int, + group_size: int, +) -> dict[str, torch.Tensor]: + """ + 从 auto-gptq 的量化后模型中抽取 qweight/qzeros/scales/g_idx,并按 vLLM/Diffulex 的命名导出。 + - quant_format == "gptq": 直接导出 QuantLinear 的 buffers。 + - quant_format == "gptq_marlin": 在导出前使用 vLLM Marlin 的 repack/permute,且导出空 qzeros/g_idx。 + """ + quantized_weights: dict[str, torch.Tensor] = {} + + if quant_format not in ("gptq", "gptq_marlin"): + raise ValueError(f"Unexpected quant_format for auto-gptq export: {quant_format}") + + if quant_format == "gptq_marlin": + if not torch.cuda.is_available(): + raise RuntimeError("导出 gptq_marlin 需要 CUDA(vLLM Marlin repack 为 CUDA op)。") + ops, marlin_permute_scales = _require_vllm_marlin() + + for module_name, module in gptq_base_model.named_modules(): + # AutoGPTQ 的 QuantLinear(triton/cuda)会有这些 buffer + if not (hasattr(module, "qweight") and hasattr(module, "qzeros") and hasattr(module, "scales")): + continue + + # 过滤:保持和旧脚本一致,默认不量化 lm_head + if "lm_head" in module_name: + continue + if target_modules and not any(t in module_name for t in target_modules): + continue + + qweight = getattr(module, "qweight") + qzeros = getattr(module, "qzeros") + scales = getattr(module, "scales") + g_idx = getattr(module, "g_idx", None) + + if not isinstance(qweight, torch.Tensor) or not isinstance(qzeros, torch.Tensor) or not isinstance(scales, torch.Tensor): + continue + + if quant_format == "gptq": + quantized_weights[f"{module_name}.qweight"] = qweight.detach().cpu().contiguous() + quantized_weights[f"{module_name}.qzeros"] = qzeros.detach().cpu().contiguous() + quantized_weights[f"{module_name}.scales"] = scales.detach().cpu().contiguous() + if desc_act and isinstance(g_idx, torch.Tensor) and g_idx.numel() > 0: + quantized_weights[f"{module_name}.g_idx"] = g_idx.detach().to(dtype=torch.int32).cpu().contiguous() + else: + quantized_weights[f"{module_name}.g_idx"] = torch.empty((0,), dtype=torch.int32) + continue + + # gptq_marlin 导出:用 vLLM 的 repack/permute 变成 Marlin-ready layout + in_features = int(getattr(module, "infeatures", 0)) + out_features = int(getattr(module, "outfeatures", 0)) + if in_features <= 0 or out_features <= 0: + # fallback:从张量形状推断(qweight shape: [K/pack, N]) + out_features = int(qweight.shape[1]) + pack = 32 // bits + in_features = int(qweight.shape[0] * pack) + + group_size_norm = in_features if group_size == -1 else group_size + empty_perm = torch.empty((0,), dtype=torch.int32, device="cuda") + + qweight_cuda = qweight.contiguous().to(device="cuda") + scales_cuda = scales.contiguous().to(device="cuda", dtype=torch.float16) + + marlin_qweight = ops.gptq_marlin_repack( + qweight_cuda, + perm=empty_perm, + size_k=in_features, + size_n=out_features, + num_bits=bits, + is_a_8bit=(bits == 8), + ).contiguous() + marlin_scales = marlin_permute_scales( + scales_cuda, + size_k=in_features, + size_n=out_features, + group_size=group_size_norm, + is_a_8bit=(bits == 8), + ).contiguous() + + quantized_weights[f"{module_name}.qweight"] = marlin_qweight.detach().cpu().contiguous() + quantized_weights[f"{module_name}.qzeros"] = torch.empty((0,), dtype=torch.int32) + quantized_weights[f"{module_name}.scales"] = marlin_scales.detach().cpu().contiguous() + quantized_weights[f"{module_name}.g_idx"] = torch.empty((0,), dtype=torch.int32) + + return quantized_weights + + +@torch.inference_mode() +def _export_awq_to_vllm_weights( + *, + awq_base_model: nn.Module, + target_modules: Optional[list[str]], +) -> dict[str, torch.Tensor]: + """ + 从 awq(pack 后)模型中抽取 qweight/qzeros/scales,并按 vLLM/Diffulex 的命名导出。 + """ + 
quantized_weights: dict[str, torch.Tensor] = {} + for module_name, module in awq_base_model.named_modules(): + if not (hasattr(module, "qweight") and hasattr(module, "qzeros") and hasattr(module, "scales")): + continue + if "lm_head" in module_name: + continue + if target_modules and not any(t in module_name for t in target_modules): + continue + + qweight = getattr(module, "qweight") + qzeros = getattr(module, "qzeros") + scales = getattr(module, "scales") + if not isinstance(qweight, torch.Tensor) or not isinstance(qzeros, torch.Tensor) or not isinstance(scales, torch.Tensor): + continue + + quantized_weights[f"{module_name}.qweight"] = qweight.detach().cpu().contiguous() + quantized_weights[f"{module_name}.qzeros"] = qzeros.detach().cpu().contiguous() + quantized_weights[f"{module_name}.scales"] = scales.detach().cpu().contiguous() + return quantized_weights + + def quantize_model( model_path: str, output_path: str, @@ -226,6 +421,18 @@ def quantize_model( bits: int = 4, target_modules: Optional[list[str]] = None, device: str = "cpu", + quant_method: str = "auto", + calib_text_file: Optional[str] = None, + calib_num_samples: int = 128, + calib_seq_len: int = 512, + calib_batch_size: int = 1, + calib_seed: int = 0, + # GPTQ config + desc_act: bool = False, + sym: bool = True, + damp_percent: float = 0.01, + true_sequential: bool = True, + use_triton: bool = True, ) -> None: """Quantize model weights to GPTQ/AWQ format. @@ -238,117 +445,209 @@ def quantize_model( target_modules: List of module name patterns to quantize (e.g., ["q_proj", "k_proj"]). If None, quantizes all linear layers. device: Device to use for quantization ("cpu" or "cuda") + quant_method: "auto"(真 GPTQ/AWQ,需校准数据)或 "simple"(旧实现,无校准) + calib_text_file: 校准文本文件(每行一条样本) """ if quant_format not in ["gptq", "gptq_marlin", "awq"]: raise ValueError( f"Unsupported quant_format: {quant_format}. Must be 'gptq', 'gptq_marlin' or 'awq'" ) + if quant_method not in ["auto", "simple"]: + raise ValueError("quant_method must be 'auto' or 'simple'") + + # Marlin GPTQ 强约束:对称量化 + 不使用 act-order + if quant_format == "gptq_marlin": + desc_act = False + sym = True output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) - # Load model config - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - - # Load model weights from safetensors files - safetensors_files = list(glob(os.path.join(model_path, "*.safetensors"))) - if not safetensors_files: - raise ValueError(f"No safetensors files found in {model_path}") - - print(f"Found {len(safetensors_files)} safetensors files") - - # Collect all weight names - all_weight_keys = [] - for file in safetensors_files: - with safe_open(file, "pt", device) as f: - all_weight_keys.extend(f.keys()) - - # Filter to linear layer weights only (exclude biases and non-linear layers) - linear_weight_keys = [] - for key in all_weight_keys: - # Skip biases, layer norms, embeddings, etc. 
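For reference, the extended quantize_model() signature introduced in this hunk can also be driven programmatically. The sketch below is an editor's illustration, not code from the patch; the paths are placeholders and the import path simply follows the file's location in the repo.

# Illustrative sketch only: drive the new auto-GPTQ path from Python.
# Paths are placeholders; parameter names follow the signature added above.
from diffulex.utils.quantization.quantize_model import quantize_model

quantize_model(
    model_path="/path/to/Dream-v0-Base-7B",        # placeholder
    output_path="/path/to/output-gptq-marlin",     # placeholder
    quant_format="gptq_marlin",
    group_size=128,
    bits=4,
    device="cuda",                 # gptq_marlin export needs CUDA (vLLM Marlin repack)
    quant_method="auto",           # real GPTQ via auto-gptq, requires calibration data
    calib_text_file="/path/to/calib.txt",          # placeholder; one sample per line
    calib_num_samples=128,
    calib_seq_len=512,
)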
- # Note: lm_head is excluded because ParallelLMHead doesn't support offline quantization yet - if any(skip in key for skip in [".bias", ".norm", ".embed", ".lm_head"]): - continue - # Only process weight parameters - if not key.endswith(".weight"): - continue - # Check if target_modules filter applies - if target_modules: - if not any(target in key for target in target_modules): - continue - linear_weight_keys.append(key) - - print(f"Found {len(linear_weight_keys)} linear layer weights to quantize") - - # Quantize each linear layer - quantized_weights = {} + # Load model config (for tokenizer special tokens, etc.) + _ = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + + quantized_weights: dict[str, torch.Tensor] = {} metadata = { "quant_format": quant_format, + "quant_method": quant_method, "group_size": group_size, "bits": bits, "quantized_modules": [], } - - for key in tqdm(linear_weight_keys, desc="Quantizing weights"): - # Load weight from safetensors - weight = None - source_file = None - for file in safetensors_files: - with safe_open(file, "pt", device) as f: - if key in f.keys(): - weight = f.get_tensor(key) - source_file = file - break - - if weight is None: - print(f"Warning: Could not load weight for {key}") - continue - - # Skip if weight is not 2D (not a linear layer weight) - if weight.dim() != 2: - print(f"Skipping {key}: not a 2D weight (shape: {weight.shape})") - continue - - out_features, in_features = weight.shape - - # Convert to float32 for quantization - weight_fp32 = weight.to(torch.float32).to(device) - - # Quantize - prefix = key[:-7] # Remove ".weight" - if quant_format == "gptq": - qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq( - weight_fp32, group_size=group_size, bits=bits, use_v2_format=False + + # ---------------------------- + # 真 GPTQ/AWQ(需要校准数据) + # ---------------------------- + if quant_method == "auto": + if calib_text_file is None: + raise ValueError("quant_method=auto 需要提供 --calib-text-file") + + texts = _load_calib_texts(calib_text_file, num_samples=calib_num_samples, seed=calib_seed) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True) + if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + + if quant_format in ("gptq", "gptq_marlin"): + if quant_format == "gptq_marlin" and device != "cuda": + raise ValueError("导出 gptq_marlin 需要 --device cuda") + + AutoGPTQForCausalLM, BaseQuantizeConfig = _require_auto_gptq() + examples = _build_autogptq_examples(tokenizer, texts, seq_len=calib_seq_len) + + qcfg = BaseQuantizeConfig( + bits=int(bits), + group_size=int(group_size), + damp_percent=float(damp_percent), + desc_act=bool(desc_act), + sym=bool(sym), + true_sequential=bool(true_sequential), ) - elif quant_format == "gptq_marlin": - qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq_marlin( - weight_fp32, group_size=group_size, bits=bits + + model_init_kwargs = { + "trust_remote_code": True, + } + # 让 AutoGPTQ 自己用 accelerate 做 device_map;CPU 模式下走默认加载。 + if device == "cuda": + model_init_kwargs["device_map"] = "auto" + model_init_kwargs["torch_dtype"] = torch.float16 + + gptq_model = AutoGPTQForCausalLM.from_pretrained( + model_path, + qcfg, + **model_init_kwargs, + ) + gptq_model.quantize( + examples, + batch_size=int(calib_batch_size), + use_triton=bool(use_triton), + cache_examples_on_gpu=(device == "cuda"), ) - quantized_weights[f"{prefix}.qweight"] = qweight.cpu() - quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() - 
quantized_weights[f"{prefix}.scales"] = scales.cpu() - # Keep g_idx key for compatibility (often empty when desc_act=False). - quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() + + quantized_weights = _export_autogptq_to_vllm_weights( + gptq_base_model=gptq_model.model, + quant_format=quant_format, + target_modules=target_modules, + desc_act=bool(desc_act), + bits=int(bits), + group_size=int(group_size), + ) + else: # awq - qweight, qzeros, scales = _quantize_to_vllm_awq( - weight_fp32, group_size=group_size, bits=bits + if bits != 4: + raise ValueError(f"AWQ 目前仅支持 4-bit,当前 bits={bits}") + AutoAWQForCausalLM = _require_awq() + + awq_model = AutoAWQForCausalLM.from_pretrained( + model_path, + trust_remote_code=True, + safetensors=True, + device_map="auto" if device == "cuda" else None, + torch_dtype="auto", + ) + + awq_model.quantize( + tokenizer=tokenizer, + quant_config={ + "zero_point": True, + "q_group_size": int(group_size), + "w_bit": int(bits), + "version": "GEMM", + }, + calib_data=texts, + max_calib_samples=int(calib_num_samples), + max_calib_seq_len=int(calib_seq_len), + ) + awq_model.pack() + + quantized_weights = _export_awq_to_vllm_weights( + awq_base_model=awq_model.model, + target_modules=target_modules, + ) + + # ---------------------------- + # 旧实现(无校准,不是真 GPTQ/AWQ) + # ---------------------------- + else: + safetensors_files = list(glob(os.path.join(model_path, "*.safetensors"))) + if not safetensors_files: + raise ValueError(f"No safetensors files found in {model_path}") + + print(f"Found {len(safetensors_files)} safetensors files") + + all_weight_keys: list[str] = [] + for file in safetensors_files: + with safe_open(file, "pt", device) as f: + all_weight_keys.extend(f.keys()) + + linear_weight_keys: list[str] = [] + for key in all_weight_keys: + if any(skip in key for skip in [".bias", ".norm", ".embed", ".lm_head"]): + continue + if not key.endswith(".weight"): + continue + if target_modules and not any(target in key for target in target_modules): + continue + linear_weight_keys.append(key) + + print(f"Found {len(linear_weight_keys)} linear layer weights to quantize") + + for key in tqdm(linear_weight_keys, desc="Quantizing weights (simple)"): + weight = None + for file in safetensors_files: + with safe_open(file, "pt", device) as f: + if key in f.keys(): + weight = f.get_tensor(key) + break + + if weight is None: + print(f"Warning: Could not load weight for {key}") + continue + if weight.dim() != 2: + print(f"Skipping {key}: not a 2D weight (shape: {weight.shape})") + continue + + out_features, in_features = weight.shape + weight_fp32 = weight.to(torch.float32).to(device) + prefix = key[:-7] # Remove ".weight" + + if quant_format == "gptq": + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq( + weight_fp32, group_size=group_size, bits=bits, use_v2_format=False + ) + quantized_weights[f"{prefix}.qweight"] = qweight.cpu() + quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() + quantized_weights[f"{prefix}.scales"] = scales.cpu() + quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() + + elif quant_format == "gptq_marlin": + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq_marlin( + weight_fp32, group_size=group_size, bits=bits + ) + quantized_weights[f"{prefix}.qweight"] = qweight.cpu() + quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() + quantized_weights[f"{prefix}.scales"] = scales.cpu() + quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() + + else: # awq + qweight, qzeros, scales = _quantize_to_vllm_awq( + weight_fp32, group_size=group_size, 
bits=bits + ) + quantized_weights[f"{prefix}.qweight"] = qweight.cpu() + quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() + quantized_weights[f"{prefix}.scales"] = scales.cpu() + + metadata["quantized_modules"].append( + { + "name": prefix, + "out_features": int(out_features), + "in_features": int(in_features), + "group_size": group_size, + "bits": bits, + } ) - quantized_weights[f"{prefix}.qweight"] = qweight.cpu() - quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() - quantized_weights[f"{prefix}.scales"] = scales.cpu() - - metadata["quantized_modules"].append({ - "name": prefix, - "out_features": int(out_features), - "in_features": int(in_features), - "group_size": group_size, - "bits": bits, - }) - - # Clear GPU cache if using CUDA - if device == "cuda": - torch.cuda.empty_cache() + + if device == "cuda": + torch.cuda.empty_cache() # Copy all model files (config, tokenizer, etc.) to output directory import shutil @@ -379,22 +678,34 @@ def quantize_model( with open(metadata_file, "w") as f: json.dump(metadata, f, indent=2) - # vLLM GPTQ/GPTQ-Marlin 会读取 quantize_config.json - # - gptq_marlin: 需要 sym/desc_act 等字段用于识别并选择 Marlin kernel - if quant_format == "gptq_marlin": + # vLLM/Diffulex 会读取 quantize_config.json 识别量化类型与超参 + if quant_format in ("gptq", "gptq_marlin", "awq"): + if quant_format == "gptq_marlin": + cfg_desc_act = False + cfg_sym = True + cfg_ckpt = "gptq_marlin" + elif quant_format == "gptq": + cfg_desc_act = bool(desc_act) + cfg_sym = bool(sym) + cfg_ckpt = "gptq" + else: # awq + cfg_desc_act = False + cfg_sym = False + cfg_ckpt = "awq" + quantize_cfg = { "bits": int(bits), "group_size": int(group_size), - "desc_act": False, - "sym": True, + "desc_act": bool(cfg_desc_act), + "sym": bool(cfg_sym), "lm_head": False, - "checkpoint_format": "gptq_marlin", + "checkpoint_format": cfg_ckpt, } - with open(output_path / "quantize_config.json", "w") as f: + with open(output_path / "quantize_config.json", "w", encoding="utf-8") as f: json.dump(quantize_cfg, f, indent=2) print(f"\n✓ Quantization complete!") - print(f" - Quantized {len(metadata['quantized_modules'])} modules") + print(f" - Quant method: {quant_method}") print(f" - Output directory: {output_path}") print(f" - Quantized weights file: {output_file}") print(f" - Metadata file: {metadata_file}") @@ -420,6 +731,48 @@ def main(): parser.add_argument("--bits", type=int, default=4, help="每个权重的位数 (默认: 4)") parser.add_argument("--target-modules", type=str, help="要量化的模块名称模式(逗号分隔),例如: q_proj,k_proj,v_proj") parser.add_argument("--device", type=str, choices=["cpu", "cuda"], default="cpu", help="量化设备 (默认: cpu)") + parser.add_argument( + "--quant-method", + type=str, + choices=["auto", "simple"], + default="auto", + help="量化方法: auto(真 GPTQ/AWQ, 需要校准数据) / simple(旧实现, 无校准)", + ) + parser.add_argument("--calib-text-file", type=str, default=None, help="校准文本文件(每行一条样本)") + parser.add_argument("--calib-num-samples", type=int, default=128, help="校准样本数 (默认: 128)") + parser.add_argument("--calib-seq-len", type=int, default=512, help="校准序列长度 (默认: 512)") + parser.add_argument("--calib-batch-size", type=int, default=1, help="校准 batch size (默认: 1)") + parser.add_argument("--calib-seed", type=int, default=0, help="校准采样随机种子 (默认: 0)") + parser.add_argument("--desc-act", action="store_true", help="GPTQ act-order(desc_act) (默认: False)") + parser.add_argument("--sym", dest="sym", action="store_true", default=True, help="GPTQ symmetric quant (默认: True)") + parser.add_argument("--no-sym", dest="sym", action="store_false", help="关闭 GPTQ symmetric quant") 
+ parser.add_argument("--damp-percent", type=float, default=0.01, help="GPTQ damp_percent (默认: 0.01)") + parser.add_argument( + "--true-sequential", + dest="true_sequential", + action="store_true", + default=True, + help="GPTQ true_sequential (默认: True)", + ) + parser.add_argument( + "--no-true-sequential", + dest="true_sequential", + action="store_false", + help="关闭 GPTQ true_sequential", + ) + parser.add_argument( + "--use-triton", + dest="use_triton", + action="store_true", + default=True, + help="AutoGPTQ 使用 Triton backend (默认: True)", + ) + parser.add_argument( + "--no-triton", + dest="use_triton", + action="store_false", + help="关闭 AutoGPTQ Triton backend(可能回退到 CUDA extension)", + ) args = parser.parse_args() @@ -435,6 +788,17 @@ def main(): bits=args.bits, target_modules=target_modules, device=args.device, + quant_method=args.quant_method, + calib_text_file=args.calib_text_file, + calib_num_samples=args.calib_num_samples, + calib_seq_len=args.calib_seq_len, + calib_batch_size=args.calib_batch_size, + calib_seed=args.calib_seed, + desc_act=bool(args.desc_act), + sym=bool(args.sym), + damp_percent=float(args.damp_percent), + true_sequential=bool(args.true_sequential), + use_triton=bool(args.use_triton), ) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py index da81d3e..c544166 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py @@ -112,8 +112,8 @@ def linear_forward( else: raise RuntimeError(f"gptq_marlin: unsupported weight_bits={weight_bits} (expected 4 or 8)") - # vLLM marlin kernels expect FP16 activations. - x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + # Align with vLLM Marlin: accept bf16/fp16 activations directly. + x_in = x # g_idx can be empty (desc_act=False). Ensure correct dtype/device. if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): @@ -152,5 +152,5 @@ def linear_forward( bias=marlin_bias, input_dtype=None, ) - return out.to(dtype=x.dtype) if out.dtype != x.dtype else out + return out From 8824ccdbaf1a7651b617ceda9ab7f1a44974b57c Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 18 Jan 2026 15:33:01 +0000 Subject: [PATCH 04/10] =?UTF-8?q?refactor:=20=E4=BC=98=E5=8C=96=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E7=BB=93=E6=9E=84=E5=92=8C=E6=B6=88=E9=99=A4=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 主要重构内容: 1. **diffulex/layer/linear.py** - 大幅简化量化逻辑(-197行): - 新增 `_forward_base()`: 统一的前向分发器,替换子类中重复的量化分支逻辑 - 新增 `_build_offline_forward_kwargs()`: 统一构建离线量化(GPTQ/AWQ)前向参数 - 新增 `_get_linear_strategy()`, `_offline_meta()`, `_infer_gptq_weight_bits()` 等辅助方法 - 修复 `LoRAMixin.merge_lora` 中 base weight 为 None 的边界情况 - 移除未使用的导入(marlin_zero_points, unpack_cols, marlin_make_empty_g_idx) 2. **diffulex/utils/loader.py** - 优化性能和代码结构: - 一次性扫描 safetensors 文件建立 key_to_file 索引,避免重复文件 I/O - 缓存 `model.named_modules()` 结果,避免重复构建字典 - 新增 `_find_offline_capable_module()`: 统一模块查找逻辑 - 新增 `_load_tensors_for_prefix()`: 集中加载张量,仅打开必要的文件 - 将 print() 替换为 logger.warning()/logger.exception() 以规范化日志 3. **diffulex/engine/model_runner.py** - 消除重复循环: - 在 `allocate_kv_cache` 中统一缓存 attention 模块列表 - 用 `enumerate(attn_modules)` 替换重复的模块遍历循环 4. **diffulex/utils/quantization/strategies/linear_int4_w4a16.py** - 修复缺失实现: - 添加 `quantize_weight_for_kernel` 方法,修复 W4A16 在线量化运行时错误 5. 
删除未使用的配置文件 `gptq_marlin_w2_bf16kv_varlen.yml` 测试: 已验证 W8A16 在线量化和 GPTQ 离线量化功能正常 --- diffulex/engine/model_runner.py | 34 +- diffulex/layer/linear.py | 492 ++++++------------ diffulex/utils/loader.py | 220 ++++---- .../strategies/linear_int4_w4a16.py | 18 + .../configs/gptq_marlin_w2_bf16kv_varlen.yml | 47 -- 5 files changed, 307 insertions(+), 504 deletions(-) delete mode 100644 diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml diff --git a/diffulex/engine/model_runner.py b/diffulex/engine/model_runner.py index c347fb3..eaa6e0a 100755 --- a/diffulex/engine/model_runner.py +++ b/diffulex/engine/model_runner.py @@ -217,6 +217,13 @@ def allocate_kv_cache(self): f"for kv cache on rank {self.rank}." ) + # Cache the list of Attention-like modules once, to keep binding logic consistent + # across cache layout branches (and avoid duplicated traversal). + attn_modules = [ + m for m in self.model.modules() + if hasattr(m, "k_cache") and hasattr(m, "v_cache") + ] + if config.kv_cache_layout == "distinct": x = config.k_cache_hdim_split_factor_x self.k_cache = torch.zeros( @@ -236,12 +243,9 @@ def allocate_kv_cache(self): self.block_size, dtype=storage_dtype, ) - layer_id = 0 - for module in self.model.modules(): - if hasattr(module, "k_cache") and hasattr(module, "v_cache"): - module.k_cache = self.k_cache[layer_id] - module.v_cache = self.v_cache[layer_id] - layer_id += 1 + for layer_id, module in enumerate(attn_modules): + module.k_cache = self.k_cache[layer_id] + module.v_cache = self.v_cache[layer_id] elif config.kv_cache_layout == "unified": self.kv_cache = torch.zeros( 2, @@ -252,12 +256,9 @@ def allocate_kv_cache(self): head_dim, dtype=storage_dtype, ) - layer_id = 0 - for module in self.model.modules(): - if hasattr(module, "k_cache") and hasattr(module, "v_cache"): - module.k_cache = self.kv_cache[0, layer_id] - module.v_cache = self.kv_cache[1, layer_id] - layer_id += 1 + for layer_id, module in enumerate(attn_modules): + module.k_cache = self.kv_cache[0, layer_id] + module.v_cache = self.kv_cache[1, layer_id] else: raise ValueError( "Unsupported kv_cache_layout: {layout}. 
Supported values are 'distinct' and 'unified'.".format( @@ -287,12 +288,9 @@ def allocate_kv_cache(self): self.v_scale[:] = v_scale_init[None, :] # Bind scales to Attention modules - layer_id = 0 - for module in self.model.modules(): - if hasattr(module, "k_cache") and hasattr(module, "v_cache"): - module.k_scale = self.k_scale[layer_id] - module.v_scale = self.v_scale[layer_id] - layer_id += 1 + for layer_id, module in enumerate(attn_modules): + module.k_scale = self.k_scale[layer_id] + module.v_scale = self.v_scale[layer_id] def prepare_block_tables(self, seqs: list[SequenceBase]): max_len = max(len(seq.block_table) for seq in seqs) diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index 0ba2ceb..f26566d 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -400,8 +400,6 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: marlin_make_workspace_new, marlin_permute_scales, marlin_sort_g_idx, - marlin_zero_points, - unpack_cols, ) except Exception as e: # pragma: no cover raise RuntimeError( @@ -510,7 +508,6 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: from vllm import _custom_ops as ops # type: ignore from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore awq_to_marlin_zero_points, - marlin_make_empty_g_idx, marlin_make_workspace_new, marlin_permute_scales, ) @@ -570,8 +567,6 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: is_a_8bit=False, ) - # g_idx not used for AWQ marlin (keep empty, strategy will pass empties). - _ = marlin_make_empty_g_idx # keep import referenced for clarity self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: torch.Tensor) -> None: @@ -707,6 +702,168 @@ def _maybe_quantize_loaded_weight_param( # Keep attribute for compatibility, but ensure forward uses quant buffers. setattr(self, "weight", None) + def _get_linear_strategy(self): + """Return strategy for current `quant_kind` (or None). + + NOTE: do not swallow TypeError here; a wrong strategy type should fail fast. + """ + return get_linear_strategy(self.quant_kind) + + def _offline_meta(self) -> tuple[int, int, int]: + """Return (out_features, in_features, group_size) for offline GPTQ/AWQ.""" + return ( + int(self._offline_quant_out_features.item()), + int(self._offline_quant_in_features.item()), + int(self._offline_quant_group_size.item()), + ) + + def _infer_gptq_weight_bits(self, *, in_features: int) -> int: + """Infer/return GPTQ weight bits for downstream kernels. + + Priority: + - use recorded bits (e.g., marlin-exported layouts), + - otherwise infer from qweight packing. 
+ """ + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits > 0: + return bits + if self.gptq_qweight.numel() == 0: + raise RuntimeError("GPTQ bits 推断失败:gptq_qweight 为空。") + if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: + raise RuntimeError( + f"GPTQ bits 推断失败:in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + if 32 % pack_factor != 0: + raise RuntimeError(f"GPTQ bits 推断失败:pack_factor={pack_factor} 不满足 32%pack_factor==0") + return 32 // pack_factor + + def _maybe_int4_original_in_features_kwargs(self, strategy, x: torch.Tensor) -> dict: + """Some int4 kernels need original K (before packing).""" + if strategy is None: + return {} + if getattr(strategy, "linear_weight_format", None) == "int4": + return {"original_in_features": x.shape[1]} + return {} + + def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: + """Build kwargs for offline GPTQ/AWQ (including Marlin variants).""" + if strategy is None: + raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + + format_val = int(self._offline_quant_format.item()) + weight_format = getattr(strategy, "linear_weight_format", None) + out_features, in_features, group_size = self._offline_meta() + + meta = { + "out_features": out_features, + "in_features": in_features, + "group_size": group_size, + } + + if format_val == 1: # GPTQ + # IMPORTANT: only gptq_gemm needs gptq_shuffle; marlin variants require the original format. + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + return { + **meta, + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + # Always pass g_idx (can be empty). vLLM expects it for GPTQ kernels. + "gptq_g_idx": self.gptq_g_idx, + } + + if weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + return { + **meta, + "gptq_weight_bits": bits, + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + } + + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} is not compatible." + ) + + if format_val == 2: # AWQ + if weight_format == "awq": + return { + **meta, + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + } + + if weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + return { + **meta, + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + } + + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} is not compatible." 
+ ) + + raise RuntimeError(f"Unknown offline quant format: {format_val}") + + def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: + """Unified forward dispatcher for bf16 / online quant / offline GPTQ/AWQ.""" + strategy = self._get_linear_strategy() + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. + self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) + + # Offline quantized weights (GPTQ/AWQ) have higher priority. + if self.has_offline_quantized_weight(): + if strategy is None: + raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + kwargs = self._build_offline_forward_kwargs(x, strategy) + return strategy.linear_forward( + x, + None, # weight not used for offline quantized weights + bias, + quant_kind=self.quant_kind, + **kwargs, + ) + + if self.has_quantized_weight(): + if strategy is None: + raise RuntimeError("Quantized weight is present but no linear strategy is configured.") + kwargs = {"quant_scales": self.quant_scales} + kwargs.update(self._maybe_int4_original_in_features_kwargs(strategy, x)) + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + **kwargs, + ) + + if strategy is None: + weight = getattr(self, "weight", None) + if weight is None: + raise RuntimeError("No strategy is configured and bf16 weight is missing.") + return F.linear(x, weight, bias) + + weight = getattr(self, "weight", None) + if weight is None: + raise RuntimeError("Strategy is configured but weight is missing (expected bf16 weight).") + kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) + return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) + def forward(self, x: torch.Tensor) -> torch.Tensor: raise NotImplementedError @@ -739,115 +896,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): self._maybe_quantize_loaded_weight_param(param, loaded_shard_id=None, expected_shard_ids={None}) def forward(self, x: torch.Tensor) -> torch.Tensor: - strategy = get_linear_strategy(self.quant_kind) - # Runtime safety net: ensure we don't keep bf16+quant weights both resident. - self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) - - # Check for offline quantized weights (GPTQ/AWQ) first - if self.has_offline_quantized_weight(): - if strategy is None: - raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) - out_features = int(self._offline_quant_out_features.item()) - in_features = int(self._offline_quant_in_features.item()) - group_size = int(self._offline_quant_group_size.item()) - weight_format = getattr(strategy, "linear_weight_format", None) - - kwargs = { - "out_features": out_features, - "in_features": in_features, - "group_size": group_size, - } - - if format_val == 1: # GPTQ - # IMPORTANT: only gptq_gemm needs gptq_shuffle; marlin variants require the original format. - if weight_format == "gptq": - self._maybe_prepare_offline_gptq(x) - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - # Always pass g_idx (can be empty). vLLM expects it for GPTQ kernels. - kwargs["gptq_g_idx"] = self.gptq_g_idx - elif weight_format == "gptq_marlin": - self._maybe_prepare_offline_gptq_marlin(x) - # Expose bits (needed to select scalar_types.* in strategy). 
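A quick numeric check of the packing arithmetic that `_infer_gptq_weight_bits` (added above) relies on; the values below are illustrative and not taken from the patch.

# 4-bit GPTQ, K = in_features = 4096:
#   pack factor = 32 // bits = 8, so the packed int32 qweight has K // 8 = 512 rows.
# The helper reverses this mapping when _offline_quant_bits was not recorded:
in_features = 4096
qweight_rows = 512                            # gptq_qweight.shape[0]
pack_factor = in_features // qweight_rows     # -> 8 (must divide 32 evenly)
bits = 32 // pack_factor                      # -> 4
assert bits == 4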
- bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 - if bits <= 0: - pack_factor = in_features // int(self.gptq_qweight.shape[0]) - bits = 32 // pack_factor - kwargs["gptq_weight_bits"] = bits - kwargs.update({ - "gptq_marlin_qweight": self.gptq_marlin_qweight, - "gptq_marlin_scales": self.gptq_marlin_scales, - "gptq_marlin_zp": self.gptq_marlin_zp, - "gptq_marlin_g_idx": self.gptq_marlin_g_idx, - "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, - "gptq_marlin_workspace": self.gptq_marlin_workspace, - }) - else: - raise RuntimeError( - f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - elif format_val == 2: # AWQ - if weight_format == "awq": - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) - elif weight_format == "awq_marlin": - self._maybe_prepare_offline_awq_marlin(x) - kwargs.update({ - "awq_marlin_qweight": self.awq_marlin_qweight, - "awq_marlin_scales": self.awq_marlin_scales, - "awq_marlin_zp": self.awq_marlin_zp, - "awq_marlin_workspace": self.awq_marlin_workspace, - "awq_weight_bits": 4, - }) - else: - raise RuntimeError( - f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - - base_out = strategy.linear_forward( - x, - None, # weight not used for offline quantized weights - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif self.has_quantized_weight(): - if strategy is None: - raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - # For int4 (W4A16), we need to pass original_in_features - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {"quant_scales": self.quant_scales} - if weight_format == "int4": - # For int4, packed weight shape is [out_features, (in_features + 1) // 2] - # We use x.shape[1] as the source of truth (it's the actual K dimension) - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward( - x, - self.quant_weight_int8, - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif strategy is None: - base_out = F.linear(x, self.weight, self.bias) - else: - # For int4 strategies (W4A16/W4A8), we need to pass original_in_features even when weight is not quantized yet - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {} - if weight_format == "int4": - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward(x, self.weight, self.bias, quant_kind=self.quant_kind, **kwargs) + base_out = self._forward_base(x, self.bias) return self.lora_forward(x, base_out) @@ -886,112 +935,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): self._maybe_quantize_loaded_weight_param(param, loaded_shard_id=None, expected_shard_ids={None}) def forward(self, x: torch.Tensor) -> torch.Tensor: - strategy = get_linear_strategy(self.quant_kind) - # Runtime safety net: ensure we don't keep bf16+quant weights both resident. 
- self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) - - # Check for offline quantized weights (GPTQ/AWQ) first - if self.has_offline_quantized_weight(): - if strategy is None: - raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) - out_features = int(self._offline_quant_out_features.item()) - in_features = int(self._offline_quant_in_features.item()) - group_size = int(self._offline_quant_group_size.item()) - weight_format = getattr(strategy, "linear_weight_format", None) - - kwargs = { - "out_features": out_features, - "in_features": in_features, - "group_size": group_size, - } - - if format_val == 1: # GPTQ - if weight_format == "gptq": - self._maybe_prepare_offline_gptq(x) - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - kwargs["gptq_g_idx"] = self.gptq_g_idx - elif weight_format == "gptq_marlin": - self._maybe_prepare_offline_gptq_marlin(x) - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 - if bits <= 0: - pack_factor = in_features // int(self.gptq_qweight.shape[0]) - bits = 32 // pack_factor - kwargs["gptq_weight_bits"] = bits - kwargs.update({ - "gptq_marlin_qweight": self.gptq_marlin_qweight, - "gptq_marlin_scales": self.gptq_marlin_scales, - "gptq_marlin_zp": self.gptq_marlin_zp, - "gptq_marlin_g_idx": self.gptq_marlin_g_idx, - "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, - "gptq_marlin_workspace": self.gptq_marlin_workspace, - }) - else: - raise RuntimeError( - f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - elif format_val == 2: # AWQ - if weight_format == "awq": - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) - elif weight_format == "awq_marlin": - self._maybe_prepare_offline_awq_marlin(x) - kwargs.update({ - "awq_marlin_qweight": self.awq_marlin_qweight, - "awq_marlin_scales": self.awq_marlin_scales, - "awq_marlin_zp": self.awq_marlin_zp, - "awq_marlin_workspace": self.awq_marlin_workspace, - "awq_weight_bits": 4, - }) - else: - raise RuntimeError( - f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." 
- ) - - base_out = strategy.linear_forward( - x, - None, # weight not used for offline quantized weights - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif self.has_quantized_weight(): - if strategy is None: - raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - # For int4 (W4A16), we need to pass original_in_features - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {"quant_scales": self.quant_scales} - if weight_format == "int4": - # For int4, packed weight shape is [out_features, (in_features + 1) // 2] - # We use x.shape[1] as the source of truth (it's the actual K dimension) - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward( - x, - self.quant_weight_int8, - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif strategy is None: - base_out = F.linear(x, self.weight, self.bias) - else: - # For int4 strategies (W4A16/W4A8), we need to pass original_in_features even when weight is not quantized yet - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {} - if weight_format == "int4": - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward(x, self.weight, self.bias, quant_kind=self.quant_kind, **kwargs) + base_out = self._forward_base(x, self.bias) return self.lora_forward(x, base_out) @@ -1107,113 +1051,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if self.tp_rank == 0 else None - strategy = get_linear_strategy(self.quant_kind) - # Runtime safety net: ensure we don't keep bf16+quant weights both resident. - self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) - - # Check for offline quantized weights (GPTQ/AWQ) first - if self.has_offline_quantized_weight(): - if strategy is None: - raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) - out_features = int(self._offline_quant_out_features.item()) - in_features = int(self._offline_quant_in_features.item()) - group_size = int(self._offline_quant_group_size.item()) - weight_format = getattr(strategy, "linear_weight_format", None) - - kwargs = { - "out_features": out_features, - "in_features": in_features, - "group_size": group_size, - } - - if format_val == 1: # GPTQ - if weight_format == "gptq": - # vLLM requires gptq_shuffle before first gptq_gemm. - self._maybe_prepare_offline_gptq(x) - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - # Always pass g_idx (can be empty); strategy will normalize dtype/device. 
- kwargs["gptq_g_idx"] = self.gptq_g_idx - elif weight_format == "gptq_marlin": - self._maybe_prepare_offline_gptq_marlin(x) - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 - if bits <= 0: - pack_factor = in_features // int(self.gptq_qweight.shape[0]) - bits = 32 // pack_factor - kwargs["gptq_weight_bits"] = bits - kwargs.update({ - "gptq_marlin_qweight": self.gptq_marlin_qweight, - "gptq_marlin_scales": self.gptq_marlin_scales, - "gptq_marlin_zp": self.gptq_marlin_zp, - "gptq_marlin_g_idx": self.gptq_marlin_g_idx, - "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, - "gptq_marlin_workspace": self.gptq_marlin_workspace, - }) - else: - raise RuntimeError( - f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - elif format_val == 2: # AWQ - if weight_format == "awq": - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) - elif weight_format == "awq_marlin": - self._maybe_prepare_offline_awq_marlin(x) - kwargs.update({ - "awq_marlin_qweight": self.awq_marlin_qweight, - "awq_marlin_scales": self.awq_marlin_scales, - "awq_marlin_zp": self.awq_marlin_zp, - "awq_marlin_workspace": self.awq_marlin_workspace, - "awq_weight_bits": 4, - }) - else: - raise RuntimeError( - f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - - y = strategy.linear_forward( - x, - None, # weight not used for offline quantized weights - bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif self.has_quantized_weight(): - if strategy is None: - raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - # For int4 (W4A16), we must pass original_in_features to disambiguate packed K. - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {"quant_scales": self.quant_scales} - if weight_format == "int4": - # Use activation K as the source of truth (it's the actual K dimension). - kwargs["original_in_features"] = x.shape[1] - y = strategy.linear_forward( - x, - self.quant_weight_int8, - bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif strategy is None: - y = F.linear(x, self.weight, bias) - else: - # For int4 strategies (W4A16/W4A8), we need to pass original_in_features even when weight is not quantized yet - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {} - if weight_format == "int4": - kwargs["original_in_features"] = x.shape[1] - y = strategy.linear_forward(x, self.weight, bias, quant_kind=self.quant_kind, **kwargs) + y = self._forward_base(x, bias) if self.tp_size > 1: dist.all_reduce(y) return self.lora_forward(x, y) diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index 622e7e2..73ffb92 100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -226,38 +226,78 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): if not (use_gptq or use_awq): return loaded_gptq, loaded_awq, skipped - # Collect all weight names from safetensors files - all_keys = [] all_files = list(glob(os.path.join(config.model, "*.safetensors"))) + + # Scan keys once and remember which file contains each key. + # This avoids the O(num_modules * num_files) "search every file for every module" pattern below. 
+ key_to_file: dict[str, str] = {} + module_keys: dict[str, dict[str, str]] = {} + offline_suffixes = (".qweight", ".qzeros", ".scales", ".g_idx") for file in all_files: with safe_open(file, "pt", "cpu") as f: - all_keys.extend(f.keys()) - - # Group keys by module prefix - module_keys: dict[str, dict[str, str]] = {} - for key in all_keys: - # Check for GPTQ/AWQ keys: {prefix}.qweight, {prefix}.qzeros, {prefix}.scales, {prefix}.g_idx (GPTQ only) - if key.endswith(".qweight"): - prefix = key[:-8] # Remove ".qweight" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["qweight"] = key - elif key.endswith(".qzeros"): - prefix = key[:-7] # Remove ".qzeros" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["qzeros"] = key - elif key.endswith(".scales"): - prefix = key[:-7] # Remove ".scales" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["scales"] = key - elif key.endswith(".g_idx"): - prefix = key[:-6] # Remove ".g_idx" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["g_idx"] = key + for key in f.keys(): + if not key.endswith(offline_suffixes): + continue + key_to_file[key] = file + # Group by module prefix: {prefix}.qweight, {prefix}.qzeros, {prefix}.scales, {prefix}.g_idx (GPTQ only) + if key.endswith(".qweight"): + prefix = key[:-8] + module_keys.setdefault(prefix, {})["qweight"] = key + elif key.endswith(".qzeros"): + prefix = key[:-7] + module_keys.setdefault(prefix, {})["qzeros"] = key + elif key.endswith(".scales"): + prefix = key[:-7] + module_keys.setdefault(prefix, {})["scales"] = key + else: # .g_idx + prefix = key[:-6] + module_keys.setdefault(prefix, {})["g_idx"] = key + + # Cache modules lookup to avoid rebuilding dict(model.named_modules()) repeatedly. + named_modules = dict(model.named_modules()) + offline_capable_modules: dict[str, nn.Module] = { + name: m for name, m in named_modules.items() if hasattr(m, "set_offline_quantized_weight") + } + def _find_offline_capable_module(module_name: str) -> nn.Module | None: + """Best-effort resolve module_name to a module with offline quant support.""" + m = offline_capable_modules.get(module_name) + if m is not None: + return m + + # Try a few naming fallbacks (keep behavior compatible with the previous implementation). + leaf = module_name.split(".")[-1] if module_name else module_name + for name, cand in offline_capable_modules.items(): + if ( + name == module_name + or name.endswith("." + module_name) + or module_name.endswith("." 
+ name) + or (name.split(".")[-1] == leaf) + ): + return cand + return None + + def _load_tensors_for_prefix(key_dict: dict[str, str], *, want_g_idx: bool) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]: + """Load qweight/qzeros/scales/(g_idx) from the minimal set of safetensors files.""" + qweight = qzeros = scales = g_idx = None + keys = [key_dict.get("qweight"), key_dict.get("qzeros"), key_dict.get("scales")] + if want_g_idx: + keys.append(key_dict.get("g_idx")) + files_needed = {key_to_file.get(k) for k in keys if k} + files_needed.discard(None) + + for file in files_needed: + with safe_open(file, "pt", "cpu") as f: + if qweight is None and (key_dict.get("qweight") in f.keys()): + qweight = f.get_tensor(key_dict["qweight"]) + if qzeros is None and (key_dict.get("qzeros") in f.keys()): + qzeros = f.get_tensor(key_dict["qzeros"]) + if scales is None and (key_dict.get("scales") in f.keys()): + scales = f.get_tensor(key_dict["scales"]) + if want_g_idx and g_idx is None and ("g_idx" in key_dict) and (key_dict["g_idx"] in f.keys()): + g_idx = f.get_tensor(key_dict["g_idx"]) + return qweight, qzeros, scales, g_idx + # Load GPTQ/AWQ weights for each module packed_modules_mapping = getattr(model, "packed_modules_mapping", {}) @@ -272,31 +312,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): module_name = prefix.replace(k, v) break - # Try to find the module try: - module = None - # Try exact match first - try: - module = dict(model.named_modules())[module_name] - if not hasattr(module, "set_offline_quantized_weight"): - module = None - except KeyError: - pass - - # Try partial match if exact match failed - if module is None: - for name, m in model.named_modules(): - # Handle different naming conventions - if ( - name == module_name - or name.endswith("." + module_name) - or module_name.endswith("." + name) - or (name.split(".")[-1] == module_name.split(".")[-1]) - ): - if hasattr(m, "set_offline_quantized_weight"): - module = m - break - + module = _find_offline_capable_module(module_name) if module is None: skipped += 1 continue @@ -316,27 +333,10 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): skipped += 1 continue - # Load tensors from safetensors files - qweight = None - qzeros = None - scales = None - g_idx = None - - for file in all_files: - with safe_open(file, "pt", "cpu") as f: - if key_dict["qweight"] in f.keys() and qweight is None: - qweight = f.get_tensor(key_dict["qweight"]) - if key_dict["qzeros"] in f.keys() and qzeros is None: - qzeros = f.get_tensor(key_dict["qzeros"]) - if key_dict["scales"] in f.keys() and scales is None: - scales = f.get_tensor(key_dict["scales"]) - if format == "gptq" and "g_idx" in key_dict and key_dict["g_idx"] in f.keys() and g_idx is None: - g_idx = f.get_tensor(key_dict["g_idx"]) - - # Early exit if all required tensors are loaded - if qweight is not None and qzeros is not None and scales is not None: - if format != "gptq" or g_idx is not None: - break + # Load tensors from the minimal set of safetensors files. 
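+            # g_idx is only requested for GPTQ checkpoints (want_g_idx=True); AWQ checkpoints have no
+            # g_idx, and GPTQ exports with desc_act=False may not contain a meaningful one either.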
+ qweight, qzeros, scales, g_idx = _load_tensors_for_prefix( + key_dict, want_g_idx=(format == "gptq") + ) if qweight is None or qzeros is None or scales is None: skipped += 1 @@ -352,8 +352,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) in_features = int(qweight.shape[0]) * 16 if ckpt_bits not in (4, 8): - print( - f"Warning: gptq_marlin requires bits=4/8, got bits={ckpt_bits} for {module_name}. Skipping." + logger.warning( + f"gptq_marlin requires bits=4/8, got bits={ckpt_bits} for {module_name}. Skipping." ) skipped += 1 continue @@ -365,17 +365,17 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # qzeros: [K/group, N/pack] (may be empty for some checkpoints) if getattr(qzeros, "numel", lambda: 1)() == 0: if ckpt_bits not in (2, 4, 8): - print( - f"Warning: qzeros is empty and cannot infer bits for {module_name}. " - f"Please ensure quantize_config.json contains bits (2/4/8). Skipping." + logger.warning( + f"qzeros is empty and cannot infer bits for {module_name}. " + "Please ensure quantize_config.json contains bits (2/4/8). Skipping." ) skipped += 1 continue pack_factor = 32 // int(ckpt_bits) else: if int(qzeros.shape[1]) <= 0 or out_features % int(qzeros.shape[1]) != 0: - print( - f"Warning: Cannot infer GPTQ pack_factor from qzeros for {module_name}: " + logger.warning( + f"Cannot infer GPTQ pack_factor from qzeros for {module_name}: " f"qzeros.shape={tuple(qzeros.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." ) skipped += 1 @@ -386,8 +386,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # awq: qweight: [K, N/pack], scales: [K/group, N] out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) if int(qweight.shape[1]) <= 0 or out_features % int(qweight.shape[1]) != 0: - print( - f"Warning: Cannot infer AWQ pack_factor from scales/qweight for {module_name}: " + logger.warning( + f"Cannot infer AWQ pack_factor from scales/qweight for {module_name}: " f"scales.shape={tuple(scales.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." ) skipped += 1 @@ -428,9 +428,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): ): group_size_norm = in_features if group_size == -1 else group_size if group_size_norm <= 0 or (in_features % group_size_norm) != 0: - print( - f"Warning: Invalid group_size={group_size} for {module_name} with in_features={in_features}. " - "Skipping." + logger.warning( + f"Invalid group_size={group_size} for {module_name} with in_features={in_features}. Skipping." ) skipped += 1 continue @@ -443,7 +442,7 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): device=qweight.device, ) except Exception as e: - print(f"Warning: Failed to create dummy qzeros for {module_name}: {e}. Skipping.") + logger.warning(f"Failed to create dummy qzeros for {module_name}: {e}. Skipping.") skipped += 1 continue @@ -455,9 +454,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): tp_dim = getattr(module, "tp_dim", None) if tp_size > 1: if tp_dim not in (0, 1): - print( - f"Warning: Unsupported tp_dim={tp_dim} for offline quantized weights. " - f"Skipping {module_name}." + logger.warning( + f"Unsupported tp_dim={tp_dim} for offline quantized weights. Skipping {module_name}." ) skipped += 1 continue @@ -465,8 +463,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Shard along output features (N) for column-parallel modules. 
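+            # With tp_dim == 0 every per-N tensor is sliced along N for this rank: qweight/scales on
+            # their N axis and qzeros on its packed N axis, which is why the partition width below
+            # must also be divisible by pack_factor.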
if tp_dim == 0: if out_features % tp_size != 0: - print( - f"Warning: out_features={out_features} not divisible by TP={tp_size} for {module_name}. " + logger.warning( + f"out_features={out_features} not divisible by TP={tp_size} for {module_name}. " "Skipping offline quant weights for this module." ) skipped += 1 @@ -475,8 +473,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): out_start = tp_rank * out_per out_end = out_start + out_per if out_per % pack_factor != 0: - print( - f"Warning: out_features_per_partition={out_per} not divisible by pack_factor={pack_factor} " + logger.warning( + f"out_features_per_partition={out_per} not divisible by pack_factor={pack_factor} " f"for {module_name}. Skipping." ) skipped += 1 @@ -490,7 +488,9 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Marlin qweight packs N by a factor (bits/2): N_packed = N * (bits/2) n_factor = int(ckpt_bits) // 2 if n_factor <= 0: - print(f"Warning: invalid gptq_marlin n_factor for bits={ckpt_bits} ({module_name}). Skipping.") + logger.warning( + f"invalid gptq_marlin n_factor for bits={ckpt_bits} ({module_name}). Skipping." + ) skipped += 1 continue qweight = qweight[:, (out_start * n_factor):(out_end * n_factor)] @@ -516,8 +516,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Shard along input features (K) for row-parallel modules. elif tp_dim == 1: if in_features % tp_size != 0: - print( - f"Warning: in_features={in_features} not divisible by TP={tp_size} for {module_name}. " + logger.warning( + f"in_features={in_features} not divisible by TP={tp_size} for {module_name}. " "Skipping offline quant weights for this module." ) skipped += 1 @@ -526,8 +526,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): in_start = tp_rank * in_per in_end = in_start + in_per if group_size <= 0 or (in_per % group_size) != 0 or (in_start % group_size) != 0: - print( - f"Warning: group_size={group_size} incompatible with TP sharding for {module_name} " + logger.warning( + f"group_size={group_size} incompatible with TP sharding for {module_name} " f"(in_per={in_per}, in_start={in_start}). Skipping." ) skipped += 1 @@ -539,8 +539,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): if is_gptq_marlin_ckpt: # Marlin qweight packs K in tiles of 16: K_packed = K / 16 if in_start % 16 != 0: - print( - f"Warning: gptq_marlin requires in_start divisible by 16, got in_start={in_start} " + logger.warning( + f"gptq_marlin requires in_start divisible by 16, got in_start={in_start} " f"for {module_name}. Skipping." ) skipped += 1 @@ -553,8 +553,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): group_size_norm = in_features if group_size == -1 else group_size expected_num_groups = in_features // group_size_norm if group_size_norm > 0 else 0 if expected_num_groups <= 0: - print( - f"Warning: invalid expected_num_groups={expected_num_groups} for {module_name}. Skipping." + logger.warning( + f"invalid expected_num_groups={expected_num_groups} for {module_name}. Skipping." ) skipped += 1 continue @@ -564,8 +564,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Legacy/alternate layout: [2*num_groups, N/2] scales = scales[(2 * g_start):(2 * g_end), :] else: - print( - f"Warning: unexpected gptq_marlin scales.shape[0]={int(scales.shape[0])} " + logger.warning( + f"unexpected gptq_marlin scales.shape[0]={int(scales.shape[0])} " f"(expected {expected_num_groups} or {2*expected_num_groups}) for {module_name}. Skipping." 
) skipped += 1 @@ -576,8 +576,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): else: # qweight: [K/pack, N] (packed on K) if in_start % pack_factor != 0: - print( - f"Warning: in_start={in_start} not divisible by pack_factor={pack_factor} " + logger.warning( + f"in_start={in_start} not divisible by pack_factor={pack_factor} " f"for {module_name}. Skipping." ) skipped += 1 @@ -632,15 +632,11 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): else: loaded_awq += 1 except Exception as e: - print(f"Failed to load offline quantized weights for {module_name}: {e}") - import traceback - traceback.print_exc() + logger.exception(f"Failed to load offline quantized weights for {module_name}: {e}") skipped += 1 except Exception as e: - print(f"Error loading offline quantized weights for {prefix}: {e}") - import traceback - traceback.print_exc() + logger.exception(f"Error loading offline quantized weights for {prefix}: {e}") skipped += 1 return loaded_gptq, loaded_awq, skipped diff --git a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py index e1b085e..870a860 100644 --- a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py @@ -89,6 +89,24 @@ def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, A packed = self._pack_int4_to_int8(q) return packed, {"scales": scales} + def quantize_weight_for_kernel( + self, + weight: torch.Tensor, + *, + device: torch.device | None = None, + **_: Any, + ) -> tuple[torch.Tensor, Any]: + """Quantize+pack bf16 weight for kernel consumption. + + Returns: + (packed_int8 [N, ceil(K/2)], scales_fp32 [N]) + """ + packed, meta = self.quantize(weight) + if device is not None: + packed = packed.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return packed, meta["scales"] + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: original_k = int(kwargs.get("original_in_features", 0)) if original_k <= 0: diff --git a/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml deleted file mode 100644 index bae9875..0000000 --- a/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W2, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 2048 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W2) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_w2_bf16kv" - save_results: true - use_tqdm: true From 23d377a9624b8600ac3a8486a46b2f7c6e9c8b77 Mon Sep 17 00:00:00 
2001 From: luozixin2 Date: Sat, 24 Jan 2026 09:17:01 +0000 Subject: [PATCH 05/10] =?UTF-8?q?fix:=20=E4=BF=AE=E6=AD=A3=20bench=20?= =?UTF-8?q?=E4=B8=AD=20prefill/decode=20=E5=90=9E=E5=90=90=E9=87=8F?= =?UTF-8?q?=E7=9A=84=E5=B9=B3=E5=9D=87=E5=80=BC=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 将最后总结从最后一步的瞬时吞吐改为真正的平均值(总token/总时间) - 新增 ms/step 统计信息,便于分析性能 - 修复了之前只显示最后一步瞬时值而非平均值的问题 --- diffulex/engine/tp_worker.py | 63 ++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/diffulex/engine/tp_worker.py b/diffulex/engine/tp_worker.py index 0f46edf..6b6df33 100755 --- a/diffulex/engine/tp_worker.py +++ b/diffulex/engine/tp_worker.py @@ -102,21 +102,40 @@ def generate( sid = self.add_request(prompt, sp) seqid_to_idx[sid] = idx outputs = [None] * len(prompts) - prefill_throughput = decode_throughput = 0. + # Track per-step instantaneous throughput for display, and + # token/time totals for correct average throughput reporting. + last_prefill_throughput = 0.0 + last_decode_throughput = 0.0 + prefill_total_tokens = 0 + decode_total_tokens = 0 + prefill_total_time = 0.0 + decode_total_time = 0.0 + prefill_steps = 0 + decode_steps = 0 n_steps = 0 n_diff_steps = [-1] * len(prompts) while not self.is_finished(): - t = perf_counter() n_steps += 1 + t = perf_counter() output, num_tokens, is_prefill, cur_n_diff_steps, _ = self.step() + dt = perf_counter() - t + + # Accumulate totals to compute average throughput correctly. + if is_prefill: + prefill_steps += 1 + prefill_total_tokens += int(num_tokens) + prefill_total_time += float(dt) + last_prefill_throughput = (num_tokens / dt) if dt > 0 else 0.0 + else: + decode_steps += 1 + decode_total_tokens += int(num_tokens) + decode_total_time += float(dt) + last_decode_throughput = (num_tokens / dt) if dt > 0 else 0.0 + if use_tqdm: - if is_prefill: - prefill_throughput = num_tokens / (perf_counter() - t) - else: - decode_throughput = num_tokens / (perf_counter() - t) pbar.set_postfix({ - "Prefill": f"{int(prefill_throughput)}tok/s", - "Decode": f"{int(decode_throughput)}tok/s", + "Prefill": f"{int(last_prefill_throughput)}tok/s", + "Decode": f"{int(last_decode_throughput)}tok/s", }) if cur_n_diff_steps: for seq_id, n_step in cur_n_diff_steps.items(): @@ -128,9 +147,33 @@ def generate( if use_tqdm: pbar.update(1) + avg_prefill_throughput = ( + prefill_total_tokens / prefill_total_time if prefill_total_time > 0 else 0.0 + ) + avg_decode_throughput = ( + decode_total_tokens / decode_total_time if decode_total_time > 0 else 0.0 + ) + avg_prefill_step_ms = ( + (prefill_total_time / prefill_steps) * 1000.0 if prefill_steps > 0 else 0.0 + ) + avg_decode_step_ms = ( + (decode_total_time / decode_steps) * 1000.0 if decode_steps > 0 else 0.0 + ) logger.info( - f"Finished in {n_steps} steps, prefill throughput: {prefill_throughput:.2f} tok/s, " - f"decode throughput: {decode_throughput:.2f} tok/s" + "Finished in %d steps (prefill=%d, decode=%d). " + "Prefill: %d tok in %.2fs (avg %.2f tok/s, %.2f ms/step). 
" + "Decode: %d tok in %.2fs (avg %.2f tok/s, %.2f ms/step).", + n_steps, + prefill_steps, + decode_steps, + prefill_total_tokens, + prefill_total_time, + avg_prefill_throughput, + avg_prefill_step_ms, + decode_total_tokens, + decode_total_time, + avg_decode_throughput, + avg_decode_step_ms, ) # Ensure all outputs are present assert all(toks is not None for toks in outputs), "Some sequences did not produce outputs" From 896b8dfe9f065208c305176fba06cbd32eba1c6f Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 25 Jan 2026 07:19:47 +0000 Subject: [PATCH 06/10] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E9=87=8F?= =?UTF-8?q?=E5=8C=96=20linear=20fast=20path=20=E5=B9=B6=E7=A7=BB=E9=99=A4?= =?UTF-8?q?=20profiler=20=E6=A0=87=E6=B3=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 量化 linear:去 kwargs/pop/重复可用性检查,缓存 out_features 与必要中间张量 - 直连 vLLM CUDA ops(W8A8/GPTQ/AWQ/Marlin 等)以降低 Python glue 开销 - load-time 处理 qweight/scales 的布局与 contiguous,避免 forward 里重复处理 - 移除 linear.py 中 profiler record 标注,保持代码简洁 - 补充 trace/profile 辅助分析脚本与相关测试 --- diffulex/engine/tp_worker.py | 17 +- diffulex/layer/linear.py | 206 +++- diffulex/sampler/dream.py | 8 +- diffulex/sampler/fast_dllm_v2.py | 16 +- diffulex/sampler/llada.py | 7 +- diffulex/sampler/sdar.py | 16 +- diffulex/strategy/d2f/engine/scheduler.py | 11 +- diffulex/strategy/d2f/engine/sequence.py | 4 +- .../strategies/linear_awq_marlin_w4a16.py | 109 +- .../strategies/linear_awq_w4a16.py | 54 +- .../strategies/linear_gptq_marlin_w4a16.py | 142 ++- .../strategies/linear_gptq_w4a16.py | 101 +- .../strategies/linear_int8_w8a8.py | 64 +- .../strategies/linear_marlin_int8_w8a16.py | 209 ++-- .../python/dllm_flash_attn_kernels.py | 1000 +++-------------- .../dllm_flash_attn_prefill_tilelang.py | 250 +++++ .../python/paged_attn_decode_triton.py | 661 +++++++++++ profile/analyze_trace_bottlenecks.py | 298 +++++ profile/analyze_trace_cpu_ops.py | 149 +++ profile/analyze_trace_gemm_shapes.py | 309 +++++ .../kernel/test_paged_attn_decode_triton.py | 240 ++++ 21 files changed, 2724 insertions(+), 1147 deletions(-) create mode 100644 diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py create mode 100644 diffulex_kernel/python/paged_attn_decode_triton.py create mode 100644 profile/analyze_trace_bottlenecks.py create mode 100644 profile/analyze_trace_cpu_ops.py create mode 100644 profile/analyze_trace_gemm_shapes.py create mode 100644 test/python/kernel/test_paged_attn_decode_triton.py diff --git a/diffulex/engine/tp_worker.py b/diffulex/engine/tp_worker.py index 6b6df33..ba65d67 100755 --- a/diffulex/engine/tp_worker.py +++ b/diffulex/engine/tp_worker.py @@ -102,10 +102,7 @@ def generate( sid = self.add_request(prompt, sp) seqid_to_idx[sid] = idx outputs = [None] * len(prompts) - # Track per-step instantaneous throughput for display, and - # token/time totals for correct average throughput reporting. - last_prefill_throughput = 0.0 - last_decode_throughput = 0.0 + # Track token/time totals for correct average throughput reporting. 
prefill_total_tokens = 0 decode_total_tokens = 0 prefill_total_time = 0.0 @@ -125,17 +122,21 @@ def generate( prefill_steps += 1 prefill_total_tokens += int(num_tokens) prefill_total_time += float(dt) - last_prefill_throughput = (num_tokens / dt) if dt > 0 else 0.0 else: decode_steps += 1 decode_total_tokens += int(num_tokens) decode_total_time += float(dt) - last_decode_throughput = (num_tokens / dt) if dt > 0 else 0.0 if use_tqdm: + avg_prefill_throughput = ( + prefill_total_tokens / prefill_total_time if prefill_total_time > 0 else 0.0 + ) + avg_decode_throughput = ( + decode_total_tokens / decode_total_time if decode_total_time > 0 else 0.0 + ) pbar.set_postfix({ - "Prefill": f"{int(last_prefill_throughput)}tok/s", - "Decode": f"{int(last_decode_throughput)}tok/s", + "Prefill(avg)": f"{int(avg_prefill_throughput)}tok/s", + "Decode(avg)": f"{int(avg_decode_throughput)}tok/s", }) if cur_n_diff_steps: for seq_id, n_step in cur_n_diff_steps.items(): diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index f26566d..e3581e9 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -78,6 +78,9 @@ def __init__( super().__init__() self.input_size = input_size self.output_size = output_size + # Cache forward output features (avoid per-call inference). + # Subclasses with TP partitions should overwrite this after partition sizes are known. + self._forward_out_features: int = int(output_size) self.tp_dim = tp_dim self.quant_kind = (quant_kind or "other").strip().lower() or "other" self.tp_rank = dist.get_rank() @@ -86,6 +89,8 @@ def __init__( # NOTE: We keep these as buffers so they move with the module and do not appear as Parameters. self.register_buffer("quant_weight_int8", torch.empty(0, dtype=torch.int8), persistent=False) self.register_buffer("quant_scales", torch.empty(0, dtype=torch.bfloat16), persistent=False) + # Cache a 1xN view of scales to avoid per-call view/shape handling on hot paths. + self.register_buffer("quant_scales_1xn", torch.empty(0, dtype=torch.bfloat16), persistent=False) self.register_buffer("_weight_is_quantized", torch.tensor(False, dtype=torch.bool), persistent=False) # GPTQ/AWQ offline quantized weight storage (W4A16). @@ -243,6 +248,13 @@ def _infer_module_device() -> torch.device: if g_idx is not None and g_idx.device != module_device: g_idx = g_idx.to(device=module_device) + # Make packed tensors contiguous once at load-time (avoid per-call checks/copies). + qweight = qweight.contiguous() + qzeros = qzeros.contiguous() + scales = scales.contiguous() + if g_idx is not None: + g_idx = g_idx.contiguous() + # group_size == -1 means channelwise in some ecosystems; vLLM normalizes -1 to K. group_size_norm = in_features if group_size == -1 else group_size if group_size_norm <= 0 or (in_features % group_size_norm != 0): @@ -458,8 +470,8 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: # g_idx (act-order) handling: marlin expects sorted g_idx + sort indices; otherwise empty. 
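+        # A non-empty g_idx means the checkpoint used act-order (desc_act=True): input channels (K)
+        # were quantized in an activation-sorted order, so Marlin needs the sorted g_idx plus the
+        # sort indices to apply the matching permutation inside the GEMM.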
if self.gptq_g_idx.numel() > 0: g_idx_sorted, g_idx_sort_indices = marlin_sort_g_idx(self.gptq_g_idx.to(device=device, dtype=torch.int32)) - self.gptq_marlin_g_idx = g_idx_sorted - self.gptq_marlin_g_idx_sort_indices = g_idx_sort_indices + self.gptq_marlin_g_idx = g_idx_sorted.contiguous() + self.gptq_marlin_g_idx_sort_indices = g_idx_sort_indices.contiguous() else: self.gptq_marlin_g_idx = marlin_make_empty_g_idx(device) self.gptq_marlin_g_idx_sort_indices = marlin_make_empty_g_idx(device) @@ -476,7 +488,7 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: size_n=out_features, num_bits=weight_bits, is_a_8bit=False, - ) + ).contiguous() # Permute scales to marlin format. self.gptq_marlin_scales = marlin_permute_scales( @@ -485,7 +497,7 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: size_n=out_features, group_size=group_size, is_a_8bit=False, - ) + ).contiguous() # GPTQ Marlin only supports symmetric weights (no runtime zero-points). # Use empty zp to keep has_zp=False in the kernel. @@ -542,30 +554,30 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: # Repack qweight to marlin format. self.awq_marlin_qweight = ops.awq_marlin_repack( - self.awq_qweight, + self.awq_qweight.contiguous(), size_k=in_features, size_n=out_features, num_bits=weight_bits, is_a_8bit=False, - ) + ).contiguous() # Permute scales to marlin format. self.awq_marlin_scales = marlin_permute_scales( - self.awq_scales, + self.awq_scales.contiguous(), size_k=in_features, size_n=out_features, group_size=group_size, is_a_8bit=False, - ) + ).contiguous() # Convert zero-points to marlin format. self.awq_marlin_zp = awq_to_marlin_zero_points( - self.awq_qzeros, + self.awq_qzeros.contiguous(), size_k=num_groups, size_n=out_features, num_bits=weight_bits, is_a_8bit=False, - ) + ).contiguous() self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) @@ -598,19 +610,39 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to except Exception: strategy = None scale_dtype = torch.bfloat16 + force_weight_contig = True if strategy is not None: weight_format = getattr(strategy, "linear_weight_format", None) act_format = getattr(strategy, "linear_act_format", None) # FP8 W8A16 uses float32 scales if weight_format in ("fp8_e4m3", "fp8_e5m2") and act_format == "bf16": scale_dtype = torch.float32 - # FP8 W8A8 and int8 W8A8 use float16 scales - elif act_format in ("int8", "fp8_e4m3", "fp8_e5m2"): + # W8A8 int8 uses float32 [1, N] weight scales in vLLM cutlass_scaled_mm path. + elif weight_format == "int8" and act_format == "int8": + scale_dtype = torch.float32 + # vLLM CUTLASS scaled_mm expects int8 weight in KxN with stride(0)==1, + # which is typically produced as a transpose-view (non-contiguous). + # Do NOT force contiguous here; just avoid per-call conversions. + force_weight_contig = False + # FP8 W8A8 keeps float32 scales; also keep KxN transpose-view layout. + elif act_format in ("fp8_e4m3", "fp8_e5m2"): + scale_dtype = torch.float32 + force_weight_contig = False + # Other int8/int4 mixed paths use float16 scales by default. + elif act_format == "int8": scale_dtype = torch.float16 if quant_scales.dtype != scale_dtype: quant_scales = quant_scales.to(dtype=scale_dtype) + # Make sure scales are contiguous once at load-time. + # NOTE: Some kernels require specific non-contiguous weight layouts (e.g., W8A8 KxN with stride(0)==1). 
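+        # (That layout is what cutlass_scaled_mm consumes: a column-major [K, N] view, usually a
+        # transpose of the row-major [N, K] int8 weight, so calling .contiguous() on it would
+        # silently break the expected strides.)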
+ # We avoid per-call `is_contiguous/contiguous` checks while preserving required layouts. + if force_weight_contig: + quant_weight_int8 = quant_weight_int8.contiguous() + quant_scales = quant_scales.contiguous() self.quant_weight_int8 = quant_weight_int8 self.quant_scales = quant_scales + # 1xN view for fused kernels expecting 2D scales. + self.quant_scales_1xn = quant_scales if quant_scales.dim() == 2 else quant_scales.view(1, -1) self._weight_is_quantized.fill_(True) def _maybe_promote_weight_to_quantized_at_runtime( @@ -738,13 +770,13 @@ def _infer_gptq_weight_bits(self, *, in_features: int) -> int: raise RuntimeError(f"GPTQ bits 推断失败:pack_factor={pack_factor} 不满足 32%pack_factor==0") return 32 // pack_factor - def _maybe_int4_original_in_features_kwargs(self, strategy, x: torch.Tensor) -> dict: + def _maybe_int4_original_in_features_kwargs(self, strategy, x: torch.Tensor) -> Optional[dict]: """Some int4 kernels need original K (before packing).""" if strategy is None: - return {} + return None if getattr(strategy, "linear_weight_format", None) == "int4": return {"original_in_features": x.shape[1]} - return {} + return None def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: """Build kwargs for offline GPTQ/AWQ (including Marlin variants).""" @@ -830,10 +862,90 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if self.has_offline_quantized_weight(): if strategy is None: raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + weight_format = getattr(strategy, "linear_weight_format", None) + out_features, in_features, group_size = self._offline_meta() + + # Avoid per-call kwargs dict construction on hot paths. + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + return strategy.linear_forward( + x, + None, # weight not used for offline quantized weights + bias, + quant_kind=self.quant_kind, + gptq_qweight=self.gptq_qweight, + gptq_qzeros=self.gptq_qzeros, + gptq_scales=self.gptq_scales, + gptq_g_idx=self.gptq_g_idx, + weight_bits=bits, + use_v2_format=False, + out_features=out_features, + in_features=in_features, + group_size=group_size, + ) + + if weight_format == "awq": + # AWQ is 4-bit only in vLLM; bits stored in _offline_quant_bits. 
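+                # pack_factor (32 // bits) is forwarded for signature compatibility; the AWQ strategy
+                # itself derives the output width from out_features / scales rather than from it.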
+ bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 4 + pack_factor = 32 // max(1, bits) + return strategy.linear_forward( + x, + None, + bias, + quant_kind=self.quant_kind, + awq_qweight=self.awq_qweight, + awq_qzeros=self.awq_qzeros, + awq_scales=self.awq_scales, + pack_factor=pack_factor, + out_features=out_features, + in_features=in_features, + group_size=group_size, + ) + + if weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + return strategy.linear_forward( + x, + None, + bias, + quant_kind=self.quant_kind, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=self.gptq_marlin_g_idx, + g_idx_sort_indices=self.gptq_marlin_g_idx_sort_indices, + workspace=self.gptq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + weight_bits=bits, + tp_dim=self.tp_dim, + ) + + if weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + return strategy.linear_forward( + x, + None, + bias, + quant_kind=self.quant_kind, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + zp=self.awq_marlin_zp, + workspace=self.awq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + tp_dim=self.tp_dim, + ) + + # Fallback: compatibility for any remaining strategies. kwargs = self._build_offline_forward_kwargs(x, strategy) return strategy.linear_forward( x, - None, # weight not used for offline quantized weights + None, bias, quant_kind=self.quant_kind, **kwargs, @@ -842,14 +954,65 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if self.has_quantized_weight(): if strategy is None: raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - kwargs = {"quant_scales": self.quant_scales} - kwargs.update(self._maybe_int4_original_in_features_kwargs(strategy, x)) + # Hot path: avoid per-call dict construction when possible. + extra_kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) + # W8A16(AllSpark) expects scales in 1xN layout and needs explicit N. + if getattr(strategy, "name", "") == "linear_int8_w8a16": + if extra_kwargs: + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + **extra_kwargs, + ) + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + ) + + # W8A8 expects scales in 1xN layout and is sensitive to weight layout (KxN stride0==1). 
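+            # quant_scales_1xn is the cached [1, N] view of quant_scales prepared in
+            # set_quantized_weight, so the hot path never reshapes scales per call.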
+ if getattr(strategy, "name", "") == "linear_int8_w8a8": + if extra_kwargs: + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + **extra_kwargs, + ) + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + ) + + if extra_kwargs: + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales, + **extra_kwargs, + ) return strategy.linear_forward( x, self.quant_weight_int8, bias, quant_kind=self.quant_kind, - **kwargs, + quant_scales=self.quant_scales, ) if strategy is None: @@ -862,7 +1025,9 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if weight is None: raise RuntimeError("Strategy is configured but weight is missing (expected bf16 weight).") kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) - return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) + if kwargs: + return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) + return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind) def forward(self, x: torch.Tensor) -> torch.Tensor: raise NotImplementedError @@ -915,6 +1080,7 @@ def __init__( LinearBase.__init__(self, input_size, output_size, 0, quant_kind) self.input_size_per_partition = input_size self.output_size_per_partition = divide(output_size, self.tp_size) + self._forward_out_features = int(self.output_size_per_partition) self.weight = nn.Parameter(torch.empty(self.output_size_per_partition, self.input_size)) self.weight.weight_loader = self.weight_loader diff --git a/diffulex/sampler/dream.py b/diffulex/sampler/dream.py index 9f06340..1ff85c6 100644 --- a/diffulex/sampler/dream.py +++ b/diffulex/sampler/dream.py @@ -56,9 +56,11 @@ def forward(self, logits: torch.Tensor, temperatures: torch.Tensor, high_conf_indices = torch.where(initial_confidence > block.accept_threshold)[0] accepted_ids = high_conf_indices - true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist()] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + # Avoid calling `.tolist()` on CUDA tensors directly (can trigger many per-element DtoH syncs). 
+ accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/sampler/fast_dllm_v2.py b/diffulex/sampler/fast_dllm_v2.py index ec323b5..5726655 100644 --- a/diffulex/sampler/fast_dllm_v2.py +++ b/diffulex/sampler/fast_dllm_v2.py @@ -59,19 +59,15 @@ def forward(self, seqs: list[SequenceBase], logits: torch.Tensor, temperatures: if len(high_conf_indices) == 0: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) + accepted_ids = max_prob_idx.view(1) else: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.unique(torch.cat([ - high_conf_indices, - torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) - ])) + accepted_ids = torch.unique(torch.cat([high_conf_indices, max_prob_idx.view(1)])) - true_local_ids_sub_map[str(block_id)] = [ - block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist() - ] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/sampler/llada.py b/diffulex/sampler/llada.py index 5202fa1..fd11f44 100644 --- a/diffulex/sampler/llada.py +++ b/diffulex/sampler/llada.py @@ -52,9 +52,10 @@ def forward(self, logits: torch.Tensor, temperatures: torch.Tensor, high_conf_indices = torch.where(initial_confidence > block.accept_threshold)[0] accepted_ids = high_conf_indices - true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist()] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/sampler/sdar.py b/diffulex/sampler/sdar.py index 4eeb471..8fc3896 100644 --- a/diffulex/sampler/sdar.py +++ b/diffulex/sampler/sdar.py @@ -59,19 +59,15 @@ def forward(self, seqs: list[SequenceBase], logits: torch.Tensor, temperatures: if len(high_conf_indices) == 0: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) + accepted_ids = max_prob_idx.view(1) else: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.unique(torch.cat([ - high_conf_indices, - torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) - ])) + accepted_ids = torch.unique(torch.cat([high_conf_indices, max_prob_idx.view(1)])) - 
true_local_ids_sub_map[str(block_id)] = [ - block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist() - ] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/strategy/d2f/engine/scheduler.py b/diffulex/strategy/d2f/engine/scheduler.py index a4b8f29..d362dda 100644 --- a/diffulex/strategy/d2f/engine/scheduler.py +++ b/diffulex/strategy/d2f/engine/scheduler.py @@ -5,6 +5,8 @@ from diffulex.engine.sequence import SequenceStatus from .sequence import D2FSequence +import torch + @AutoScheduler.register("d2f", is_default=True) class D2FScheduler(SchedulerBase): @@ -104,12 +106,17 @@ def postprocess( continue diffusion_block = seq.diffusion_blocks[int(block_id)] sampled_tokens = sampled_tokens_map.get(block_id, []) + # `sampled_tokens` may be a CUDA Tensor (legacy behavior) or list[int]. + # Converting per-token via `.item()` causes massive DtoH sync overhead. + # Convert once per block. + if isinstance(sampled_tokens, torch.Tensor): + sampled_tokens = sampled_tokens.tolist() true_local_ids = true_ids_map.get(block_id, []) for true_local_id, accepted_id in zip(true_local_ids, accepted_ids): - token = sampled_tokens[accepted_id] + token = int(sampled_tokens[accepted_id]) diffusion_block.modify_token(true_local_id, token) if ( - (not seq.ignore_eos and token.item() == self.eos) + (not seq.ignore_eos and token == self.eos) or seq.num_completion_tokens >= seq.max_tokens ): seq.meet_eos = True diff --git a/diffulex/strategy/d2f/engine/sequence.py b/diffulex/strategy/d2f/engine/sequence.py index db22bc8..7532ea8 100644 --- a/diffulex/strategy/d2f/engine/sequence.py +++ b/diffulex/strategy/d2f/engine/sequence.py @@ -117,7 +117,9 @@ def modify_token(self, local_token_id: int, modified_to: int) -> None: raise RuntimeError("Diffusion block is not attached to a sequence.") target_id = local_token_id + self.global_start_id assert self.seq.token_ids[target_id] == self.mask_token_id - self.seq.token_ids[target_id] = modified_to.item() # type: ignore[assignment] + # Hot path: avoid per-token CUDA -> CPU sync via Tensor.item(). + # `modified_to` should be a python int (or at least int-castable). 
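+        # int() still accepts a 0-d tensor as a fallback, but that re-introduces a device sync if the
+        # tensor lives on the GPU; the scheduler now converts sampled tokens to python ints once per block.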
+ self.seq.token_ids[target_id] = int(modified_to) # type: ignore[assignment] self.seq.new_tokens += 1 diff --git a/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py index be9389f..bb19518 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py @@ -21,12 +21,14 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore apply_awq_marlin_linear, marlin_make_empty_g_idx, + should_use_atomic_add_reduce, marlin_permute_bias, ) from vllm.scalar_type import scalar_types # type: ignore except Exception: # pragma: no cover apply_awq_marlin_linear = None # type: ignore marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore marlin_permute_bias = None # type: ignore scalar_types = None # type: ignore @@ -37,6 +39,13 @@ def _build_linear_awq_marlin_w4a16() -> LinearQuantizationStrategy: class LinearAWQMarlinW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._available: bool = bool(apply_awq_marlin_linear is not None and scalar_types is not None) + self._empty_cache: dict[int, torch.Tensor] = {} + self._bias_cache: dict[tuple[int, int], torch.Tensor] = {} + self._atomic_add_cache: dict[tuple[int, int, int, int, int], bool] = {} + @property def name(self) -> str: return "linear_awq_marlin_w4a16" @@ -75,49 +84,83 @@ def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + workspace: Optional[torch.Tensor] = None, + in_features: int = 0, + out_features: int = 0, + group_size: int = 128, + tp_dim: Optional[int] = None, ) -> torch.Tensor: - _ = quant_kind, weight - if apply_awq_marlin_linear is None or scalar_types is None: + _ = quant_kind, weight, group_size, tp_dim + if not self._available or workspace is None: raise RuntimeError("awq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") + if in_features <= 0 or out_features <= 0: + raise RuntimeError("awq_marlin: missing in_features/out_features.") - qweight = kwargs.get("awq_marlin_qweight", None) - scales = kwargs.get("awq_marlin_scales", None) - zp = kwargs.get("awq_marlin_zp", None) - workspace = kwargs.get("awq_marlin_workspace", None) - in_features = int(kwargs.get("in_features", 0)) - out_features = int(kwargs.get("out_features", 0)) - - if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: - raise RuntimeError("awq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") - - # vLLM marlin kernels expect FP16 activations. - x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + device = x.device + dev_key = int(device.index) if device.type == "cuda" and device.index is not None else -1 - # AWQ marlin does not use g_idx. - empty = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + # AWQ marlin does not use g_idx/perm; pass empty tensors (cached). 
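+        # The empty g_idx tensor is cached per CUDA device index (dev_key) so repeated forwards on the
+        # same device reuse one allocation instead of calling marlin_make_empty_g_idx every step.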
+ empty = self._empty_cache.get(dev_key) + if empty is None: + empty = marlin_make_empty_g_idx(device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=device, dtype=torch.int32) + self._empty_cache[dev_key] = empty + # Cache permuted bias. marlin_bias = None if bias is not None: - marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias - - out = apply_awq_marlin_linear( - input=x_in, - weight=qweight, - weight_scale=scales, - weight_zp=zp, - g_idx=empty, - g_idx_sort_indices=empty, - workspace=workspace, - quant_type=scalar_types.uint4, - output_size_per_partition=out_features, - input_size_per_partition=in_features, - bias=marlin_bias, - input_dtype=None, + bkey = (dev_key, int(bias.data_ptr())) + marlin_bias = self._bias_cache.get(bkey) + if marlin_bias is None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + self._bias_cache[bkey] = marlin_bias + + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (int(out_features),) + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + dtype_id = 1 if reshaped_x.dtype == torch.bfloat16 else (2 if reshaped_x.dtype == torch.float16 else 0) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + akey = (dev_key, dtype_id, m, n, k) + cached = self._atomic_add_cache.get(akey) + if cached is None: + cached = bool( + should_use_atomic_add_reduce( + m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype + ) + ) + self._atomic_add_cache[akey] = cached + use_atomic_add = cached + + out = torch.ops._C.gptq_marlin_gemm( + reshaped_x, + None, + qweight, + marlin_bias, + scales, + None, + None, + zp, + empty, + empty, + workspace, + scalar_types.uint4.id, + m, + n, + k, + True, # is_k_full + use_atomic_add, + True, # use_fp32_reduce + False, # is_zp_float ) + out = out.reshape(out_shape) return out.to(dtype=x.dtype) if out.dtype != x.dtype else out diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 488176e..22295fa 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -30,6 +30,10 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm")) + @property def name(self) -> str: return "linear_awq_w4a16" @@ -73,47 +77,47 @@ def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + awq_qweight: Optional[torch.Tensor] = None, + awq_qzeros: Optional[torch.Tensor] = None, + awq_scales: Optional[torch.Tensor] = None, + pack_factor: int = 8, + out_features: Optional[int] = None, + in_features: Optional[int] = None, + group_size: int = 128, ) -> torch.Tensor: - _ = quant_kind, weight - if ops is None: + _ = quant_kind, weight, pack_factor, in_features, group_size + if not self._ops_available: raise RuntimeError( "vLLM is required for AWQ W4A16 (missing `vllm._custom_ops`). " "Please install/build vLLM with CUDA ops." 
) - - qweight = kwargs.get("awq_qweight", None) - qzeros = kwargs.get("awq_qzeros", None) - scales = kwargs.get("awq_scales", None) - + qweight = awq_qweight + qzeros = awq_qzeros + scales = awq_scales if qweight is None or qzeros is None or scales is None: + if weight is None: + raise RuntimeError("AWQ offline weights missing packed tensors and bf16 weight is not present.") return F.linear(x, weight, bias) - # Infer pack_factor from packed shapes to avoid hard-coding 4-bit. - # AWQ: qweight [K, N/pack], scales [K/group, N] - if scales.ndim != 2 or scales.shape[1] <= 0: - raise RuntimeError(f"Invalid AWQ scales shape: {tuple(scales.shape)}") - if qweight.shape[1] <= 0 or int(scales.shape[1]) % int(qweight.shape[1]) != 0: - raise RuntimeError( - f"Invalid AWQ packed shapes: qweight.shape={tuple(qweight.shape)}, " - f"scales.shape={tuple(scales.shape)}" - ) - pack_factor = int(scales.shape[1]) // int(qweight.shape[1]) # vLLM AWQ kernels expect FP16 activations. - x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x - qweight = qweight.to(device=x.device, dtype=torch.int32) - qzeros = qzeros.to(device=x.device, dtype=torch.int32) - scales = scales.to(device=x.device, dtype=torch.float16) + x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) - out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,) + # Use known out_features if provided (avoid per-call inference). + n = int(out_features) if out_features is not None else int(scales.shape[1]) + out_shape = x.shape[:-1] + (n,) reshaped_x = x_in.reshape(-1, x_in.shape[-1]) # Always use awq_gemm to avoid large temporary dequantized weight allocations. - out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor) + # vLLM API: awq_gemm(input, qweight, qzeros, scales, split_k_iters) + split_k_iters = 1 + if reshaped_x.is_contiguous() and qweight.is_contiguous() and qzeros.is_contiguous() and scales.is_contiguous(): + out = torch.ops._C.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) + else: + out = ops.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) if bias is not None: out.add_(bias.to(dtype=out.dtype)) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py index c544166..1425c85 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py @@ -27,6 +27,7 @@ apply_gptq_marlin_linear, marlin_is_k_full, marlin_make_empty_g_idx, + should_use_atomic_add_reduce, marlin_permute_bias, ) from vllm.scalar_type import scalar_types # type: ignore @@ -34,6 +35,7 @@ apply_gptq_marlin_linear = None # type: ignore marlin_is_k_full = None # type: ignore marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore marlin_permute_bias = None # type: ignore scalar_types = None # type: ignore @@ -44,6 +46,13 @@ def _build_linear_gptq_marlin_w4a16() -> LinearQuantizationStrategy: class LinearGPTQMarlinW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._available: bool = bool(apply_gptq_marlin_linear is not None and scalar_types is not None) + self._empty_cache: dict[int, torch.Tensor] = {} + self._bias_cache: dict[tuple[int, int], torch.Tensor] = {} + self._atomic_add_cache: dict[tuple[int, int, int, int, int], bool] = {} + @property def name(self) -> str: return "linear_gptq_marlin_w4a16" @@ -82,28 +91,28 @@ def dequantize(self, 
quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + g_idx: Optional[torch.Tensor] = None, + g_idx_sort_indices: Optional[torch.Tensor] = None, + workspace: Optional[torch.Tensor] = None, + in_features: int = 0, + out_features: int = 0, + group_size: int = 128, + weight_bits: int = 0, + tp_dim: Optional[int] = None, ) -> torch.Tensor: - _ = quant_kind, weight - if apply_gptq_marlin_linear is None or scalar_types is None: + _ = quant_kind, weight, group_size + if not self._available or workspace is None: raise RuntimeError("gptq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") - qweight = kwargs.get("gptq_marlin_qweight", None) - scales = kwargs.get("gptq_marlin_scales", None) - zp = kwargs.get("gptq_marlin_zp", None) - g_idx = kwargs.get("gptq_marlin_g_idx", None) - g_idx_sort_indices = kwargs.get("gptq_marlin_g_idx_sort_indices", None) - workspace = kwargs.get("gptq_marlin_workspace", None) - in_features = int(kwargs.get("in_features", 0)) - out_features = int(kwargs.get("out_features", 0)) - weight_bits = int(kwargs.get("gptq_weight_bits", 0)) - - if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: - raise RuntimeError("gptq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") + if in_features <= 0 or out_features <= 0: + raise RuntimeError("gptq_marlin: missing in_features/out_features.") if weight_bits == 4: wtype = scalar_types.uint4b8 @@ -112,45 +121,84 @@ def linear_forward( else: raise RuntimeError(f"gptq_marlin: unsupported weight_bits={weight_bits} (expected 4 or 8)") - # Align with vLLM Marlin: accept bf16/fp16 activations directly. - x_in = x + device = x.device + dev_key = int(device.index) if device.type == "cuda" and device.index is not None else -1 - # g_idx can be empty (desc_act=False). Ensure correct dtype/device. - if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): - g_idx_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + # g_idx can be empty (desc_act=False). Prefer already-correct tensors; avoid per-call to(). + if g_idx is None or g_idx.numel() == 0: + empty = self._empty_cache.get(dev_key) + if empty is None: + empty = marlin_make_empty_g_idx(device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=device, dtype=torch.int32) + self._empty_cache[dev_key] = empty + g_idx_t = empty else: - g_idx_t = g_idx.to(device=x.device, dtype=torch.int32) - if g_idx_sort_indices is None or (isinstance(g_idx_sort_indices, torch.Tensor) and g_idx_sort_indices.numel() == 0): - g_idx_sort_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + g_idx_t = g_idx + if g_idx_sort_indices is None or g_idx_sort_indices.numel() == 0: + empty = self._empty_cache.get(dev_key) + if empty is None: + empty = marlin_make_empty_g_idx(device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=device, dtype=torch.int32) + self._empty_cache[dev_key] = empty + g_idx_sort_t = empty else: - g_idx_sort_t = g_idx_sort_indices.to(device=x.device, dtype=torch.int32) + g_idx_sort_t = g_idx_sort_indices # Determine whether K is full (needed by marlin kernel). 
Row-parallel layers set tp_dim=1 in Diffulex. - row_parallel = bool(kwargs.get("tp_dim", None) == 1) + row_parallel = bool(tp_dim == 1) has_g_idx = bool(g_idx_t.numel() > 0) - if marlin_is_k_full is None: - is_k_full = True - else: - is_k_full = marlin_is_k_full(has_g_idx, row_parallel) + is_k_full = True if marlin_is_k_full is None else marlin_is_k_full(has_g_idx, row_parallel) + # Cache permuted bias (Marlin expects permuted bias order). marlin_bias = None if bias is not None: - marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias - - out = apply_gptq_marlin_linear( - input=x_in, - weight=qweight, - weight_scale=scales, - weight_zp=zp, - g_idx=g_idx_t, - g_idx_sort_indices=g_idx_sort_t, - workspace=workspace, - wtype=wtype, - output_size_per_partition=out_features, - input_size_per_partition=in_features, - is_k_full=is_k_full, - bias=marlin_bias, - input_dtype=None, + bkey = (dev_key, int(bias.data_ptr())) + marlin_bias = self._bias_cache.get(bkey) + if marlin_bias is None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + self._bias_cache[bkey] = marlin_bias + + # Flatten like F.linear: [*,K] -> [M,K] + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (int(out_features),) + + # Cache heuristic for atomic-add reduction (depends on M/N/K, device, dtype). + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + dtype_id = 1 if reshaped_x.dtype == torch.bfloat16 else (2 if reshaped_x.dtype == torch.float16 else 0) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + akey = (dev_key, dtype_id, m, n, k) + cached = self._atomic_add_cache.get(akey) + if cached is None: + cached = bool( + should_use_atomic_add_reduce( + m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype + ) + ) + self._atomic_add_cache[akey] = cached + use_atomic_add = cached + + # Directly call the underlying CUDA op to minimize Python glue. 
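+        # The positional arguments follow this vLLM build's gptq_marlin_gemm schema; the trailing
+        # booleans are is_k_full, use_atomic_add, use_fp32_reduce and is_zp_float (labeled inline below).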
+ out = torch.ops._C.gptq_marlin_gemm( + reshaped_x, + None, + qweight, + marlin_bias, + scales, + None, + None, + zp, + g_idx_t, + g_idx_sort_t, + workspace, + wtype.id, + m, + n, + k, + is_k_full, + use_atomic_add, + True, # use_fp32_reduce + False, # is_zp_float ) - return out + return out.reshape(out_shape) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index 8fc67a5..f0a7a98 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -34,6 +34,10 @@ def _build_linear_gptq_w4a16() -> LinearQuantizationStrategy: class LinearGPTQW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_gemm")) + @property def name(self) -> str: return "linear_gptq_w4a16" @@ -77,67 +81,92 @@ def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + gptq_qweight: Optional[torch.Tensor] = None, + gptq_qzeros: Optional[torch.Tensor] = None, + gptq_scales: Optional[torch.Tensor] = None, + gptq_g_idx: Optional[torch.Tensor] = None, + weight_bits: int = 0, + use_v2_format: bool = False, + out_features: Optional[int] = None, + in_features: Optional[int] = None, + group_size: int = 128, ) -> torch.Tensor: - _ = quant_kind, weight - if ops is None: + _ = quant_kind, weight, in_features, group_size + if not self._ops_available: raise RuntimeError( "vLLM is required for GPTQ W4A16 (missing `vllm._custom_ops`). " "Please install/build vLLM with CUDA ops." ) - - qweight = kwargs.get("gptq_qweight", None) - qzeros = kwargs.get("gptq_qzeros", None) - scales = kwargs.get("gptq_scales", None) - g_idx = kwargs.get("gptq_g_idx", None) + qweight = gptq_qweight + qzeros = gptq_qzeros + scales = gptq_scales + g_idx = gptq_g_idx if qweight is None or qzeros is None or scales is None: + # correctness fallback (should not happen for offline GPTQ weights) + if weight is None: + raise RuntimeError("GPTQ offline weights missing packed tensors and bf16 weight is not present.") return F.linear(x, weight, bias) - use_v2_format = bool(kwargs.get("gptq_use_v2_format", False)) - - # Infer weight_bits from packed shapes to support GPTQ W2/W4/W8. - # qzeros: [K/group, N/pack_factor] and qweight: [K/pack_factor, N] - if qzeros.shape[1] <= 0 or qweight.shape[1] % int(qzeros.shape[1]) != 0: - raise RuntimeError( - f"Invalid GPTQ packed shapes: qweight.shape={tuple(qweight.shape)}, " - f"qzeros.shape={tuple(qzeros.shape)}" - ) - pack_factor = int(qweight.shape[1]) // int(qzeros.shape[1]) - if 32 % pack_factor != 0: - raise RuntimeError( - f"Unsupported GPTQ pack_factor={pack_factor} (requires 32%pack_factor==0). " - f"qweight.shape={tuple(qweight.shape)}, qzeros.shape={tuple(qzeros.shape)}" - ) - weight_bits = 32 // pack_factor - # vLLM GPTQ kernels expect FP16 activations. 
- x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x - qweight = qweight.to(device=x.device, dtype=torch.int32) - qzeros = qzeros.to(device=x.device, dtype=torch.int32) - scales = scales.to(device=x.device, dtype=torch.float16) + x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + + # ---- Fast path ---- + if ( + x_in.dim() == 2 + and x_in.is_contiguous() + and qweight.device == x.device + and qzeros.device == x.device + and scales.device == x.device + and qweight.dtype == torch.int32 + and qzeros.dtype == torch.int32 + and scales.dtype == torch.float16 + and qweight.is_contiguous() + and qzeros.is_contiguous() + and scales.is_contiguous() + and weight_bits > 0 + ): + if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) + else: + # Prefer already-correct dtype/device to avoid per-call copies. + g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) + n = int(out_features) if out_features is not None else int(qweight.shape[-1]) + output = torch.ops._C.gptq_gemm( + x_in, + qweight, + qzeros, + scales, + g_idx_t, + True, + bool(use_v2_format), + int(weight_bits), + ) + if bias is not None: + output.add_(bias.to(dtype=output.dtype)) + # Output is [M,N] + return output.to(dtype=x.dtype) if output.dtype != x.dtype else output + out_shape = x.shape[:-1] + (int(out_features) if out_features is not None else int(qweight.shape[-1]),) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) else: g_idx_t = g_idx.to(device=x.device, dtype=torch.int) - out_shape = x.shape[:-1] + (qweight.shape[-1],) - reshaped_x = x_in.reshape(-1, x_in.shape[-1]) - output = ops.gptq_gemm( reshaped_x, qweight, qzeros, scales, g_idx_t, - True, # use_exllama (vLLM shuffles weights into exllama-friendly layout) - use_v2_format, - weight_bits, + True, # use_exllama + bool(use_v2_format), + int(weight_bits) if weight_bits > 0 else 4, ) if bias is not None: output.add_(bias.to(dtype=output.dtype)) diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py index 52e92ed..ae62b64 100644 --- a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py @@ -22,14 +22,10 @@ from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -def _require_vllm_ops(): - try: - from vllm import _custom_ops as ops # type: ignore - except Exception as e: # pragma: no cover - raise RuntimeError( - "W8A8 需要 vLLM 的 CUDA 自定义算子(vllm._custom_ops)。" - ) from e - return ops +try: + from vllm import _custom_ops as _vllm_ops # type: ignore +except Exception: # pragma: no cover + _vllm_ops = None # type: ignore @register_linear_strategy(weight_dtype="int8", act_dtype="int8") @@ -42,6 +38,12 @@ def __init__(self) -> None: super().__init__() # Cache: id(weight) -> (qweight_int8 [N,K], w_scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} + self._ops_available: bool = bool( + _vllm_ops is not None + and hasattr(torch.ops, "_C") + and hasattr(torch.ops._C, "dynamic_scaled_int8_quant") + and hasattr(torch.ops._C, "cutlass_scaled_mm") + ) @property def name(self) -> str: @@ -109,18 +111,48 @@ def linear_forward( bias: Optional[torch.Tensor], *, quant_kind: 
str, - **kwargs: Any, + quant_scales: Optional[torch.Tensor] = None, + out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind - ops = _require_vllm_ops() + # ---- Fast path (decode hot path) ---- + # Preconditions are strict to minimize Python overhead. + # Expect: + # - qweight: int8 KxN with stride(0)==1 + # - w_scales: float32 [1,N], contiguous + if ( + self._ops_available + and _vllm_ops is not None + and x.dim() == 2 + and x.device.type == "cuda" + and x.dtype in (torch.bfloat16, torch.float16) + and x.is_contiguous() + and weight is not None + and weight.dtype == torch.int8 + and weight.device == x.device + and weight.stride(0) == 1 + and quant_scales is not None + and quant_scales.device == x.device + and quant_scales.dtype == torch.float32 + and quant_scales.dim() == 2 + and quant_scales.is_contiguous() + ): + m, _k = x.shape + # Optionally validate N to catch wrong metadata early. + if out_features is None or int(out_features) == int(quant_scales.shape[1]): + x_q = torch.empty((m, _k), device=x.device, dtype=torch.int8) + x_s = torch.empty((m, 1), device=x.device, dtype=torch.float32) + torch.ops._C.dynamic_scaled_int8_quant(x_q, x, x_s, None) + out = torch.empty((m, int(quant_scales.shape[1])), device=x.device, dtype=x.dtype) + torch.ops._C.cutlass_scaled_mm(out, x_q, weight, x_s, quant_scales, bias) + return out # If weight already quantized by LinearBase.load-time quantization. - quant_scales = kwargs.get("quant_scales", None) if weight is not None and weight.dtype == torch.int8 and quant_scales is not None: - # Expected: qweight is K×N int8, quant_scales is [1,N] fp32 - qweight = weight.to(device=x.device) - w_scales = quant_scales.to(device=x.device, dtype=torch.float32) + # Expected: qweight is K×N int8 (may be non-contiguous), quant_scales is [1,N] fp32 + qweight = weight + w_scales = quant_scales.to(dtype=torch.float32) else: wid = id(weight) cached = self._weight_cache.get(wid) @@ -138,8 +170,8 @@ def linear_forward( if x2.dtype not in (torch.bfloat16, torch.float16): x2 = x2.to(torch.bfloat16) # dynamic per-token int8 quant + fused GEMM_DQ - x_q, x_s, _ = ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) - y = ops.cutlass_scaled_mm( + x_q, x_s, _ = _vllm_ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) + y = _vllm_ops.cutlass_scaled_mm( x_q, qweight, scale_a=x_s, diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index 1cd8eb1..fe99904 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -39,7 +39,9 @@ def _allspark_is_available() -> bool: def _allspark_w8a16_gemm(*args, **kwargs): if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_w8a16_gemm"): raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_w8a16_gemm`.") - return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) + # Narrow profiler range to isolate Python wrapper overhead vs kernel time. + with torch.profiler.record_function("w8a16/allspark_w8a16_gemm(pybind)"): + return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) def _allspark_repack_weight(b_qweight_kn: torch.Tensor, scales_1xn: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: @@ -67,6 +69,11 @@ def __init__(self) -> None: super().__init__() # Cache for bf16 Parameters only (load-time quantized path bypasses this). 
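+        # Keyed by id(weight); each entry holds (qweight, scales) staged on the activation device.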
self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} + # Cache device info and thresholds to reduce per-call CPU overhead. + self._sm_info_cache: dict[int, tuple[int, int]] = {} + self._cublas_m_thr: int = self._cublas_m_threshold() + # One-time availability check (avoid calling `_allspark_is_available()` on every linear). + self._allspark_available: bool = _allspark_is_available() @property def name(self) -> str: @@ -166,8 +173,7 @@ def quantize_weight_for_kernel( block_n = 256 block_n = max(1, block_n) - use_allspark = _allspark_is_available() - if use_allspark: + if self._allspark_available: # AllSpark repack expects B in (K,N) contiguous layout. b_kn = torch.empty((k, n), device=weight.device, dtype=torch.uint8) # [K,N] for i in range(0, n, block_n): @@ -219,12 +225,22 @@ def quantize_act_for_kernel( return x, None def _get_sm_info(self, device: torch.device) -> tuple[int, int]: + # get_device_properties is relatively expensive on hot paths; cache per device index. + try: + idx = int(device.index) if device.index is not None else int(torch.cuda.current_device()) + except Exception: + idx = -1 + cached = self._sm_info_cache.get(idx) + if cached is not None: + return cached try: props = torch.cuda.get_device_properties(device) sm_count = int(getattr(props, "multi_processor_count", 0)) sm_version = int(props.major) * 10 + int(props.minor) + self._sm_info_cache[idx] = (sm_count, sm_version) return sm_count, sm_version except Exception: + self._sm_info_cache[idx] = (0, 0) return 0, 0 def _cublas_m_threshold(self) -> int: @@ -242,39 +258,85 @@ def linear_forward( bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + quant_scales: Optional[torch.Tensor] = None, + out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind + # ---- Fast path (decode hot path) ---- + # Goal: make Python-side overhead close to a single custom-op call (+ optional bias add). + # Preconditions are intentionally strict; otherwise we fall back to the fully-checked path. + # + # Notes: + # - We call `_vllm_ops.allspark_w8a16_gemm` directly to avoid extra Python wrapper overhead. + # - We require `quant_scales` already in 1xN contiguous layout (LinearBase provides this). + if ( + self._allspark_available + and _vllm_ops is not None + and x.dim() == 2 + and x.device.type == "cuda" + and x.dtype == torch.bfloat16 + and x.is_contiguous() + and weight is not None + and weight.dtype in (torch.uint8, torch.int8) + and weight.is_contiguous() + and quant_scales is not None + and quant_scales.dim() == 2 + and quant_scales.is_contiguous() + and out_features is not None + ): + # Minimal shape checks (avoid slow/branchy fallback). + m, k = x.shape + n_32, k_w = weight.shape + if k_w == k and (k & 15) == 0 and 0 < int(out_features) <= int(n_32): + sm_count, sm_version = self._get_sm_info(x.device) + y = _vllm_ops.allspark_w8a16_gemm( + x, + weight, + quant_scales, + None, # b_qzeros + int(out_features), + -1, # group_size (only supports -1) + sm_count, + sm_version, + self._cublas_m_thr, + False, # has_zp + True, # n32k16_reorder + ) + if bias is not None: + y = y + bias + return y + # Handle >2D like torch.nn.functional.linear: flatten then reshape back. 
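+        # e.g. x of shape [B, S, K] becomes x2 of shape [B*S, K] for the GEMM, and the
+        # output [B*S, N] is reshaped back to [B, S, N] at the end of this function.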
- orig_shape = x.shape - if x.dim() == 1: - x2 = x.unsqueeze(0) - elif x.dim() == 2: - x2 = x - else: - x2 = x.reshape(-1, x.shape[-1]) + with torch.profiler.record_function("w8a16/reshape_input"): + orig_shape = x.shape + if x.dim() == 1: + x2 = x.unsqueeze(0) + elif x.dim() == 2: + x2 = x + else: + x2 = x.reshape(-1, x.shape[-1]) # Load-time quantized module path: weight is uint8/int8 buffer and scales provided. - quant_scales = kwargs.pop("quant_scales", None) - if weight is not None and weight.dtype in (torch.uint8, torch.int8): - if quant_scales is None: - raise ValueError("quant_scales is required when weight is quantized") - qweight = weight - scales = quant_scales - else: - # Lazy cache for bf16 weights (not expected in steady-state, but keep for safety). - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None or cached[0].device != x2.device: - qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) - self._weight_cache[weight_id] = (qweight, scales) + with torch.profiler.record_function("w8a16/select_qweight_scales"): + if weight is not None and weight.dtype in (torch.uint8, torch.int8): + if quant_scales is None: + raise ValueError("quant_scales is required when weight is quantized") + qweight = weight + scales = quant_scales else: - qweight, scales = cached + # Lazy cache for bf16 weights (not expected in steady-state, but keep for safety). + weight_id = id(weight) + cached = self._weight_cache.get(weight_id) + if cached is None or cached[0].device != x2.device: + qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) + self._weight_cache[weight_id] = (qweight, scales) + else: + qweight, scales = cached # If fused kernel isn't available, fall back to BF16 only if original weight exists; # otherwise fail fast (do NOT dequantize a full matrix, which is memory-prohibitive). - if not _allspark_is_available(): + if not self._allspark_available: if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): return F.linear(x, weight, bias) raise RuntimeError( @@ -283,56 +345,75 @@ def linear_forward( ) # AllSpark kernel requires CUDA and contiguous inputs. - if x2.device.type != "cuda": - return self._fallback(x, weight, qweight, scales, bias) + with torch.profiler.record_function("w8a16/device_dtype_checks"): + if x2.device.type != "cuda": + return self._fallback(x, weight, qweight, scales, bias) - if x2.dtype != torch.bfloat16: - x2 = x2.to(dtype=torch.bfloat16) + if x2.dtype != torch.bfloat16: + x2 = x2.to(dtype=torch.bfloat16) # Shape checks: x2 [M,K], qweight [N_32align,K] - m, k = x2.shape - n_32, k_w = qweight.shape - if k_w != k: - return self._fallback(x, weight, qweight, scales, bias) - if k % 16 != 0: - return self._fallback(x, weight, qweight, scales, bias) + with torch.profiler.record_function("w8a16/shape_checks"): + m, k = x2.shape + n_32, k_w = qweight.shape + if k_w != k: + return self._fallback(x, weight, qweight, scales, bias) + if k % 16 != 0: + return self._fallback(x, weight, qweight, scales, bias) # Recover real N from module bias/metadata if available; default to n_32. # In Diffulex, LinearBase stores output_size; but strategy doesn't receive module. # So we infer N from bias if present else from scales length (can be N_32align). 
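+        # Prefer out_features when provided: bias/scales may reflect the 32-aligned (padded)
+        # N_32align rather than the real N, so inferring N from them is only a fallback.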
- n = int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32)) - if n <= 0 or n > n_32: - n = n_32 + with torch.profiler.record_function("w8a16/infer_n_and_sm"): + if out_features is not None: + n = int(out_features) + else: + # Backward compatible fallback. + n = int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32)) + if n <= 0 or n > n_32: + n = n_32 - sm_count, sm_version = self._get_sm_info(x2.device) - cublas_thr = self._cublas_m_threshold() + sm_count, sm_version = self._get_sm_info(x2.device) + cublas_thr = self._cublas_m_thr # vLLM allspark expects scales as 1xN (or equivalent contiguous view). - scales_1xn = scales.reshape(1, -1).contiguous() - y2 = _allspark_w8a16_gemm( - x2.contiguous(), - qweight.contiguous(), - scales_1xn, - None, # b_qzeros - n, - -1, # group_size (only supports -1) - sm_count, - sm_version, - cublas_thr, - False, # has_zp - True, # n32k16_reorder - ) - if bias is not None: - y2 = y2 + bias + # NOTE: reshape/view doesn't allocate; only materialize contiguous copies when needed. + with torch.profiler.record_function("w8a16/prepare_contiguous_and_scales"): + if not x2.is_contiguous(): + x2 = x2.contiguous() + # qweight/scales are made contiguous at load-time (`LinearBase.set_quantized_weight`) + # and by `quantize_weight_for_kernel` return values. + if scales.dim() == 2: + scales_1xn = scales + else: + scales_1xn = scales.view(1, -1) + + with torch.profiler.record_function("w8a16/call_fused_gemm"): + y2 = _allspark_w8a16_gemm( + x2, + qweight, + scales_1xn, + None, # b_qzeros + n, + -1, # group_size (only supports -1) + sm_count, + sm_version, + cublas_thr, + False, # has_zp + True, # n32k16_reorder + ) + if bias is not None: + y2 = y2 + bias # Reshape back - if x.dim() == 1: - y = y2.squeeze(0) - elif x.dim() == 2: - y = y2 - else: - y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) - return y + with torch.profiler.record_function("w8a16/reshape_output"): + if x.dim() == 1: + y = y2.squeeze(0) + elif x.dim() == 2: + y = y2 + else: + y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) + return y # NOTE: We intentionally do not provide a generic dequantize+F.linear fallback for reordered weights. # It materializes a full bf16 matrix and is prone to OOM on large models. 
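# --- Editorial sketch (not part of the diff) ---------------------------------
# A minimal, hypothetical pre-flight check mirroring the W8A16 fast-path
# preconditions documented above; the function name and the `n_pad` naming are
# illustrative only, and the real checks live inline in `linear_forward`.
import torch

def w8a16_fast_path_ok(x: torch.Tensor, qweight: torch.Tensor,
                       scales: torch.Tensor, out_features: int) -> bool:
    # x:       [M, K] bf16, contiguous, CUDA, with K a multiple of 16
    # qweight: [N_pad, K] uint8/int8 (n32k16-reordered), contiguous
    # scales:  [1, N_pad] contiguous; out_features is the real (unpadded) N
    if not (x.dim() == 2 and x.is_cuda and x.dtype == torch.bfloat16 and x.is_contiguous()):
        return False
    if qweight.dtype not in (torch.uint8, torch.int8) or not qweight.is_contiguous():
        return False
    if not (scales.dim() == 2 and scales.is_contiguous()):
        return False
    k = x.shape[1]
    n_pad, k_w = qweight.shape
    return k_w == k and k % 16 == 0 and 0 < out_features <= n_pad
# ------------------------------------------------------------------------------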
diff --git a/diffulex_kernel/python/dllm_flash_attn_kernels.py b/diffulex_kernel/python/dllm_flash_attn_kernels.py index 8877c49..1535ab0 100644 --- a/diffulex_kernel/python/dllm_flash_attn_kernels.py +++ b/diffulex_kernel/python/dllm_flash_attn_kernels.py @@ -1,887 +1,152 @@ -import os -import torch -import tilelang -import tilelang.language as T - -from flash_attn import flash_attn_varlen_func -from tilelang.autotuner import set_autotune_inputs - -from diffulex_kernel.python.auto_tuner import build_configs -from diffulex_kernel.python.kv_cache_kernels import load_kvcache -from diffulex.attention.metadata import AttnMetaDataBase, is_warming_up -from test.python.utils.checker import CHECK_FLASH_ATTN_PREFILL, CHECK_FLASH_ATTN_DECODE - - -# from tilelang.engine.callback import register_cuda_postproc_callback -# @register_cuda_postproc_callback -# def tilelang_callback_cuda_postproc(code, _): -# code = "// tilelang_callback_cuda_postproc: generated CUDA code by TileLang\n" + code -# print(code) -# return code - - -kernel_config = None -kernel_config_bf16_q_fp8_kv_decode = None - - -@tilelang.autotune(configs=build_configs()) -@tilelang.jit( - # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - } -) -def dllm_flash_attn_prefill_kernel( - NUM_SEQS: int, - NUM_GROUPS: int, - Q_LEN: int, - KV_LEN: int, - NUM_HEADS: int, - HEAD_DIM: int, - IS_BLOCK_ATTN: bool, - DIFFUSION_BLOCK_SIZE: int, - BLOCK_M: int = 64, - BLOCK_N: int = 64, - NUM_STAGES: int = 1, - NUM_THREADS: int = 128, -): - SCALE = (1.0 / HEAD_DIM)**0.5 * 1.44269504 # log2(e) - NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS - Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - DTYPE = "bfloat16" - ACCUM_DTYPE = "float" - - @T.prim_func - def kernel( - Q: T.Tensor(Q_SHAPE, DTYPE), - K: T.Tensor(KV_SHAPE, DTYPE), - V: T.Tensor(KV_SHAPE, DTYPE), - cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), - cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), - max_seqlen_q: T.int32, - O: T.Tensor(O_SHAPE, DTYPE), - ): - with T.Kernel(T.ceildiv(max_seqlen_q, BLOCK_M), NUM_HEADS, NUM_SEQS, threads=NUM_THREADS) as (bx, by, bz): - Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - - acc_score = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) - acc_score_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) - acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) - scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - q_block_idx = bx - seq_idx = bz - head_idx = by - kv_head_idx = head_idx // NUM_GROUPS - - q_start_idx = cu_seqlens_q[seq_idx] - kv_start_idx = cu_seqlens_k[seq_idx] - q_end_idx = cu_seqlens_q[seq_idx + 1] - kv_end_idx = cu_seqlens_k[seq_idx + 1] - - cur_q_seqlen = q_end_idx - q_start_idx - cur_kv_seqlen 
= kv_end_idx - kv_start_idx - - T.copy(Q[q_start_idx + q_block_idx * BLOCK_M : q_start_idx + (q_block_idx + 1) * BLOCK_M, head_idx, :], Q_shared) - - T.fill(acc_output, 0) - T.fill(acc_score, 0) - T.fill(log_sum, 0) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - - # The same boundary condition as naive causal mask - loop_range = ( - T.min(T.ceildiv(cur_q_seqlen + (q_block_idx + 1) * BLOCK_M, BLOCK_N), T.ceildiv(cur_kv_seqlen, BLOCK_N)) - if IS_BLOCK_ATTN else T.ceildiv(cur_kv_seqlen, BLOCK_N) - ) - for kv_block_idx in T.Pipelined(loop_range, num_stages=NUM_STAGES): - T.copy(K[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], K_shared) - - # Initialize acc_score with mask - if IS_BLOCK_ATTN: - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - num_diffusion_blocks = (q_block_idx * BLOCK_M + i) // DIFFUSION_BLOCK_SIZE + 1 - acc_score[i, j] = T.if_then_else( - (num_diffusion_blocks * DIFFUSION_BLOCK_SIZE <= kv_block_idx * BLOCK_N + j) or - (q_block_idx * BLOCK_M + i >= cur_q_seqlen or - kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), -1e9, 0 - ) - else: - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score[i, j] = T.if_then_else( - (q_block_idx * BLOCK_M + i >= cur_q_seqlen or - kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), -1e9, 0 - ) - - # Compute attention scores - T.gemm(Q_shared, K_shared, acc_score, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - # Compute online softmax - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score, scores_max, dim=1, clear=False) # T.reduce_max(acc_score, scores_max, dim=1, clear=True) # TODO: check if this is correct - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score[i, j] = T.exp2(acc_score[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score, acc_score_cast) - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - # Compute attention output - T.copy(V[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], V_shared) - T.gemm(acc_score_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] /= log_sum[i] - - T.copy(acc_output, O_shared) - for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): - if i + q_block_idx * BLOCK_M < cur_q_seqlen: - O[i + q_start_idx + q_block_idx * BLOCK_M, head_idx, d_idx] = O_shared[i, d_idx] - - return kernel - - -@tilelang.jit( - # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - # tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_ENABLE: True, - # tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_FORMATS: "txt,pdf" - } -) -def dllm_flash_attn_decode_kernel( - NUM_SEQS: int, - NUM_GROUPS: int, - NUM_PAGE_BLOCKS: int, - Q_LEN: int, - KV_LEN: int, - NUM_HEADS: int, - HEAD_DIM: int, - IS_BLOCK_ATTN: bool, - DIFFUSION_BLOCK_SIZE: int, - MAX_SEQ_NUM_BLOCKS: int, - PAGE_BLOCK_SIZE: int = 32, - BLOCK_M: int = 64, - 
BLOCK_N: int = 64, - NUM_STAGES: int = 1, - NUM_THREADS: int = 128, -): - SCALE = (1.0 / HEAD_DIM)**0.5 * 1.44269504 # log2(e) - NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS - Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - K_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - V_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - MAX_SEQ_NUM_BLOCKS = T.dynamic("MAX_SEQ_NUM_BLOCKS", 'int32') - BLOCK_TABLES_SHAPE = [NUM_SEQS, MAX_SEQ_NUM_BLOCKS] - DTYPE = "bfloat16" - ACCUM_DTYPE = "float32" - - @T.prim_func - def kernel( - Q: T.Tensor(Q_SHAPE, DTYPE), - K: T.Tensor(KV_SHAPE, DTYPE), - V: T.Tensor(KV_SHAPE, DTYPE), - K_Cache: T.Tensor(K_CACHE_SHAPE, DTYPE), - V_Cache: T.Tensor(V_CACHE_SHAPE, DTYPE), - block_tables: T.Tensor(BLOCK_TABLES_SHAPE, "int32"), - context_lens: T.Tensor(NUM_SEQS, "int32"), - cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), - cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), - max_seqlen_q: T.int32, - O: T.Tensor(O_SHAPE, DTYPE), - ): - with T.Kernel(NUM_SEQS, NUM_HEADS, threads=NUM_THREADS) as (bx, by): - Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_Cache_shared = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - V_Cache_shared = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - - acc_score_kv = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) - acc_score_kv_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) - acc_score_kvcache = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], ACCUM_DTYPE) - acc_score_kvcache_cast = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], DTYPE) - - acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) - scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - seq_idx = bx - head_idx = by - kv_head_idx = head_idx // NUM_GROUPS - - q_start_idx = cu_seqlens_q[seq_idx] - kv_start_idx = cu_seqlens_k[seq_idx] - q_end_idx = cu_seqlens_q[seq_idx + 1] - kv_end_idx = cu_seqlens_k[seq_idx + 1] - - cur_q_seqlen = q_end_idx - q_start_idx - cur_kv_seqlen = kv_end_idx - kv_start_idx - - cur_context_len = context_lens[seq_idx] - - T.copy(Q[q_start_idx : q_start_idx + BLOCK_M, head_idx, :], Q_shared) - - T.fill(acc_output, 0) - T.fill(acc_score_kv, 0) - T.fill(acc_score_kvcache, 0) - T.fill(log_sum, 0) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - - # ========================== - # Stage 1: KV Cache Attention (Context) - # ========================== - for page_block_idx_local in T.Pipelined(MAX_SEQ_NUM_BLOCKS, num_stages=NUM_STAGES): - page_block_idx_global = block_tables[seq_idx, page_block_idx_local] - - if page_block_idx_global >= 0: - T.copy(K_Cache[page_block_idx_global, :, kv_head_idx, :], K_Cache_shared) - - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.if_then_else( - (i >= cur_q_seqlen or - page_block_idx_local * PAGE_BLOCK_SIZE + j >= cur_context_len), -1e9, 0 - ) - - # Compute attention scores - T.gemm(Q_shared, K_Cache_shared, 
acc_score_kvcache, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - # Compute online softmax - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kvcache, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.exp2(acc_score_kvcache[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kvcache, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score_kvcache, acc_score_kvcache_cast) - - # Scale previous output accumulator - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - # Accumulate current V_cache contribution - T.copy(V_Cache[page_block_idx_global, :, kv_head_idx, :], V_Cache_shared) - T.gemm(acc_score_kvcache_cast, V_Cache_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - - # ========================== - # Stage 2: Fresh KV Attention (Self-Attn) - # ========================== - for idx in T.Pipelined(T.ceildiv(DIFFUSION_BLOCK_SIZE, BLOCK_N), num_stages=NUM_STAGES): - T.copy(K[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], K_shared) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.if_then_else(i >= cur_q_seqlen or j >= cur_kv_seqlen, -1e9, 0) - - T.gemm(Q_shared, K_shared, acc_score_kv, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kv, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.exp2(acc_score_kv[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kv, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score_kv, acc_score_kv_cast) - - # Scale previous output - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - T.copy(V[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], V_shared) - - # Accumulate current V contribution - T.gemm(acc_score_kv_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - - # ========================== - # Stage 3: Finalize - # ========================== - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] /= log_sum[i] +""" +Diffulex Flash-Attn kernel wrappers. - T.copy(acc_output, O_shared) - for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): - if i < cur_q_seqlen: - O[i + q_start_idx, head_idx, d_idx] = O_shared[i, d_idx] - - return kernel +Goals: +- Decode path should NOT require TileLang at import time. +- Prefill behavior remains unchanged (TileLang for block attention / flash-attn varlen otherwise), + but TileLang is imported lazily only when prefill is called. 
+""" +from __future__ import annotations -@tilelang.autotune(configs=build_configs()) -@tilelang.jit( - # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - } -) -def dllm_flash_attn_decode_kernel_bf16_q_fp8_kv( - NUM_SEQS: int, - NUM_GROUPS: int, - NUM_PAGE_BLOCKS: int, - Q_LEN: int, - KV_LEN: int, - NUM_HEADS: int, - HEAD_DIM: int, - IS_BLOCK_ATTN: bool, - DIFFUSION_BLOCK_SIZE: int, - MAX_SEQ_NUM_BLOCKS: int, - PAGE_BLOCK_SIZE: int = 32, - BLOCK_M: int = 64, - BLOCK_N: int = 64, - NUM_STAGES: int = 1, - NUM_THREADS: int = 128, -): - SCALE = (1.0 / HEAD_DIM)**0.5 * 1.44269504 # log2(e) - NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS - Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - K_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - V_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - MAX_SEQ_NUM_BLOCKS = T.dynamic("MAX_SEQ_NUM_BLOCKS", 'int32') - BLOCK_TABLES_SHAPE = [NUM_SEQS, MAX_SEQ_NUM_BLOCKS] - DTYPE = "bfloat16" - ACCUM_DTYPE = "float32" - FP8_DTYPE = "float8_e4m3fn" - - @T.prim_func - def kernel( - Q: T.Tensor(Q_SHAPE, DTYPE), - K: T.Tensor(KV_SHAPE, DTYPE), - V: T.Tensor(KV_SHAPE, DTYPE), - K_Cache: T.Tensor(K_CACHE_SHAPE, FP8_DTYPE), - V_Cache: T.Tensor(V_CACHE_SHAPE, FP8_DTYPE), - K_Scale: T.Tensor([NUM_KV_HEADS], "float32"), - V_Scale: T.Tensor([NUM_KV_HEADS], "float32"), - block_tables: T.Tensor(BLOCK_TABLES_SHAPE, "int32"), - context_lens: T.Tensor(NUM_SEQS, "int32"), - cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), - cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), - max_seqlen_q: T.int32, - O: T.Tensor(O_SHAPE, DTYPE), - ): - with T.Kernel(NUM_SEQS, NUM_HEADS, threads=NUM_THREADS) as (bx, by): - Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - - # KV cache shared staging buffers (BF16): - # HBM(FP8) -> T.copy (implicit cast) -> shared(BF16) -> GEMM - K_Cache_shared_bf16 = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - V_Cache_shared_bf16 = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - - acc_score_kv = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) - acc_score_kv_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) - acc_score_kvcache = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], ACCUM_DTYPE) - acc_score_kvcache_cast = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], DTYPE) - - acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) - scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - seq_idx = bx - head_idx = by - kv_head_idx = head_idx // NUM_GROUPS - - q_start_idx = cu_seqlens_q[seq_idx] - kv_start_idx = cu_seqlens_k[seq_idx] - q_end_idx = cu_seqlens_q[seq_idx + 1] - kv_end_idx = cu_seqlens_k[seq_idx + 1] - - cur_q_seqlen = q_end_idx - q_start_idx - 
cur_kv_seqlen = kv_end_idx - kv_start_idx - - cur_context_len = context_lens[seq_idx] - - T.copy(Q[q_start_idx : q_start_idx + BLOCK_M, head_idx, :], Q_shared) - - T.fill(acc_output, 0) - T.fill(acc_score_kv, 0) - T.fill(acc_score_kvcache, 0) - T.fill(log_sum, 0) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - - # ========================== - # Stage 1: KV Cache Attention (Context) - # ========================== - for page_block_idx_local in T.Pipelined(MAX_SEQ_NUM_BLOCKS, num_stages=NUM_STAGES): - page_block_idx_global = block_tables[seq_idx, page_block_idx_local] - if page_block_idx_global >= 0: - # Step 1: Load FP8 K_Cache, implicit cast to BF16 (vectorized path). - # K_Scale will be applied on scores (much cheaper than scaling K elementwise). - T.copy(K_Cache[page_block_idx_global, :, kv_head_idx, :], K_Cache_shared_bf16) - - # Initialize scores with mask, then GEMM accumulates into it (masked entries remain ~-1e9). - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.if_then_else( - (i >= cur_q_seqlen or page_block_idx_local * PAGE_BLOCK_SIZE + j >= cur_context_len), - -1e9, - 0, - ) - - # Compute attention scores - T.gemm(Q_shared, K_Cache_shared_bf16, acc_score_kvcache, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - # Apply per-head K scale on scores: (Q·(K*ks)) == (Q·K) * ks - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] *= K_Scale[kv_head_idx] - - # Compute online softmax - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kvcache, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.exp2(acc_score_kvcache[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kvcache, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - # Cast weights to BF16 for V GEMM, fuse per-head V scale here: - # (softmax * (V*vs)) == ((softmax*vs) · V) - # Use separate loop to avoid layout infer conflict - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache_cast[i, j] = (acc_score_kvcache[i, j] * V_Scale[kv_head_idx]).astype(T.bfloat16) - - # Scale previous output accumulator - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - # Step 2: Load FP8 V_Cache, implicit cast to BF16 (vectorized path). 
- T.copy(V_Cache[page_block_idx_global, :, kv_head_idx, :], V_Cache_shared_bf16) - - # Accumulate current V_cache contribution using BF16 V_Cache shared buffer - T.gemm(acc_score_kvcache_cast, V_Cache_shared_bf16, acc_output, policy=T.GemmWarpPolicy.FullRow) - - if page_block_idx_local == MAX_SEQ_NUM_BLOCKS - 1: - # ========================== - # Stage 2: Fresh KV Attention (Self-Attn) - # ========================== - for idx in T.Pipelined(T.ceildiv(DIFFUSION_BLOCK_SIZE, BLOCK_N), num_stages=NUM_STAGES): - T.copy(K[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], K_shared) +import os - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.if_then_else(i >= cur_q_seqlen or j >= cur_kv_seqlen, -1e9, 0) - - T.gemm(Q_shared, K_shared, acc_score_kv, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kv, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.exp2(acc_score_kv[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kv, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score_kv, acc_score_kv_cast) - - # Scale previous output - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - T.copy(V[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], V_shared) - - # Accumulate current V contribution - T.gemm(acc_score_kv_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - # ========================== - # Stage 3: Finalize - # ========================== - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] /= log_sum[i] +import torch +from flash_attn import flash_attn_varlen_func - T.copy(acc_output, O_shared) - for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): - if i < cur_q_seqlen: - O[i + q_start_idx, head_idx, d_idx] = O_shared[i, d_idx] - - return kernel +from diffulex.attention.metadata import AttnMetaDataBase +from diffulex_kernel.python.kv_cache_kernels import load_kvcache +from diffulex_kernel.python.paged_attn_decode_triton import paged_attn_decode_unified_triton -def _dllm_flash_attn_prefill_bf16( +def dllm_flash_attn_prefill( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - if attn_metadata.attn_type == "full_attention": - return flash_attn_varlen_func( - q, k, v, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - elif attn_metadata.attn_type == "block_attention": - if is_warming_up(): - global kernel_config - with set_autotune_inputs([ - q, k, v, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ]): - prefill_kernel = dllm_flash_attn_prefill_kernel( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size - ) - kernel_config = prefill_kernel.config - # CHECK_FLASH_ATTN_PREFILL( - # q, k, v, - # attn_metadata.cu_seqlens_q, - # attn_metadata.cu_seqlens_k, - # 
attn_metadata.max_seqlen_q, - # prefill_kernel, - # diffusion_block_size=attn_metadata.diffusion_block_size, - # is_block_attn=(attn_metadata.attn_type == "block_attention"), - # ) - return prefill_kernel( - q, k, v, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ) - else: - prefill_kernel = dllm_flash_attn_prefill_kernel( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size, - **kernel_config - ) - return prefill_kernel( - q, k, v, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ) - + """ + Prefill attention wrapper. -def _dllm_flash_attn_decode_bf16( + TileLang is imported lazily so decode-only usage does not depend on TileLang. + """ + from diffulex_kernel.python.dllm_flash_attn_prefill_tilelang import ( + dllm_flash_attn_prefill_tilelang, + ) + + return dllm_flash_attn_prefill_tilelang(q, k, v, scale, attn_metadata) + + +def _decode_varlen( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - if attn_metadata.decode_mode == "static": - # Use kernel_config from prefill if available, otherwise use empty dict - config_kwargs = kernel_config if kernel_config is not None else {} - decode_kernel = dllm_flash_attn_decode_kernel( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - k_cache.shape[0], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size, - attn_metadata.block_tables.shape[1], - attn_metadata.page_block_size, - **config_kwargs + """ + Varlen decode path: + - gather/dequant KV cache with Triton `load_kvcache` + - run `flash_attn_varlen_func` + """ + do_profile = os.getenv("DIFFULEX_PROFILE_KVCACHE", "0") == "1" + if do_profile and q.is_cuda: + e0, e1, e2 = ( + torch.cuda.Event(enable_timing=True), + torch.cuda.Event(enable_timing=True), + torch.cuda.Event(enable_timing=True), ) - if not is_warming_up(): - CHECK_FLASH_ATTN_DECODE( - q, k, v, - k_cache, v_cache, - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - decode_kernel, - scale=scale, - num_groups=q.shape[1] // k.shape[1], - page_block_size=attn_metadata.page_block_size, - diffusion_block_size=attn_metadata.diffusion_block_size, - is_block_attn=(attn_metadata.attn_type == "block_attention"), - ) - - return decode_kernel( - q, k, v, k_cache, v_cache, - attn_metadata.block_tables, - attn_metadata.context_lens, + e0.record() + k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) + e1.record() + out = flash_attn_varlen_func( + q, + k_comb, + v_comb, attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, attn_metadata.max_seqlen_q, + attn_metadata.max_seqlen_k, + softmax_scale=scale, + block_table=None, ) - elif attn_metadata.decode_mode == "varlen": - do_profile = os.getenv("DIFFULEX_PROFILE_KVCACHE", "0") == "1" - if do_profile and q.is_cuda: - e0, e1, e2 = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) - e0.record() - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - e1.record() - out = flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, 
attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - e2.record() - e2.synchronize() - print( - f"[DIFFULEX_PROFILE_KVCACHE] decode(varlen,bf16kv) " - f"load_kvcache={e0.elapsed_time(e1):.3f}ms flash_attn={e1.elapsed_time(e2):.3f}ms" - ) - return out - else: - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - return flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) + e2.record() + e2.synchronize() + print( + f"[DIFFULEX_PROFILE_KVCACHE] decode(varlen) " + f"load_kvcache={e0.elapsed_time(e1):.3f}ms flash_attn={e1.elapsed_time(e2):.3f}ms" + ) + return out + + k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) + return flash_attn_varlen_func( + q, + k_comb, + v_comb, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + attn_metadata.max_seqlen_k, + softmax_scale=scale, + block_table=None, + ) -def _dllm_flash_attn_decode_bf16_q_fp8_kv( +def _decode_static_unified_triton_bf16( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - """BF16 Q + FP8 KV decode helper function that uses BF16-Q/FP8-KV kernel with internal dequantization.""" - if attn_metadata.k_scale is None or attn_metadata.v_scale is None: - raise ValueError("FP8 KV decode requires k_scale and v_scale in metadata") - - # KV cache is stored as uint8 for FP8, but TileLang expects float8 view dtype. - from diffulex.utils.quantization.context import get_kv_cache_strategy - strategy = get_kv_cache_strategy() - if strategy is None or getattr(strategy, "kv_cache_format", "bf16") != "fp8": - raise ValueError(f"Expected kv_cache_format='fp8', got strategy={type(strategy)}") - k_cache = strategy.view_kv_cache_for_kernels(k_cache) - v_cache = strategy.view_kv_cache_for_kernels(v_cache) - - if attn_metadata.decode_mode == "static": - global kernel_config_bf16_q_fp8_kv_decode - common_args = ( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - k_cache.shape[0], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size, - attn_metadata.block_tables.shape[1], - attn_metadata.page_block_size, - ) - - # BF16-Q/FP8-KV decode needs its own autotuned config; do not reuse prefill/BF16 config. - # In some environments, TileLang autotuning may fail (e.g. no valid configs compile/validate). - # In that case, fall back to the varlen path (Python dequant + flash-attn varlen) for correctness. 
- try: - if is_warming_up() or kernel_config_bf16_q_fp8_kv_decode is None: - with set_autotune_inputs([ - q, k, v, - k_cache, v_cache, - attn_metadata.k_scale, - attn_metadata.v_scale, - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ]): - decode_kernel = dllm_flash_attn_decode_kernel_bf16_q_fp8_kv(*common_args) - kernel_config_bf16_q_fp8_kv_decode = decode_kernel.config - else: - decode_kernel = dllm_flash_attn_decode_kernel_bf16_q_fp8_kv( - *common_args, - **kernel_config_bf16_q_fp8_kv_decode, - ) - - return decode_kernel( - q, k, v, k_cache, v_cache, - attn_metadata.k_scale, # Pass K scale - attn_metadata.v_scale, # Pass V scale - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ) - except RuntimeError as e: - # Fall back if autotuning or runtime validation fails. - if "Auto-tuning failed" in str(e) or "No configuration" in str(e): - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - return flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - raise - elif attn_metadata.decode_mode == "varlen": - # varlen模式使用load_kvcache:FP8 反量化/scale 融合应在 load_kvcache 内完成(Triton fused kernel) - do_profile = os.getenv("DIFFULEX_PROFILE_KVCACHE", "0") == "1" - if do_profile and q.is_cuda: - e0, e1, e2 = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) - e0.record() - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - e1.record() - out = flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - e2.record() - e2.synchronize() - print( - f"[DIFFULEX_PROFILE_KVCACHE] decode(varlen,fp8kv) " - f"load_kvcache={e0.elapsed_time(e1):.3f}ms flash_attn={e1.elapsed_time(e2):.3f}ms" - ) - return out - else: - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - return flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - else: - raise ValueError(f"Unsupported decode mode: {attn_metadata.decode_mode}") + return paged_attn_decode_unified_triton( + q, + k, + v, + k_cache, + v_cache, + attn_metadata, + softmax_scale=scale, + fp8_cache=False, + ) -def dllm_flash_attn_prefill( +def _decode_static_unified_triton_fp8_cache( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - """ - Prefill attention wrapper that dynamically selects kernel based on quantization strategy. 
- - Args: - q: Query tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - k: Key tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - v: Value tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - scale: Attention scale factor - attn_metadata: Attention metadata - - Returns: - Output tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - """ + if attn_metadata.k_scale is None or attn_metadata.v_scale is None: + raise ValueError("FP8 KV decode requires k_scale and v_scale in metadata") + + # KV cache is stored as uint8 for FP8, but Triton expects float8 view dtype. from diffulex.utils.quantization.context import get_kv_cache_strategy - kv_strategy = get_kv_cache_strategy() - kv_fmt = getattr(kv_strategy, "kv_cache_format", "bf16") if kv_strategy is not None else "bf16" - # Q always uses BF16 (attn_q quantization is not supported) - q_fmt = "bf16" + strategy = get_kv_cache_strategy() + if strategy is None or getattr(strategy, "kv_cache_format", "bf16") != "fp8": + raise ValueError(f"Expected kv_cache_format='fp8', got strategy={type(strategy)}") - # Prefill currently uses BF16 kernels for all formats (FP8 prefill kernel TBD). - if q_fmt == "bf16" and kv_fmt in ("bf16", "fp8"): - return _dllm_flash_attn_prefill_bf16(q, k, v, scale, attn_metadata) - raise ValueError( - f"Unsupported q_format={q_fmt!r} / kv_cache_format={kv_fmt!r} for prefill" + k_cache_fp8 = strategy.view_kv_cache_for_kernels(k_cache) + v_cache_fp8 = strategy.view_kv_cache_for_kernels(v_cache) + + return paged_attn_decode_unified_triton( + q, + k, + v, + k_cache_fp8, + v_cache_fp8, + attn_metadata, + softmax_scale=scale, + fp8_cache=True, ) @@ -892,40 +157,37 @@ def dllm_flash_attn_decode( k_cache: torch.Tensor, v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: """ - Decode attention wrapper that dynamically selects kernel based on quantization strategy. - - Args: - q: Query tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - k: Key tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - v: Value tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - k_cache: Key cache tensor (shape depends on layout) - v_cache: Value cache tensor (shape depends on layout) - scale: Attention scale factor - attn_metadata: Attention metadata - - Returns: - Output tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - - Note: - For FP8 strategy: - - Unified layout static mode: dequantization + scale fusion are handled inside the TileLang FP8 decode kernel - - Unified layout varlen mode: dequantization is handled by load_kvcache (Python path) - - Distinct layout: dequantization is handled by load_kvcache (Python path) + Decode attention wrapper: + - static: Triton paged-attention over (paged) KV cache + current-step KV + - varlen: load_kvcache (Triton gather/dequant) + flash-attn varlen """ from diffulex.utils.quantization.context import get_kv_cache_strategy + kv_strategy = get_kv_cache_strategy() kv_fmt = getattr(kv_strategy, "kv_cache_format", "bf16") if kv_strategy is not None else "bf16" - # Q always uses BF16 (attn_q quantization is not supported) - q_fmt = "bf16" + decode_mode = getattr(attn_metadata, "decode_mode", "varlen") + if decode_mode == "static": + # Only unified layout is supported in static paged-attention for now. 
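+        # Distinct-layout caches fall back to the varlen path, which gathers/dequantizes
+        # the KV cache via load_kvcache before calling flash-attn varlen.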
+ if getattr(attn_metadata, "kv_cache_layout", "unified") != "unified": + return _decode_varlen(q, k, v, k_cache, v_cache, scale, attn_metadata) + + if kv_fmt == "bf16": + return _decode_static_unified_triton_bf16(q, k, v, k_cache, v_cache, scale, attn_metadata) + if kv_fmt == "fp8": + return _decode_static_unified_triton_fp8_cache(q, k, v, k_cache, v_cache, scale, attn_metadata) + raise ValueError(f"Unsupported kv_cache_format={kv_fmt!r} for static decode") + + if decode_mode == "varlen": + return _decode_varlen(q, k, v, k_cache, v_cache, scale, attn_metadata) + + raise ValueError(f"Unsupported decode mode: {decode_mode!r}") + - if q_fmt == "bf16" and kv_fmt == "bf16": - return _dllm_flash_attn_decode_bf16(q, k, v, k_cache, v_cache, scale, attn_metadata) - if q_fmt == "bf16" and kv_fmt == "fp8": - return _dllm_flash_attn_decode_bf16_q_fp8_kv(q, k, v, k_cache, v_cache, scale, attn_metadata) - raise ValueError( - f"Unsupported q_format={q_fmt!r} / kv_cache_format={kv_fmt!r} for decode" - ) \ No newline at end of file +__all__ = [ + "dllm_flash_attn_prefill", + "dllm_flash_attn_decode", +] diff --git a/diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py b/diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py new file mode 100644 index 0000000..17dfaf9 --- /dev/null +++ b/diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py @@ -0,0 +1,250 @@ +import torch +import tilelang +import tilelang.language as T + +from flash_attn import flash_attn_varlen_func +from tilelang.autotuner import set_autotune_inputs + +from diffulex_kernel.python.auto_tuner import build_configs +from diffulex.attention.metadata import AttnMetaDataBase, is_warming_up + + +kernel_config = None + + +@tilelang.autotune(configs=build_configs()) +@tilelang.jit( + # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def dllm_flash_attn_prefill_kernel( + NUM_SEQS: int, + NUM_GROUPS: int, + Q_LEN: int, + KV_LEN: int, + NUM_HEADS: int, + HEAD_DIM: int, + IS_BLOCK_ATTN: bool, + DIFFUSION_BLOCK_SIZE: int, + BLOCK_M: int = 64, + BLOCK_N: int = 64, + NUM_STAGES: int = 1, + NUM_THREADS: int = 128, +): + SCALE = (1.0 / HEAD_DIM) ** 0.5 * 1.44269504 # log2(e) + NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS + Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] + KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] + O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] + DTYPE = "bfloat16" + ACCUM_DTYPE = "float" + + @T.prim_func + def kernel( + Q: T.Tensor(Q_SHAPE, DTYPE), + K: T.Tensor(KV_SHAPE, DTYPE), + V: T.Tensor(KV_SHAPE, DTYPE), + cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), + cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), + max_seqlen_q: T.int32, + O: T.Tensor(O_SHAPE, DTYPE), + ): + with T.Kernel(T.ceildiv(max_seqlen_q, BLOCK_M), NUM_HEADS, NUM_SEQS, threads=NUM_THREADS) as (bx, by, bz): + Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) + K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) + V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) + O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) + + acc_score = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) + acc_score_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) + acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) + scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + 
scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + + T.annotate_layout( + { + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + } + ) + + q_block_idx = bx + seq_idx = bz + head_idx = by + kv_head_idx = head_idx // NUM_GROUPS + + q_start_idx = cu_seqlens_q[seq_idx] + kv_start_idx = cu_seqlens_k[seq_idx] + q_end_idx = cu_seqlens_q[seq_idx + 1] + kv_end_idx = cu_seqlens_k[seq_idx + 1] + + cur_q_seqlen = q_end_idx - q_start_idx + cur_kv_seqlen = kv_end_idx - kv_start_idx + + T.copy( + Q[q_start_idx + q_block_idx * BLOCK_M : q_start_idx + (q_block_idx + 1) * BLOCK_M, head_idx, :], + Q_shared, + ) + + T.fill(acc_output, 0) + T.fill(acc_score, 0) + T.fill(log_sum, 0) + T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) + + loop_range = ( + T.min( + T.ceildiv(cur_q_seqlen + (q_block_idx + 1) * BLOCK_M, BLOCK_N), + T.ceildiv(cur_kv_seqlen, BLOCK_N), + ) + if IS_BLOCK_ATTN + else T.ceildiv(cur_kv_seqlen, BLOCK_N) + ) + for kv_block_idx in T.Pipelined(loop_range, num_stages=NUM_STAGES): + T.copy( + K[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], + K_shared, + ) + + if IS_BLOCK_ATTN: + for i, j in T.Parallel(BLOCK_M, BLOCK_N): + num_diffusion_blocks = (q_block_idx * BLOCK_M + i) // DIFFUSION_BLOCK_SIZE + 1 + acc_score[i, j] = T.if_then_else( + (num_diffusion_blocks * DIFFUSION_BLOCK_SIZE <= kv_block_idx * BLOCK_N + j) + or (q_block_idx * BLOCK_M + i >= cur_q_seqlen or kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), + -1e9, + 0, + ) + else: + for i, j in T.Parallel(BLOCK_M, BLOCK_N): + acc_score[i, j] = T.if_then_else( + (q_block_idx * BLOCK_M + i >= cur_q_seqlen or kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), + -1e9, + 0, + ) + + T.gemm(Q_shared, K_shared, acc_score, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) + T.reduce_max(acc_score, scores_max, dim=1, clear=False) + for i in T.Parallel(BLOCK_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + + for i in T.parallel(BLOCK_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) + + for i, j in T.Parallel(BLOCK_M, BLOCK_N): + acc_score[i, j] = T.exp2(acc_score[i, j] * SCALE - scores_max[i] * SCALE) + + T.reduce_sum(acc_score, scores_sum, dim=1) + for i in T.Parallel(BLOCK_M): + log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] + + T.copy(acc_score, acc_score_cast) + for i, j in T.Parallel(BLOCK_M, HEAD_DIM): + acc_output[i, j] *= scores_scale[i] + + T.copy( + V[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], + V_shared, + ) + T.gemm(acc_score_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) + + for i, j in T.Parallel(BLOCK_M, HEAD_DIM): + acc_output[i, j] /= log_sum[i] + + T.copy(acc_output, O_shared) + for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): + if i + q_block_idx * BLOCK_M < cur_q_seqlen: + O[i + q_start_idx + q_block_idx * BLOCK_M, head_idx, d_idx] = O_shared[i, d_idx] + + return kernel + + +def dllm_flash_attn_prefill_tilelang( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: float, + attn_metadata: AttnMetaDataBase, +) -> torch.Tensor: + """ + TileLang-based prefill implementation (existing behavior). + Kept in a separate module so importing decode kernels doesn't require TileLang. 
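+
+    Shapes follow the kernel above: q is [Q_LEN, NUM_HEADS, HEAD_DIM] and k/v are
+    [KV_LEN, NUM_KV_HEADS, HEAD_DIM] (bf16), packed per-sequence via cu_seqlens_q/cu_seqlens_k.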
+ """ + global kernel_config + if attn_metadata.attn_type == "full_attention": + return flash_attn_varlen_func( + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + attn_metadata.max_seqlen_k, + softmax_scale=scale, + block_table=None, + ) + if attn_metadata.attn_type != "block_attention": + raise ValueError(f"Unsupported attn_type={attn_metadata.attn_type!r} for prefill") + + if is_warming_up(): + with set_autotune_inputs( + [ + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + ] + ): + prefill_kernel = dllm_flash_attn_prefill_kernel( + attn_metadata.num_seqs, + q.shape[1] // k.shape[1], + q.shape[0], + k.shape[0], + q.shape[1], + q.shape[2], + attn_metadata.attn_type == "block_attention", + attn_metadata.diffusion_block_size, + ) + kernel_config = prefill_kernel.config + return prefill_kernel( + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + ) + + config_kwargs = kernel_config if kernel_config is not None else {} + prefill_kernel = dllm_flash_attn_prefill_kernel( + attn_metadata.num_seqs, + q.shape[1] // k.shape[1], + q.shape[0], + k.shape[0], + q.shape[1], + q.shape[2], + attn_metadata.attn_type == "block_attention", + attn_metadata.diffusion_block_size, + **config_kwargs, + ) + return prefill_kernel( + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + ) + diff --git a/diffulex_kernel/python/paged_attn_decode_triton.py b/diffulex_kernel/python/paged_attn_decode_triton.py new file mode 100644 index 0000000..1fabf19 --- /dev/null +++ b/diffulex_kernel/python/paged_attn_decode_triton.py @@ -0,0 +1,661 @@ +import torch +import triton +import triton.language as tl + +import os + +from diffulex.attention.metadata import AttnMetaDataBase + + +@triton.jit +def _paged_decode_attn_unified_bf16_cache_kernel( + q_ptr, + k_ptr, + v_ptr, + k_cache_ptr, + v_cache_ptr, + block_tables_ptr, + context_lens_ptr, + cu_seqlens_q_ptr, + o_ptr, + softmax_scale, # fp32 scalar + # q/k/v/o strides + q_stride_s, + q_stride_h, + q_stride_d, + kv_stride_s, + kv_stride_h, + kv_stride_d, + o_stride_s, + o_stride_h, + o_stride_d, + # cache strides: [nblks, page, kvh, d] + k_cache_stride_nblks, + k_cache_stride_page, + k_cache_stride_h, + k_cache_stride_d, + v_cache_stride_nblks, + v_cache_stride_page, + v_cache_stride_h, + v_cache_stride_d, + # block_tables strides + block_tables_stride_s, + block_tables_stride_b, + # misc + NUM_GROUPS: tl.constexpr, + HEAD_DIM: tl.constexpr, + HEAD_DIM_PADDED: tl.constexpr, + PAGE_SIZE: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + pid_seq = tl.program_id(0) + pid_head = tl.program_id(1) + pid_m = tl.program_id(2) + + kv_head = pid_head // NUM_GROUPS + + q_start = tl.load(cu_seqlens_q_ptr + pid_seq).to(tl.int32) + q_end = tl.load(cu_seqlens_q_ptr + pid_seq + 1).to(tl.int32) + q_len = q_end - q_start + new_len = q_len # decode path: current-step KV length matches query length + context_len = tl.load(context_lens_ptr + pid_seq).to(tl.int32) + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM_PADDED) + mask_m = offs_m < q_len + mask_d = offs_d < HEAD_DIM + + q_offs = (q_start + offs_m[:, None]) * q_stride_s + pid_head * q_stride_h + offs_d[None, :] * q_stride_d + q = tl.load(q_ptr + q_offs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.bfloat16) + + m = tl.full([BLOCK_M], float("-inf"), 
dtype=tl.float32) + l = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM_PADDED], dtype=tl.float32) + + # Cache stage: iterate only needed blocks (dynamic loop, like vLLM kernels). + offs_n_cache = tl.arange(0, BLOCK_N) + tok_off_cache = offs_n_cache + mask_n_cache = offs_n_cache < PAGE_SIZE + + num_cache_blocks = (context_len + PAGE_SIZE - 1) // PAGE_SIZE + for blk in range(0, num_cache_blocks): + page = tl.load(block_tables_ptr + pid_seq * block_tables_stride_s + blk * block_tables_stride_b).to(tl.int32) + tok_base = blk * PAGE_SIZE + tok_idx = tok_base + tok_off_cache + valid_tok = (page >= 0) & (tok_idx < context_len) & mask_n_cache + + k_offs = ( + page * k_cache_stride_nblks + + tok_off_cache[:, None] * k_cache_stride_page + + kv_head * k_cache_stride_h + + offs_d[None, :] * k_cache_stride_d + ) + k_blk = tl.load( + k_cache_ptr + k_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, tl.trans(k_blk)).to(tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = ( + page * v_cache_stride_nblks + + tok_off_cache[:, None] * v_cache_stride_page + + kv_head * v_cache_stride_h + + offs_d[None, :] * v_cache_stride_d + ) + v_blk = tl.load( + v_cache_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + # New KV stage (dynamic tiles) + kv_start = q_start + for start_n in range(0, new_len, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + valid_tok = offs_n < new_len + + k_offs = (kv_start + offs_n[None, :]) * kv_stride_s + kv_head * kv_stride_h + offs_d[:, None] * kv_stride_d + k_blk = tl.load( + k_ptr + k_offs, + mask=valid_tok[None, :] & mask_d[:, None], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, k_blk).to(tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = (kv_start + offs_n[:, None]) * kv_stride_s + kv_head * kv_stride_h + offs_d[None, :] * kv_stride_d + v_blk = tl.load( + v_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + out = acc / l[:, None] + o_offs = (q_start + offs_m[:, None]) * o_stride_s + pid_head * o_stride_h + offs_d[None, :] * o_stride_d + tl.store(o_ptr + o_offs, out.to(tl.bfloat16), mask=mask_m[:, None] & mask_d[None, :]) + + +@triton.jit +def _paged_decode_attn_unified_fp8_cache_kernel_legacy( + q_ptr, + k_ptr, + v_ptr, + k_cache_ptr, + v_cache_ptr, + k_scale_ptr, + v_scale_ptr, + block_tables_ptr, + context_lens_ptr, + cu_seqlens_q_ptr, + o_ptr, + softmax_scale, # fp32 scalar + # q/k/v/o strides + q_stride_s, + q_stride_h, + q_stride_d, + kv_stride_s, + kv_stride_h, + kv_stride_d, + o_stride_s, + o_stride_h, + o_stride_d, + # cache strides: [nblks, page, kvh, d] + k_cache_stride_nblks, + k_cache_stride_page, + k_cache_stride_h, + k_cache_stride_d, + v_cache_stride_nblks, + v_cache_stride_page, + v_cache_stride_h, + 
v_cache_stride_d, + # block_tables strides + block_tables_stride_s, + block_tables_stride_b, + # misc + NUM_GROUPS: tl.constexpr, + HEAD_DIM: tl.constexpr, + HEAD_DIM_PADDED: tl.constexpr, + PAGE_SIZE: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + pid_seq = tl.program_id(0) + pid_head = tl.program_id(1) + pid_m = tl.program_id(2) + + kv_head = pid_head // NUM_GROUPS + k_scale = tl.load(k_scale_ptr + kv_head).to(tl.float32) + v_scale = tl.load(v_scale_ptr + kv_head).to(tl.float32) + + q_start = tl.load(cu_seqlens_q_ptr + pid_seq).to(tl.int32) + q_end = tl.load(cu_seqlens_q_ptr + pid_seq + 1).to(tl.int32) + q_len = q_end - q_start + new_len = q_len + context_len = tl.load(context_lens_ptr + pid_seq).to(tl.int32) + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM_PADDED) + mask_m = offs_m < q_len + mask_d = offs_d < HEAD_DIM + + q_offs = (q_start + offs_m[:, None]) * q_stride_s + pid_head * q_stride_h + offs_d[None, :] * q_stride_d + q = tl.load(q_ptr + q_offs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.bfloat16) + + m = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM_PADDED], dtype=tl.float32) + + offs_n_cache = tl.arange(0, BLOCK_N) + tok_off_cache = offs_n_cache + mask_n_cache = offs_n_cache < PAGE_SIZE + + num_cache_blocks = (context_len + PAGE_SIZE - 1) // PAGE_SIZE + for blk in range(0, num_cache_blocks): + page = tl.load(block_tables_ptr + pid_seq * block_tables_stride_s + blk * block_tables_stride_b).to(tl.int32) + tok_base = blk * PAGE_SIZE + tok_idx = tok_base + tok_off_cache + valid_tok = (page >= 0) & (tok_idx < context_len) & mask_n_cache + + k_offs = ( + page * k_cache_stride_nblks + + tok_off_cache[:, None] * k_cache_stride_page + + kv_head * k_cache_stride_h + + offs_d[None, :] * k_cache_stride_d + ) + # fp8 cache values: dot(Q, K_fp8) * k_scale == dot(Q, (K_fp8*k_scale)) + k_blk = tl.load( + k_cache_ptr + k_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, tl.trans(k_blk)).to(tl.float32) * (softmax_scale * k_scale) + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = ( + page * v_cache_stride_nblks + + tok_off_cache[:, None] * v_cache_stride_page + + kv_head * v_cache_stride_h + + offs_d[None, :] * v_cache_stride_d + ) + v_blk = tl.load( + v_cache_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + # Apply v_scale on weights (cheaper than scaling V elementwise). 
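+        # Identity used here: P @ (V_fp8 * v_scale) == (P * v_scale) @ V_fp8 for a scalar
+        # per-head v_scale, so only the [BLOCK_M, BLOCK_N] probability tile gets scaled.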
+ acc += tl.dot((p * v_scale).to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + kv_start = q_start + for start_n in range(0, new_len, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + valid_tok = offs_n < new_len + + k_offs = (kv_start + offs_n[None, :]) * kv_stride_s + kv_head * kv_stride_h + offs_d[:, None] * kv_stride_d + k_blk = tl.load( + k_ptr + k_offs, + mask=valid_tok[None, :] & mask_d[:, None], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, k_blk).to(tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = (kv_start + offs_n[:, None]) * kv_stride_s + kv_head * kv_stride_h + offs_d[None, :] * kv_stride_d + v_blk = tl.load( + v_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + out = acc / l[:, None] + o_offs = (q_start + offs_m[:, None]) * o_stride_s + pid_head * o_stride_h + offs_d[None, :] * o_stride_d + tl.store(o_ptr + o_offs, out.to(tl.bfloat16), mask=mask_m[:, None] & mask_d[None, :]) + + +@triton.jit +def _paged_decode_attn_unified_fp8_cache_fused_dot_kernel( + q_ptr, + k_ptr, + v_ptr, + k_cache_ptr, + v_cache_ptr, + k_scale_ptr, + v_scale_ptr, + block_tables_ptr, + context_lens_ptr, + cu_seqlens_q_ptr, + o_ptr, + softmax_scale, # fp32 scalar + # q/k/v/o strides + q_stride_s, + q_stride_h, + q_stride_d, + kv_stride_s, + kv_stride_h, + kv_stride_d, + o_stride_s, + o_stride_h, + o_stride_d, + # cache strides: [nblks, page, kvh, d] + k_cache_stride_nblks, + k_cache_stride_page, + k_cache_stride_h, + k_cache_stride_d, + v_cache_stride_nblks, + v_cache_stride_page, + v_cache_stride_h, + v_cache_stride_d, + # block_tables strides + block_tables_stride_s, + block_tables_stride_b, + # misc + KV_FORMAT: tl.constexpr, + NUM_GROUPS: tl.constexpr, + HEAD_DIM: tl.constexpr, + HEAD_DIM_PADDED: tl.constexpr, + PAGE_SIZE: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + """ + FP8-cache decode kernel with *fused* fp8 math: + - Keep KV cache tiles in float8 (via fp8 view tensor) + - Use tl.dot_scaled(..., rhs_format="e4m3/e5m2") to consume fp8 without explicit dequant tensors + - Apply per-head scalar scales (k_scale/v_scale) without elementwise dequantization + """ + pid_seq = tl.program_id(0) + pid_head = tl.program_id(1) + pid_m = tl.program_id(2) + + kv_head = pid_head // NUM_GROUPS + k_scale = tl.load(k_scale_ptr + kv_head).to(tl.float32) + v_scale = tl.load(v_scale_ptr + kv_head).to(tl.float32) + + q_start = tl.load(cu_seqlens_q_ptr + pid_seq).to(tl.int32) + q_end = tl.load(cu_seqlens_q_ptr + pid_seq + 1).to(tl.int32) + q_len = q_end - q_start + new_len = q_len + context_len = tl.load(context_lens_ptr + pid_seq).to(tl.int32) + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM_PADDED) + mask_m = offs_m < q_len + mask_d = offs_d < HEAD_DIM + + # Load Q (bf16). Note: triton 3.5 `tl.dot` does not support mixed bf16/fp16 x fp8. + # We use `tl.dot_scaled` (microscaling) to accept fp8 operands. 
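+    # The scale operands of `tl.dot_scaled` are passed as None below (no per-block exponents
+    # are supplied); the fp8 bytes are interpreted via KV_FORMAT, and the per-head scalar
+    # k_scale/v_scale are applied after each dot instead.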
+ q_offs = (q_start + offs_m[:, None]) * q_stride_s + pid_head * q_stride_h + offs_d[None, :] * q_stride_d + q = tl.load(q_ptr + q_offs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.bfloat16) + + m = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM_PADDED], dtype=tl.float32) + + offs_n_cache = tl.arange(0, BLOCK_N) + tok_off_cache = offs_n_cache + mask_n_cache = offs_n_cache < PAGE_SIZE + + num_cache_blocks = (context_len + PAGE_SIZE - 1) // PAGE_SIZE + for blk in range(0, num_cache_blocks): + page = tl.load(block_tables_ptr + pid_seq * block_tables_stride_s + blk * block_tables_stride_b).to(tl.int32) + tok_base = blk * PAGE_SIZE + tok_idx = tok_base + tok_off_cache + valid_tok = (page >= 0) & (tok_idx < context_len) & mask_n_cache + + # K cache: keep fp8 element type; load as [K, N] to match dot_scaled rhs layout. + k_offs = ( + page * k_cache_stride_nblks + + tok_off_cache[None, :] * k_cache_stride_page + + kv_head * k_cache_stride_h + + offs_d[:, None] * k_cache_stride_d + ) + k_blk = tl.load( + k_cache_ptr + k_offs, + mask=mask_d[:, None] & valid_tok[None, :], + other=0.0, + ) + + # scores = QK^T * softmax_scale, with scalar k_scale applied after dot: + # dot(Q, K_true) == dot(Q, K_fp8) * k_scale (per-head scalar scale). + scores = tl.dot_scaled( + q, + None, + "bf16", + k_blk, + None, + KV_FORMAT, + ) * (softmax_scale * k_scale) + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + # V cache: keep fp8 element type for tl.dot. + v_offs = ( + page * v_cache_stride_nblks + + tok_off_cache[:, None] * v_cache_stride_page + + kv_head * v_cache_stride_h + + offs_d[None, :] * v_cache_stride_d + ) + v_blk = tl.load( + v_cache_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ) + + # acc += P @ V_true == (P @ V_fp8) * v_scale + acc += tl.dot_scaled( + p.to(tl.float16), + None, + "fp16", + v_blk, + None, + KV_FORMAT, + ) * v_scale + m = m_new + l = l_new + + # New KV stage (bf16 tensors, unchanged) + kv_start = q_start + for start_n in range(0, new_len, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + valid_tok = offs_n < new_len + + k_offs = (kv_start + offs_n[None, :]) * kv_stride_s + kv_head * kv_stride_h + offs_d[:, None] * kv_stride_d + k_blk = tl.load( + k_ptr + k_offs, + mask=valid_tok[None, :] & mask_d[:, None], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, k_blk, out_dtype=tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = (kv_start + offs_n[:, None]) * kv_stride_s + kv_head * kv_stride_h + offs_d[None, :] * kv_stride_d + v_blk = tl.load( + v_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk, out_dtype=tl.float32) + m = m_new + l = l_new + + out = acc / l[:, None] + o_offs = (q_start + offs_m[:, None]) * o_stride_s + pid_head * o_stride_h + offs_d[None, :] * o_stride_d + tl.store(o_ptr + o_offs, out.to(tl.bfloat16), mask=mask_m[:, None] & mask_d[None, :]) + + +def 
paged_attn_decode_unified_triton( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + attn_metadata: AttnMetaDataBase, + *, + softmax_scale: float, + fp8_cache: bool, +) -> torch.Tensor: + """ + Triton paged-attention decode for unified KV cache layout. + + q: [total_q, num_heads, head_dim] (bf16) + k/v: [total_q, num_kv_heads, head_dim] (bf16), aligned with cu_seqlens_q + k_cache/v_cache: + - bf16: [num_page_blocks, page_size, num_kv_heads, head_dim] + - fp8 : same shape but dtype must be float8 view for triton (strategy.view_kv_cache_for_kernels) + """ + assert q.is_cuda and k.is_cuda and v.is_cuda and k_cache.is_cuda and v_cache.is_cuda + assert q.dtype == torch.bfloat16 and k.dtype == torch.bfloat16 and v.dtype == torch.bfloat16 + assert attn_metadata.block_tables is not None and attn_metadata.context_lens is not None and attn_metadata.cu_seqlens_q is not None + assert attn_metadata.kv_cache_layout == "unified", f"only unified layout supported, got {attn_metadata.kv_cache_layout}" + + # Be robust to different metadata implementations (dataclass vs SimpleNamespace in tests). + num_seqs = int(attn_metadata.cu_seqlens_q.numel() - 1) + num_heads = q.shape[1] + head_dim = q.shape[2] + num_kv_heads = k.shape[1] + assert num_heads % num_kv_heads == 0 + num_groups = num_heads // num_kv_heads + + page_size = int(attn_metadata.page_block_size) + + # Heuristics: BLOCK_M = 64 (supports diffusion_block_size=32/64), BLOCK_N = page_size/new-tile + BLOCK_M = 64 + BLOCK_N = 32 if page_size <= 32 else 64 + # Cache stage requires BLOCK_N == PAGE_SIZE to simplify; enforce. + if BLOCK_N != page_size: + BLOCK_N = page_size + + head_dim_padded = 1 << (head_dim - 1).bit_length() + + o = torch.empty_like(q) + grid = (num_seqs, num_heads, triton.cdiv(int(attn_metadata.max_seqlen_q), BLOCK_M)) + + if fp8_cache: + if attn_metadata.k_scale is None or attn_metadata.v_scale is None: + raise ValueError("fp8_cache=True requires attn_metadata.k_scale/v_scale") + # Default to fused fp8-dot kernel; fallback to legacy on compile/runtime failures. + # Set DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT=0 to force legacy. + # Set DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT_STRICT=1 to raise instead of fallback. + use_fused_dot = os.getenv("DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT", "1") != "0" + strict_fused = os.getenv("DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT_STRICT", "0") == "1" + if use_fused_dot: + # `tl.dot_scaled` needs the fp8 format string to interpret raw bytes correctly. + # Derive from the fp8 view dtype (torch.float8_*). 
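+            # e.g. torch.float8_e4m3fn / float8_e4m3fnuz -> "e4m3", torch.float8_e5m2 -> "e5m2".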
+ dt = str(k_cache.dtype) + if "e4m3" in dt: + kv_format = "e4m3" + elif "e5m2" in dt: + kv_format = "e5m2" + else: + raise ValueError(f"Unsupported fp8 k_cache dtype for fused-dot: {k_cache.dtype}") + try: + _paged_decode_attn_unified_fp8_cache_fused_dot_kernel[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.k_scale, attn_metadata.v_scale, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + KV_FORMAT=kv_format, + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + except Exception: + if strict_fused: + raise + _paged_decode_attn_unified_fp8_cache_kernel_legacy[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.k_scale, attn_metadata.v_scale, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + else: + _paged_decode_attn_unified_fp8_cache_kernel_legacy[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.k_scale, attn_metadata.v_scale, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + else: + _paged_decode_attn_unified_bf16_cache_kernel[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + + return o + diff --git a/profile/analyze_trace_bottlenecks.py b/profile/analyze_trace_bottlenecks.py new file mode 100644 index 0000000..41821d3 --- /dev/null +++ b/profile/analyze_trace_bottlenecks.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +""" +Analyze huge torch chrome trace (streaming) to locate non-GEMM bottlenecks. + +Outputs: +- duration of user_annotation "diffulex.generate(profiled)" (wall-ish) +- GPU active time (union of kernel/memcpy/memset intervals) to estimate GPU idle gaps +- top CUDA runtime/driver API calls by CPU time + +Designed to work without loading the >2GB JSON into memory. 
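+
+Example:
+    python profile/analyze_trace_bottlenecks.py --trace <trace.json> --out <report.json>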
+""" + +from __future__ import annotations + +import argparse +import json +from collections import Counter, defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple + + +def _extract_str_after_key(line: str, key: str) -> Optional[str]: + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + q1 = line.find('"', colon) + if q1 < 0: + return None + q2 = line.find('"', q1 + 1) + if q2 < 0: + return None + return line[q1 + 1 : q2] + + +def _extract_num_after_key(line: str, key: str) -> Optional[float]: + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + frag = line[colon + 1 :].strip() + comma = frag.find(",") + if comma >= 0: + frag = frag[:comma] + try: + return float(frag.strip()) + except Exception: + return None + + +def _extract_json_object_value(line: str, key: str) -> Optional[Any]: + """ + Extract JSON object/array value following `"key":` on the same line. + Assumes the value is a JSON object {...} or array [...] and is fully contained in the line. + """ + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + # find first '{' or '[' after colon + start = None + for i in range(colon, len(line)): + if line[i] == "{": + start = i + open_ch, close_ch = "{", "}" + break + if line[i] == "[": + start = i + open_ch, close_ch = "[", "]" + break + if start is None: + return None + depth = 0 + end = None + for i in range(start, len(line)): + ch = line[i] + if ch == open_ch: + depth += 1 + elif ch == close_ch: + depth -= 1 + if depth == 0: + end = i + 1 + break + if end is None: + return None + frag = line[start:end] + try: + return json.loads(frag) + except Exception: + return None + + +@dataclass +class Interval: + start: float + end: float + + +def _merge_intervals(intervals: List[Interval]) -> List[Interval]: + if not intervals: + return [] + intervals.sort(key=lambda x: x.start) + merged: List[Interval] = [intervals[0]] + for it in intervals[1:]: + last = merged[-1] + if it.start <= last.end: + if it.end > last.end: + last.end = it.end + else: + merged.append(it) + return merged + + +def analyze(trace_path: Path) -> Dict[str, Any]: + # union intervals for GPU activity across all streams + gpu_intervals: List[Interval] = [] + gpu_min_ts: Optional[float] = None + gpu_max_end: Optional[float] = None + + # also per stream, to detect if one stream is idle most of the time + gpu_intervals_by_stream: Dict[int, List[Interval]] = defaultdict(list) + + # user annotation + generate_dur_us: Optional[float] = None + + # runtime/driver api durations (cpu-side) + cuda_runtime: Counter[str] = Counter() + cuda_driver: Counter[str] = Counter() + + in_events = False + in_obj = False + depth = 0 + buf: List[str] = [] + + def _consume_event(text: str) -> None: + nonlocal generate_dur_us, gpu_min_ts, gpu_max_end + # quick checks without json parsing + if '"cat"' not in text or '"name"' not in text: + return + cat = None + name = None + # extract cat/name + # cat and name appear on first line typically, but safe on full text. 
+ for line in text.splitlines(): + if cat is None and '"cat"' in line: + v = _extract_str_after_key(line, "cat") + if v: + cat = v + if name is None and '"name"' in line: + v = _extract_str_after_key(line, "name") + if v: + name = v + if cat is not None and name is not None: + break + if cat is None or name is None: + return + + if cat == "user_annotation" and name == "diffulex.generate(profiled)": + # duration in us + for line in text.splitlines(): + if '"dur"' in line: + d = _extract_num_after_key(line, "dur") + if d is not None: + generate_dur_us = d + break + return + + # cuda runtime/driver (CPU) + if cat == "cuda_runtime": + d = None + for line in text.splitlines(): + if '"dur"' in line: + d = _extract_num_after_key(line, "dur") + break + if d is not None: + cuda_runtime[name] += d + return + if cat == "cuda_driver": + d = None + for line in text.splitlines(): + if '"dur"' in line: + d = _extract_num_after_key(line, "dur") + break + if d is not None: + cuda_driver[name] += d + return + + # GPU activity events + if cat in ("kernel", "gpu_memcpy", "gpu_memset"): + ts = None + dur = None + stream = None + for line in text.splitlines(): + if ts is None and '"ts"' in line: + ts = _extract_num_after_key(line, "ts") + if dur is None and '"dur"' in line: + dur = _extract_num_after_key(line, "dur") + if stream is None and '"args"' in line and "stream" in line: + # args is often multi-line; rely on json fragment extraction when seen + pass + # extract args object to fetch stream quickly (safe, small) + args_obj = None + for line in text.splitlines(): + if '"args"' in line: + args_obj = _extract_json_object_value(line, "args") + break + if isinstance(args_obj, dict): + try: + stream = int(args_obj.get("stream", -1)) + except Exception: + stream = None + if ts is None or dur is None: + return + start = ts + end = ts + dur + gpu_intervals.append(Interval(start, end)) + if stream is not None and stream >= 0: + gpu_intervals_by_stream[stream].append(Interval(start, end)) + gpu_min_ts = start if gpu_min_ts is None else min(gpu_min_ts, start) + gpu_max_end = end if gpu_max_end is None else max(gpu_max_end, end) + return + + with trace_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + if not in_events: + if '"traceEvents"' in line and "[" in line: + in_events = True + continue + if not in_obj: + if line.lstrip().startswith("{"): + in_obj = True + buf = [line] + depth = line.count("{") - line.count("}") + else: + if line.lstrip().startswith("]"): + break + continue + else: + buf.append(line) + depth += line.count("{") - line.count("}") + if in_obj and depth <= 0: + _consume_event("".join(buf)) + in_obj = False + + merged = _merge_intervals(gpu_intervals) + active_us = sum(it.end - it.start for it in merged) + span_us = (gpu_max_end - gpu_min_ts) if (gpu_min_ts is not None and gpu_max_end is not None) else 0.0 + + per_stream_active: Dict[int, float] = {} + for s, ints in gpu_intervals_by_stream.items(): + m = _merge_intervals(ints) + per_stream_active[s] = sum(it.end - it.start for it in m) + + top_runtime = cuda_runtime.most_common(30) + top_driver = cuda_driver.most_common(30) + + return { + "trace": str(trace_path), + "generate_dur_us": generate_dur_us, + "gpu_active_union_us": active_us, + "gpu_span_us": span_us, + "gpu_active_ratio_union_over_span": (active_us / span_us) if span_us > 0 else None, + "gpu_active_ratio_union_over_generate": (active_us / generate_dur_us) if (generate_dur_us and generate_dur_us > 0) else None, + "gpu_span_over_generate": (span_us / 
generate_dur_us) if (generate_dur_us and generate_dur_us > 0) else None, + "gpu_event_count": len(gpu_intervals), + "gpu_stream_count": len(per_stream_active), + "top_cuda_runtime_us": top_runtime, + "top_cuda_driver_us": top_driver, + "top_stream_active_us": sorted(per_stream_active.items(), key=lambda kv: kv[1], reverse=True)[:10], + } + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--trace", type=str, required=True) + ap.add_argument("--out", type=str, required=True) + args = ap.parse_args() + + res = analyze(Path(args.trace)) + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(res, indent=2), encoding="utf-8") + print(f"[OK] wrote: {out_path}") + + +if __name__ == "__main__": + main() + diff --git a/profile/analyze_trace_cpu_ops.py b/profile/analyze_trace_cpu_ops.py new file mode 100644 index 0000000..c08b05c --- /dev/null +++ b/profile/analyze_trace_cpu_ops.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +""" +Stream-aggregate CPU-side durations from huge torch chrome traces. + +We aggregate: +- cat=cpu_op +- cat=python_function +- cat=user_annotation + +This helps answer: where is the extra walltime coming from (outside CUDA kernels)? +""" + +from __future__ import annotations + +import argparse +import json +from collections import Counter, defaultdict +from pathlib import Path +from typing import Dict, Optional, Tuple + + +def _extract_str_after_key(s: str, key: str) -> Optional[str]: + k = f"\"{key}\"" + pos = s.find(k) + if pos < 0: + return None + colon = s.find(":", pos + len(k)) + if colon < 0: + return None + q1 = s.find('"', colon) + if q1 < 0: + return None + q2 = s.find('"', q1 + 1) + if q2 < 0: + return None + return s[q1 + 1 : q2] + + +def _extract_num_after_key(s: str, key: str) -> Optional[float]: + k = f"\"{key}\"" + pos = s.find(k) + if pos < 0: + return None + colon = s.find(":", pos + len(k)) + if colon < 0: + return None + frag = s[colon + 1 :].strip() + comma = frag.find(",") + if comma >= 0: + frag = frag[:comma] + try: + return float(frag.strip()) + except Exception: + return None + + +def analyze(trace_path: Path, cats: Tuple[str, ...]) -> Dict[str, Dict[str, Dict[str, float]]]: + # cat -> name -> (dur_us_sum, calls) + dur: Dict[str, Counter[str]] = {c: Counter() for c in cats} + calls: Dict[str, Counter[str]] = {c: Counter() for c in cats} + + in_events = False + in_obj = False + depth = 0 + buf = [] + + def consume(text: str) -> None: + if '"cat"' not in text or '"name"' not in text: + return + cat = None + name = None + d = None + for line in text.splitlines(): + if cat is None and '"cat"' in line: + cat = _extract_str_after_key(line, "cat") + if name is None and '"name"' in line: + name = _extract_str_after_key(line, "name") + if d is None and '"dur"' in line: + d = _extract_num_after_key(line, "dur") + if cat and name and d is not None: + break + if cat not in cats or name is None: + return + calls[cat][name] += 1 + if d is not None: + dur[cat][name] += d + + with trace_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + if not in_events: + if '"traceEvents"' in line and "[" in line: + in_events = True + continue + if not in_obj: + if line.lstrip().startswith("{"): + in_obj = True + buf = [line] + depth = line.count("{") - line.count("}") + else: + if line.lstrip().startswith("]"): + break + continue + else: + buf.append(line) + depth += line.count("{") - line.count("}") + if in_obj and depth <= 0: + consume("".join(buf)) + in_obj = False + + 
out: Dict[str, Dict[str, Dict[str, float]]] = {} + for c in cats: + out[c] = {} + for name, total in dur[c].items(): + out[c][name] = { + "dur_us": float(total), + "calls": float(calls[c][name]), + "avg_us": float(total) / float(calls[c][name]) if calls[c][name] else 0.0, + } + return out + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--trace", type=str, required=True) + ap.add_argument("--out", type=str, required=True) + ap.add_argument("--topk", type=int, default=50) + args = ap.parse_args() + + cats = ("cpu_op", "python_function", "user_annotation") + res = analyze(Path(args.trace), cats) + + # Write a compact report: per-cat topk by dur. + lines = [] + lines.append(f"Trace: {args.trace}") + lines.append("") + for c in cats: + items = sorted(res[c].items(), key=lambda kv: kv[1]["dur_us"], reverse=True)[: args.topk] + lines.append(f"== {c} top {args.topk} by dur_us ==") + for name, st in items: + lines.append(f"{st['dur_us']:.3f} us calls={int(st['calls'])} avg={st['avg_us']:.3f} us {name}") + lines.append("") + + Path(args.out).write_text("\n".join(lines), encoding="utf-8") + print(f"[OK] wrote: {args.out}") + + +if __name__ == "__main__": + main() + diff --git a/profile/analyze_trace_gemm_shapes.py b/profile/analyze_trace_gemm_shapes.py new file mode 100644 index 0000000..98a0a0d --- /dev/null +++ b/profile/analyze_trace_gemm_shapes.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +""" +Stream-parse PyTorch chrome trace JSON (very large) and aggregate GEMM shape +distributions for selected ops. + +This script is designed for traces exported with record_shapes=True, where op +events contain args["Input Dims"]. + +Example: + python profile/analyze_trace_gemm_shapes.py \ + --trace log/torch_profiles/20260125_023133/pytorch_trace_diffulex.generate(profiled).json \ + --out log/torch_profiles/20260125_023133/gemm_shapes_bf16.txt \ + --ops aten::mm aten::addmm +""" + +from __future__ import annotations + +import argparse +import json +import math +from collections import Counter, defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple + + +def _parse_json_value_fragment(fragment: str) -> Any: + # fragment: after ':' in a JSON line, possibly ending with ',' and newline. + frag = fragment.strip() + if frag.endswith(","): + frag = frag[:-1] + return json.loads(frag) + +def _extract_json_array_after_key(line: str, key: str) -> Optional[Any]: + """ + Extract and json-load the array value after `"key":` from a possibly + multi-field JSON line, e.g. + ..."Input Dims": [[1,2],[3,4]], "Ev Idx": 5 + """ + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + # Find the first '[' after the colon. + start = line.find("[", colon) + if start < 0: + return None + depth = 0 + end = -1 + for i in range(start, len(line)): + ch = line[i] + if ch == "[": + depth += 1 + elif ch == "]": + depth -= 1 + if depth == 0: + end = i + 1 + break + if end < 0: + return None + frag = line[start:end] + try: + return json.loads(frag) + except Exception: + return None + + +def _extract_quoted_value(line: str) -> Optional[str]: + # very small helper: extract first "...". + i = line.find('"') + if i < 0: + return None + j = line.find('"', i + 1) + if j < 0: + return None + return line[i + 1 : j] + + +def _extract_number_after_colon(line: str) -> Optional[float]: + # e.g. 
"dur": 123.0, + if ":" not in line: + return None + frag = line.split(":", 1)[1].strip() + if frag.endswith(","): + frag = frag[:-1] + try: + return float(frag) + except Exception: + return None + +def _extract_number_after_key(line: str, key: str) -> Optional[float]: + """ + Extract a numeric value after `"key":` from a possibly multi-field JSON line, e.g. + "ts": 123.0, "dur": 34.5, + """ + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + frag = line[colon + 1 :].strip() + # Cut at next comma if present. + comma = frag.find(",") + if comma >= 0: + frag = frag[:comma] + try: + return float(frag.strip()) + except Exception: + return None + + +def _dims_to_mnk(input_dims: Any) -> Optional[Tuple[int, int, int]]: + """ + Convert args["Input Dims"] into a best-effort (M,N,K). + input_dims is typically a list where each element is [] (non-tensor) or + a list[int] (tensor dims). + """ + if not isinstance(input_dims, list): + return None + + tensor_dims: List[List[int]] = [] + for d in input_dims: + if isinstance(d, list) and len(d) >= 2 and all(isinstance(x, (int, float)) for x in d): + tensor_dims.append([int(x) for x in d]) + if len(tensor_dims) < 2: + return None + + a = tensor_dims[0] + b = tensor_dims[1] + a_m, a_k = a[-2], a[-1] + # b could be [k, n] or [n, k] depending on transpose convention. + if len(b) >= 2 and a_k == b[-2]: + b_k, b_n = b[-2], b[-1] + return (a_m, b_n, a_k) + if len(b) >= 2 and a_k == b[-1]: + # b is [n, k] + b_n, b_k = b[-2], b[-1] + return (a_m, b_n, a_k) + + # fallback: assume [k, n] + return (a_m, b[-1], a_k) + + +@dataclass +class ShapeStats: + calls: int = 0 + dur_us: float = 0.0 + + +def iter_op_events(trace_path: Path, target_ops: set[str]) -> Iterable[Tuple[str, Optional[float], Any]]: + """ + Yields (op_name, dur_us, input_dims) for events whose "name" is in target_ops. + Streaming + brace-depth parsing to avoid loading giant JSON into memory. + """ + in_trace_events = False + in_event = False + depth = 0 + + name: Optional[str] = None + dur: Optional[float] = None + input_dims: Any = None + want = False + + with trace_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + if not in_trace_events: + if '"traceEvents"' in line and "[" in line: + in_trace_events = True + continue + + # Start of a JSON object event in traceEvents list. + if not in_event: + stripped = line.lstrip() + if stripped.startswith("{"): + in_event = True + depth = stripped.count("{") - stripped.count("}") + name = None + dur = None + input_dims = None + want = False + else: + # End of traceEvents list. + if line.lstrip().startswith("]"): + break + continue + else: + depth += line.count("{") - line.count("}") + + # Parse fields we care about. + if '"name"' in line: + # Some traces put multiple fields on one line: + # "ph": "X", "cat": "cpu_op", "name": "aten::mm", ... + key = '"name":' + pos = line.find(key) + if pos >= 0: + q1 = line.find('"', pos + len(key)) + if q1 >= 0: + q2 = line.find('"', q1 + 1) + if q2 >= 0: + name = line[q1 + 1 : q2] + want = name in target_ops + + if want and dur is None and '"dur"' in line: + dur = _extract_number_after_key(line, "dur") + + if want and input_dims is None and "Input Dims" in line: + input_dims = _extract_json_array_after_key(line, "Input Dims") + + # End of current event object (also works for single-line events). 
+ if in_event and depth <= 0: + if want and name is not None: + yield (name, dur, input_dims) + in_event = False + + +def _human_int(n: float) -> str: + if n >= 1e9: + return f"{n/1e9:.3f}B" + if n >= 1e6: + return f"{n/1e6:.3f}M" + if n >= 1e3: + return f"{n/1e3:.3f}K" + return f"{int(n)}" + + +def main() -> None: + ap = argparse.ArgumentParser("Aggregate GEMM shapes from huge torch chrome trace") + ap.add_argument("--trace", type=str, required=True, help="Path to pytorch_trace_*.json") + ap.add_argument("--out", type=str, required=True, help="Output report path") + ap.add_argument("--ops", type=str, nargs="+", default=["aten::mm", "aten::addmm"], help="Op names to aggregate") + ap.add_argument("--topk", type=int, default=30) + args = ap.parse_args() + + trace_path = Path(args.trace) + out_path = Path(args.out) + target_ops = set(args.ops) + + # op -> (mnk -> stats) + agg: Dict[str, Dict[Tuple[int, int, int], ShapeStats]] = defaultdict(dict) + op_totals: Dict[str, ShapeStats] = defaultdict(ShapeStats) + op_unknown: Counter[str] = Counter() + + for op, dur_us, input_dims in iter_op_events(trace_path, target_ops): + op_totals[op].calls += 1 + if dur_us is not None: + op_totals[op].dur_us += dur_us + + mnk = _dims_to_mnk(input_dims) + if mnk is None: + op_unknown[op] += 1 + continue + + st = agg[op].get(mnk) + if st is None: + st = ShapeStats() + agg[op][mnk] = st + st.calls += 1 + if dur_us is not None: + st.dur_us += dur_us + + lines: List[str] = [] + lines.append(f"Trace: {trace_path}") + lines.append(f"Ops: {', '.join(sorted(target_ops))}") + lines.append("") + + for op in sorted(target_ops): + tot = op_totals.get(op, ShapeStats()) + lines.append(f"== {op} ==") + lines.append(f"total calls: {tot.calls}") + lines.append(f"total dur(us): {tot.dur_us:.3f}") + lines.append(f"unknown shapes: {op_unknown.get(op, 0)}") + lines.append("") + + if op not in agg or not agg[op]: + lines.append("(no shape stats)\n") + continue + + # Top by total dur + items = list(agg[op].items()) + items_by_dur = sorted(items, key=lambda kv: kv[1].dur_us, reverse=True)[: args.topk] + lines.append(f"-- top {args.topk} shapes by total dur(us) --") + lines.append("M,N,K calls total_dur(us) approx_GFLOP") + for (m, n, k), st in items_by_dur: + gflop = 2.0 * m * n * k / 1e9 + lines.append(f"{m},{n},{k} {st.calls} {st.dur_us:.3f} {gflop:.3f}") + lines.append("") + + # Top by calls + items_by_calls = sorted(items, key=lambda kv: kv[1].calls, reverse=True)[: args.topk] + lines.append(f"-- top {args.topk} shapes by calls --") + lines.append("M,N,K calls total_dur(us) avg_dur(us)") + for (m, n, k), st in items_by_calls: + avg = st.dur_us / st.calls if st.calls else 0.0 + lines.append(f"{m},{n},{k} {st.calls} {st.dur_us:.3f} {avg:.3f}") + lines.append("") + + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text("\n".join(lines), encoding="utf-8") + print(f"[OK] wrote: {out_path}") + + +if __name__ == "__main__": + main() + diff --git a/test/python/kernel/test_paged_attn_decode_triton.py b/test/python/kernel/test_paged_attn_decode_triton.py new file mode 100644 index 0000000..055dece --- /dev/null +++ b/test/python/kernel/test_paged_attn_decode_triton.py @@ -0,0 +1,240 @@ +import pytest +import torch +import torch.nn.functional as F + +from einops import rearrange +from types import SimpleNamespace + +from diffulex_kernel.python.paged_attn_decode_triton import paged_attn_decode_unified_triton + + +def _has_fp8() -> bool: + return hasattr(torch, "float8_e4m3fn") or hasattr(torch, "float8_e4m3fnuz") or 
hasattr(torch, "float8_e5m2") + + +def _build_cu_seqlens(lengths: torch.Tensor) -> torch.Tensor: + # lengths: [num_seqs] int32 on cuda + return torch.tensor( + [0] + list(torch.cumsum(lengths, dim=0).cpu().numpy()), + dtype=torch.int32, + device=lengths.device, + ) + + +def naive_sdpa_with_kvcache( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + scale: float, + num_groups: int, + page_block_size: int, +) -> torch.Tensor: + num_seqs = len(cu_seqlens_q) - 1 + output = torch.zeros_like(q) + for seq_idx in range(num_seqs): + q_start = int(cu_seqlens_q[seq_idx].item()) + q_end = int(cu_seqlens_q[seq_idx + 1].item()) + kv_start = int(cu_seqlens_k[seq_idx].item()) + kv_end = int(cu_seqlens_k[seq_idx + 1].item()) + + q_seq = q[q_start:q_end] # [q_len, Hq, D] + k_seq = k[kv_start:kv_end] # [new_len, Hkv, D] + v_seq = v[kv_start:kv_end] + + ctx = int(context_lens[seq_idx].item()) + k_cache_seq_list = [] + v_cache_seq_list = [] + for blk in range(block_tables.shape[1]): + page = int(block_tables[seq_idx, blk].item()) + if page < 0: + continue + blk_start = blk * page_block_size + if blk_start >= ctx: + continue + blk_end = min(blk_start + page_block_size, ctx) + n = blk_end - blk_start + k_cache_seq_list.append(k_cache[page, :n]) + v_cache_seq_list.append(v_cache[page, :n]) + + if k_cache_seq_list: + k_ctx = torch.cat(k_cache_seq_list, dim=0) + v_ctx = torch.cat(v_cache_seq_list, dim=0) + k_comb = torch.cat([k_ctx, k_seq], dim=0) + v_comb = torch.cat([v_ctx, v_seq], dim=0) + else: + k_comb = k_seq + v_comb = v_seq + + q_sdpa = rearrange(q_seq, "s h d -> 1 h s d") + k_sdpa = rearrange(k_comb, "s h d -> 1 h s d") + v_sdpa = rearrange(v_comb, "s h d -> 1 h s d") + attn_out = F.scaled_dot_product_attention( + q_sdpa, + k_sdpa, + v_sdpa, + dropout_p=0.0, + is_causal=False, + scale=scale, + enable_gqa=True, + ) + output[q_start:q_end] = rearrange(attn_out, "1 h s d -> s h d").to(output.dtype) + + return output + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for Triton paged-attention kernel") +def test_paged_decode_triton_bf16_cache_matches_reference(): + torch.manual_seed(0) + device = torch.device("cuda") + + num_seqs = 4 + num_heads = 32 + num_kv_heads = 8 + head_dim = 128 + page_size = 32 + diffusion_block_size = 32 + + num_groups = num_heads // num_kv_heads + + # Per-seq query/new KV length (decode step) + q_lens = torch.full((num_seqs,), diffusion_block_size, dtype=torch.int32, device=device) + cu_q = _build_cu_seqlens(q_lens) + cu_k = cu_q.clone() + total_q = int(cu_q[-1].item()) + + # Context lengths (vary per seq) + context_lens = torch.tensor([0, 17, 63, 128], dtype=torch.int32, device=device) + max_ctx = int(context_lens.max().item()) + max_seq_blocks = (max_ctx + page_size - 1) // page_size + num_page_blocks = num_seqs * max_seq_blocks + + # Assign each seq its own contiguous pages + block_tables = torch.full((num_seqs, max_seq_blocks), -1, dtype=torch.int32, device=device) + for s in range(num_seqs): + for b in range(max_seq_blocks): + block_tables[s, b] = s * max_seq_blocks + b + + q = torch.randn((total_q, num_heads, head_dim), device=device, dtype=torch.bfloat16) + k = torch.randn((total_q, num_kv_heads, head_dim), device=device, dtype=torch.bfloat16) + v = torch.randn_like(k) + + k_cache = torch.randn((num_page_blocks, page_size, num_kv_heads, head_dim), device=device, 
dtype=torch.bfloat16) + v_cache = torch.randn_like(k_cache) + + md = SimpleNamespace( + kv_cache_layout="unified", + block_tables=block_tables, + context_lens=context_lens, + cu_seqlens_q=cu_q, + max_seqlen_q=int(q_lens.max().item()), + page_block_size=page_size, + ) + scale = 1.0 / (head_dim**0.5) + + out = paged_attn_decode_unified_triton(q, k, v, k_cache, v_cache, md, softmax_scale=scale, fp8_cache=False) + ref = naive_sdpa_with_kvcache( + q, + k, + v, + k_cache, + v_cache, + block_tables, + context_lens, + cu_q, + cu_k, + scale, + num_groups, + page_size, + ) + + torch.testing.assert_close(out, ref, atol=1e-2, rtol=1e-2) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for Triton paged-attention kernel") +@pytest.mark.skipif(not _has_fp8(), reason="This torch build does not expose FP8 dtypes") +def test_paged_decode_triton_fp8_cache_matches_reference(): + torch.manual_seed(0) + device = torch.device("cuda") + + fp8_dtype = torch.float8_e4m3fn if hasattr(torch, "float8_e4m3fn") else torch.float8_e5m2 + + num_seqs = 2 + num_heads = 16 + num_kv_heads = 4 + head_dim = 128 + page_size = 32 + diffusion_block_size = 32 + num_groups = num_heads // num_kv_heads + + q_lens = torch.full((num_seqs,), diffusion_block_size, dtype=torch.int32, device=device) + cu_q = _build_cu_seqlens(q_lens) + cu_k = cu_q.clone() + total_q = int(cu_q[-1].item()) + + context_lens = torch.tensor([37, 55], dtype=torch.int32, device=device) + max_ctx = int(context_lens.max().item()) + max_seq_blocks = (max_ctx + page_size - 1) // page_size + num_page_blocks = num_seqs * max_seq_blocks + block_tables = torch.full((num_seqs, max_seq_blocks), -1, dtype=torch.int32, device=device) + for s in range(num_seqs): + for b in range(max_seq_blocks): + block_tables[s, b] = s * max_seq_blocks + b + + q = torch.randn((total_q, num_heads, head_dim), device=device, dtype=torch.bfloat16) + k = torch.randn((total_q, num_kv_heads, head_dim), device=device, dtype=torch.bfloat16) + v = torch.randn_like(k) + + # Build BF16 "true" cache values, then quantize to FP8 as (x / scale) -> fp8, with per-head scales. + k_cache_true = torch.randn((num_page_blocks, page_size, num_kv_heads, head_dim), device=device, dtype=torch.bfloat16) * 0.5 + v_cache_true = torch.randn_like(k_cache_true) * 0.5 + + eps = 1e-6 + k_absmax = k_cache_true.to(torch.float32).abs().amax(dim=(0, 1, 3)) + v_absmax = v_cache_true.to(torch.float32).abs().amax(dim=(0, 1, 3)) + fp8_max = 448.0 if fp8_dtype == torch.float8_e4m3fn else 57344.0 + k_scale = (k_absmax / fp8_max).clamp_min(eps).to(torch.float32) + v_scale = (v_absmax / fp8_max).clamp_min(eps).to(torch.float32) + + k_cache_fp8 = (k_cache_true.to(torch.float32) / k_scale.view(1, 1, -1, 1)).to(fp8_dtype) + v_cache_fp8 = (v_cache_true.to(torch.float32) / v_scale.view(1, 1, -1, 1)).to(fp8_dtype) + + md = SimpleNamespace( + kv_cache_layout="unified", + block_tables=block_tables, + context_lens=context_lens, + cu_seqlens_q=cu_q, + max_seqlen_q=int(q_lens.max().item()), + page_block_size=page_size, + k_scale=k_scale, + v_scale=v_scale, + ) + scale = 1.0 / (head_dim**0.5) + + out = paged_attn_decode_unified_triton(q, k, v, k_cache_fp8, v_cache_fp8, md, softmax_scale=scale, fp8_cache=True) + + # Reference uses dequantized cache. 
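+    # The Triton kernel folds k_scale/v_scale into its attention math, so the reference
+    # first dequantizes the fp8 cache back to bf16 with the same per-head scales.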
+ k_cache_deq = (k_cache_fp8.float() * k_scale.view(1, 1, -1, 1)).to(torch.bfloat16) + v_cache_deq = (v_cache_fp8.float() * v_scale.view(1, 1, -1, 1)).to(torch.bfloat16) + ref = naive_sdpa_with_kvcache( + q, + k, + v, + k_cache_deq, + v_cache_deq, + block_tables, + context_lens, + cu_q, + cu_k, + scale, + num_groups, + page_size, + ) + + torch.testing.assert_close(out, ref, atol=2e-2, rtol=2e-2) + From f6d0fa296e48eae387b63f16b1528061c5e7a877 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 25 Jan 2026 13:36:40 +0000 Subject: [PATCH 07/10] refactor: remove CUDA Graph blockers and simplify linear quantization strategies - Remove all .item() calls in LinearBase hot paths (GPU->CPU sync breaks graph capture) - Add Python-side meta cache (_offline_quant_*_py, _gptq_is_shuffled_py, etc.) - Use in-place fill_() + Python mirrors for state updates - Simplify linear quantization strategies for future CUDA Graph support - Remove fast_path checks and redundant branching in linear_marlin_int8_w8a16 - Remove fast_path in linear_int8_w8a8 (unified vLLM path) - Simplify linear_gptq_w4a16 (direct torch.ops._C.gptq_gemm call) - Make linear_fp8_w8a16 use explicit quant_scales parameter - Fix FP8 weight layout: do not force contiguous for transpose-view (KxN stride0==1) - Remove profiler record_function wrappers (graph-friendly) Net: -129 lines, cleaner codebase ready for CUDA Graph capture --- diffulex/layer/linear.py | 91 +++++--- .../strategies/linear_fp8_w8a16.py | 8 +- .../strategies/linear_gptq_w4a16.py | 57 ++--- .../strategies/linear_int8_w8a8.py | 53 +---- .../strategies/linear_marlin_int8_w8a16.py | 217 +++++------------- profile/torch_d2f_profiler.py | 5 + 6 files changed, 151 insertions(+), 280 deletions(-) diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index e3581e9..5cc4b6d 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -134,19 +134,31 @@ def __init__( self.register_buffer("awq_marlin_zp", torch.empty(0, dtype=torch.int32), persistent=False) self.register_buffer("awq_marlin_workspace", torch.empty(0, dtype=torch.int32), persistent=False) + # ---- Python-side meta cache (CUDA Graph friendly) ---- + # Avoid `.item()` on CUDA tensors in hot paths (it introduces GPU->CPU sync and breaks graph capture). 
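+        # Convention: each `*_py` attribute below mirrors a buffer of the same name (minus
+        # the suffix); the buffers are still updated in place via fill_(), while hot-path
+        # branching reads only these Python values.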
+ self._weight_is_quantized_py: bool = False + # 0=none, 1=gptq, 2=awq + self._offline_quant_format_py: int = 0 + self._offline_quant_bits_py: int = 0 + self._offline_quant_group_size_py: int = 128 + self._offline_quant_out_features_py: int = 0 + self._offline_quant_in_features_py: int = 0 + self._gptq_is_shuffled_py: bool = False + self._gptq_marlin_is_prepared_py: bool = False + self._awq_marlin_is_prepared_py: bool = False + def has_quantized_weight(self) -> bool: - return bool(self._weight_is_quantized.item()) and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 + return self._weight_is_quantized_py and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 def has_offline_quantized_weight(self) -> bool: """Check if offline quantized weights (GPTQ/AWQ) are present.""" - format_val = int(self._offline_quant_format.item()) if self._offline_quant_format.numel() > 0 else 0 - if format_val == 1: # GPTQ + if self._offline_quant_format_py == 1: # GPTQ return ( self.gptq_qweight.numel() > 0 and self.gptq_qzeros.numel() > 0 and self.gptq_scales.numel() > 0 ) - elif format_val == 2: # AWQ + elif self._offline_quant_format_py == 2: # AWQ return ( self.awq_qweight.numel() > 0 and self.awq_qzeros.numel() > 0 @@ -224,6 +236,8 @@ def _infer_module_device() -> torch.device: bits = 32 // pack_factor if format == "awq" and bits != 4: raise ValueError(f"AWQ 目前仅支持 4-bit(pack_factor=8),当前推断 bits={bits} (pack_factor={pack_factor})") + # Cache meta as Python primitives (graph-friendly). + self._offline_quant_bits_py = int(bits) # Record bits for downstream kernels (esp. marlin path). self._offline_quant_bits = torch.tensor(bits, dtype=torch.int32, device=module_device) @@ -305,6 +319,8 @@ def _infer_module_device() -> torch.device: self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) self._offline_quant_format = torch.tensor(1, dtype=torch.int8, device=module_device) self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + self._offline_quant_format_py = 1 + self._gptq_is_shuffled_py = False else: # AWQ self.awq_qweight = qweight self.awq_qzeros = qzeros @@ -316,6 +332,8 @@ def _infer_module_device() -> torch.device: self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) self._offline_quant_format = torch.tensor(2, dtype=torch.int8, device=module_device) self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + self._offline_quant_format_py = 2 + self._gptq_is_shuffled_py = False # Reset marlin-prep caches (weights may have changed / moved). self._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) @@ -334,6 +352,12 @@ def _infer_module_device() -> torch.device: self._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32, device=module_device) self._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) self._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) + # Python meta mirrors. 
+ self._offline_quant_group_size_py = int(group_size) + self._offline_quant_out_features_py = int(out_features) + self._offline_quant_in_features_py = int(in_features) + self._gptq_marlin_is_prepared_py = False + self._awq_marlin_is_prepared_py = False # Drop bf16 weight Parameter if present (to free memory) if "weight" in self._parameters: @@ -342,13 +366,11 @@ def _infer_module_device() -> torch.device: def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: """Prepare vLLM GPTQ weights on first use (required gptq_shuffle).""" - if self._offline_quant_format.numel() == 0: - return - if int(self._offline_quant_format.item()) != 1: + if self._offline_quant_format_py != 1: return if self.gptq_qweight.numel() == 0: return - if self._gptq_is_shuffled.numel() > 0 and bool(self._gptq_is_shuffled.item()): + if self._gptq_is_shuffled_py: return # Lazy import to avoid pulling vLLM unless GPTQ offline weights are used. @@ -373,7 +395,7 @@ def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: # Infer weight_bits from packed qweight shape to support GPTQ W2/W4/W8. # qweight: [K/pack_factor, N], where pack_factor = 32 / weight_bits. - in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else None + in_features = int(self._offline_quant_in_features_py) if in_features is None or in_features <= 0: raise RuntimeError("GPTQ offline 权重已加载,但无法推断 in_features 以计算 weight_bits。") if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: @@ -389,20 +411,20 @@ def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: ) weight_bits = 32 // pack_factor ops.gptq_shuffle(self.gptq_qweight, g_idx, weight_bits) - self._gptq_is_shuffled = torch.tensor(True, dtype=torch.bool, device=x.device) + # Do NOT create new tensors on hot paths; update in-place + python mirror. + self._gptq_is_shuffled.fill_(True) + self._gptq_is_shuffled_py = True def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: """Prepare vLLM GPTQ Marlin weights on first use (repack + permute scales/zp). IMPORTANT: This path must NOT call `gptq_shuffle` (that is specific to gptq_gemm/exllama). """ - if self._offline_quant_format.numel() == 0: - return - if int(self._offline_quant_format.item()) != 1: + if self._offline_quant_format_py != 1: return if self.gptq_qweight.numel() == 0: return - if self._gptq_marlin_is_prepared.numel() > 0 and bool(self._gptq_marlin_is_prepared.item()): + if self._gptq_marlin_is_prepared_py: return try: @@ -425,9 +447,9 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: "请确保模型与输入在同一设备。" ) - in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 - out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 - group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + in_features = int(self._offline_quant_in_features_py) + out_features = int(self._offline_quant_out_features_py) + group_size = int(self._offline_quant_group_size_py) if in_features <= 0 or out_features <= 0: raise RuntimeError( f"GPTQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" @@ -436,7 +458,7 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: # Determine weight_bits. # - Standard GPTQ layout: infer from qweight K packing. 
# - Marlin-exported layout: bits cannot be inferred from qweight shape; use recorded bits. - weight_bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + weight_bits = int(self._offline_quant_bits_py) if weight_bits <= 0: if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: raise RuntimeError( @@ -503,17 +525,16 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: # Use empty zp to keep has_zp=False in the kernel. self.gptq_marlin_zp = marlin_make_empty_g_idx(device) - self._gptq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + self._gptq_marlin_is_prepared.fill_(True) + self._gptq_marlin_is_prepared_py = True def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: """Prepare vLLM AWQ Marlin weights on first use (repack + permute scales/zp).""" - if self._offline_quant_format.numel() == 0: - return - if int(self._offline_quant_format.item()) != 2: + if self._offline_quant_format_py != 2: return if self.awq_qweight.numel() == 0: return - if self._awq_marlin_is_prepared.numel() > 0 and bool(self._awq_marlin_is_prepared.item()): + if self._awq_marlin_is_prepared_py: return try: @@ -535,9 +556,9 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: "请确保模型与输入在同一设备。" ) - in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 - out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 - group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + in_features = int(self._offline_quant_in_features_py) + out_features = int(self._offline_quant_out_features_py) + group_size = int(self._offline_quant_group_size_py) if in_features <= 0 or out_features <= 0: raise RuntimeError( f"AWQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" @@ -579,7 +600,8 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: is_a_8bit=False, ).contiguous() - self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + self._awq_marlin_is_prepared.fill_(True) + self._awq_marlin_is_prepared_py = True def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: torch.Tensor) -> None: # Support: @@ -617,6 +639,8 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to # FP8 W8A16 uses float32 scales if weight_format in ("fp8_e4m3", "fp8_e5m2") and act_format == "bf16": scale_dtype = torch.float32 + # Keep KxN transpose-view layout (do NOT force contiguous) for vLLM FP8 kernels. + force_weight_contig = False # W8A8 int8 uses float32 [1, N] weight scales in vLLM cutlass_scaled_mm path. elif weight_format == "int8" and act_format == "int8": scale_dtype = torch.float32 @@ -644,6 +668,7 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to # 1xN view for fused kernels expecting 2D scales. 
self.quant_scales_1xn = quant_scales if quant_scales.dim() == 2 else quant_scales.view(1, -1) self._weight_is_quantized.fill_(True) + self._weight_is_quantized_py = True def _maybe_promote_weight_to_quantized_at_runtime( self, @@ -744,9 +769,9 @@ def _get_linear_strategy(self): def _offline_meta(self) -> tuple[int, int, int]: """Return (out_features, in_features, group_size) for offline GPTQ/AWQ.""" return ( - int(self._offline_quant_out_features.item()), - int(self._offline_quant_in_features.item()), - int(self._offline_quant_group_size.item()), + int(self._offline_quant_out_features_py), + int(self._offline_quant_in_features_py), + int(self._offline_quant_group_size_py), ) def _infer_gptq_weight_bits(self, *, in_features: int) -> int: @@ -756,7 +781,7 @@ def _infer_gptq_weight_bits(self, *, in_features: int) -> int: - use recorded bits (e.g., marlin-exported layouts), - otherwise infer from qweight packing. """ - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + bits = int(self._offline_quant_bits_py) if bits > 0: return bits if self.gptq_qweight.numel() == 0: @@ -783,7 +808,7 @@ def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: if strategy is None: raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) + format_val = int(self._offline_quant_format_py) weight_format = getattr(strategy, "linear_weight_format", None) out_features, in_features, group_size = self._offline_meta() @@ -887,7 +912,7 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if weight_format == "awq": # AWQ is 4-bit only in vLLM; bits stored in _offline_quant_bits. - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 4 + bits = int(self._offline_quant_bits_py) if int(self._offline_quant_bits_py) > 0 else 4 pack_factor = 32 // max(1, bits) return strategy.linear_forward( x, diff --git a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py index 85048d8..b25cf99 100644 --- a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py @@ -108,12 +108,10 @@ def linear_forward( bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + quant_scales: Optional[torch.Tensor] = None, + out_features: Optional[int] = None, ) -> torch.Tensor: - _ = quant_kind - from vllm.platforms import current_platform # type: ignore - - quant_scales = kwargs.get("quant_scales", None) + _ = quant_kind, out_features if weight is not None and quant_scales is not None: # Expected: weight is fp8 K×N tensor (transpose-view is fine). q_kn = weight.to(device=x.device) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index f0a7a98..95e5b9e 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -112,65 +112,34 @@ def linear_forward( raise RuntimeError("GPTQ offline weights missing packed tensors and bf16 weight is not present.") return F.linear(x, weight, bias) + if weight_bits <= 0: + raise RuntimeError("GPTQ requires explicit weight_bits (>0) for the CUDA kernel path.") + # vLLM GPTQ kernels expect FP16 activations. 
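+        # The result is cast back to the caller's activation dtype before returning (see the final return below).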
x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + x2 = x_in.reshape(-1, x_in.shape[-1]) if x_in.dim() != 2 else x_in + if not x2.is_contiguous(): + x2 = x2.contiguous() - # ---- Fast path ---- - if ( - x_in.dim() == 2 - and x_in.is_contiguous() - and qweight.device == x.device - and qzeros.device == x.device - and scales.device == x.device - and qweight.dtype == torch.int32 - and qzeros.dtype == torch.int32 - and scales.dtype == torch.float16 - and qweight.is_contiguous() - and qzeros.is_contiguous() - and scales.is_contiguous() - and weight_bits > 0 - ): - if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): - g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) - else: - # Prefer already-correct dtype/device to avoid per-call copies. - g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) - n = int(out_features) if out_features is not None else int(qweight.shape[-1]) - output = torch.ops._C.gptq_gemm( - x_in, - qweight, - qzeros, - scales, - g_idx_t, - True, - bool(use_v2_format), - int(weight_bits), - ) - if bias is not None: - output.add_(bias.to(dtype=output.dtype)) - # Output is [M,N] - return output.to(dtype=x.dtype) if output.dtype != x.dtype else output - - out_shape = x.shape[:-1] + (int(out_features) if out_features is not None else int(qweight.shape[-1]),) - reshaped_x = x_in.reshape(-1, x_in.shape[-1]) - if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + if g_idx is None or g_idx.numel() == 0: g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) else: - g_idx_t = g_idx.to(device=x.device, dtype=torch.int) + g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) - output = ops.gptq_gemm( - reshaped_x, + output = torch.ops._C.gptq_gemm( + x2, qweight, qzeros, scales, g_idx_t, True, # use_exllama bool(use_v2_format), - int(weight_bits) if weight_bits > 0 else 4, + int(weight_bits), ) if bias is not None: output.add_(bias.to(dtype=output.dtype)) + + out_shape = x.shape[:-1] + (int(out_features) if out_features is not None else int(qweight.shape[-1]),) output = output.reshape(out_shape) - # Keep output dtype consistent with input activations for downstream layers. return output.to(dtype=x.dtype) if output.dtype != x.dtype else output diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py index ae62b64..ba07440 100644 --- a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py @@ -38,12 +38,6 @@ def __init__(self) -> None: super().__init__() # Cache: id(weight) -> (qweight_int8 [N,K], w_scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - self._ops_available: bool = bool( - _vllm_ops is not None - and hasattr(torch.ops, "_C") - and hasattr(torch.ops._C, "dynamic_scaled_int8_quant") - and hasattr(torch.ops._C, "cutlass_scaled_mm") - ) @property def name(self) -> str: @@ -115,44 +109,13 @@ def linear_forward( out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind + if _vllm_ops is None: + raise RuntimeError("vLLM custom ops are required for W8A8 (scaled_int8_quant / cutlass_scaled_mm).") - # ---- Fast path (decode hot path) ---- - # Preconditions are strict to minimize Python overhead. 
- # Expect: - # - qweight: int8 KxN with stride(0)==1 - # - w_scales: float32 [1,N], contiguous - if ( - self._ops_available - and _vllm_ops is not None - and x.dim() == 2 - and x.device.type == "cuda" - and x.dtype in (torch.bfloat16, torch.float16) - and x.is_contiguous() - and weight is not None - and weight.dtype == torch.int8 - and weight.device == x.device - and weight.stride(0) == 1 - and quant_scales is not None - and quant_scales.device == x.device - and quant_scales.dtype == torch.float32 - and quant_scales.dim() == 2 - and quant_scales.is_contiguous() - ): - m, _k = x.shape - # Optionally validate N to catch wrong metadata early. - if out_features is None or int(out_features) == int(quant_scales.shape[1]): - x_q = torch.empty((m, _k), device=x.device, dtype=torch.int8) - x_s = torch.empty((m, 1), device=x.device, dtype=torch.float32) - torch.ops._C.dynamic_scaled_int8_quant(x_q, x, x_s, None) - out = torch.empty((m, int(quant_scales.shape[1])), device=x.device, dtype=x.dtype) - torch.ops._C.cutlass_scaled_mm(out, x_q, weight, x_s, quant_scales, bias) - return out - - # If weight already quantized by LinearBase.load-time quantization. + # Weight/scales: prefer load-time quantized buffers. if weight is not None and weight.dtype == torch.int8 and quant_scales is not None: - # Expected: qweight is K×N int8 (may be non-contiguous), quant_scales is [1,N] fp32 qweight = weight - w_scales = quant_scales.to(dtype=torch.float32) + w_scales = quant_scales else: wid = id(weight) cached = self._weight_cache.get(wid) @@ -164,13 +127,15 @@ def linear_forward( else: qweight, w_scales = cached - # Flatten like torch.nn.functional.linear orig_shape = x.shape x2 = x.reshape(-1, x.shape[-1]) if x.dim() != 2 else x if x2.dtype not in (torch.bfloat16, torch.float16): x2 = x2.to(torch.bfloat16) - # dynamic per-token int8 quant + fused GEMM_DQ - x_q, x_s, _ = _vllm_ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) + if not x2.is_contiguous(): + x2 = x2.contiguous() + + # dynamic per-token int8 quant + fused GEMM+dequant + x_q, x_s, _ = _vllm_ops.scaled_int8_quant(x2, scale=None, azp=None, symmetric=True) y = _vllm_ops.cutlass_scaled_mm( x_q, qweight, diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index fe99904..c2ff1ce 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -35,15 +35,6 @@ def _allspark_is_available() -> bool: and hasattr(_vllm_ops, "allspark_repack_weight") ) - -def _allspark_w8a16_gemm(*args, **kwargs): - if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_w8a16_gemm"): - raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_w8a16_gemm`.") - # Narrow profiler range to isolate Python wrapper overhead vs kernel time. 
- with torch.profiler.record_function("w8a16/allspark_w8a16_gemm(pybind)"): - return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) - - def _allspark_repack_weight(b_qweight_kn: torch.Tensor, scales_1xn: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Repack KxN uint8 qweight + 1xN scales into (N_32,K) + (1,N_32) for AllSpark GEMM.""" if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_repack_weight"): @@ -262,158 +253,76 @@ def linear_forward( out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind - - # ---- Fast path (decode hot path) ---- - # Goal: make Python-side overhead close to a single custom-op call (+ optional bias add). - # Preconditions are intentionally strict; otherwise we fall back to the fully-checked path. - # - # Notes: - # - We call `_vllm_ops.allspark_w8a16_gemm` directly to avoid extra Python wrapper overhead. - # - We require `quant_scales` already in 1xN contiguous layout (LinearBase provides this). - if ( - self._allspark_available - and _vllm_ops is not None - and x.dim() == 2 - and x.device.type == "cuda" - and x.dtype == torch.bfloat16 - and x.is_contiguous() - and weight is not None - and weight.dtype in (torch.uint8, torch.int8) - and weight.is_contiguous() - and quant_scales is not None - and quant_scales.dim() == 2 - and quant_scales.is_contiguous() - and out_features is not None - ): - # Minimal shape checks (avoid slow/branchy fallback). - m, k = x.shape - n_32, k_w = weight.shape - if k_w == k and (k & 15) == 0 and 0 < int(out_features) <= int(n_32): - sm_count, sm_version = self._get_sm_info(x.device) - y = _vllm_ops.allspark_w8a16_gemm( - x, - weight, - quant_scales, - None, # b_qzeros - int(out_features), - -1, # group_size (only supports -1) - sm_count, - sm_version, - self._cublas_m_thr, - False, # has_zp - True, # n32k16_reorder - ) - if bias is not None: - y = y + bias - return y - - # Handle >2D like torch.nn.functional.linear: flatten then reshape back. - with torch.profiler.record_function("w8a16/reshape_input"): - orig_shape = x.shape - if x.dim() == 1: - x2 = x.unsqueeze(0) - elif x.dim() == 2: - x2 = x - else: - x2 = x.reshape(-1, x.shape[-1]) - - # Load-time quantized module path: weight is uint8/int8 buffer and scales provided. - with torch.profiler.record_function("w8a16/select_qweight_scales"): - if weight is not None and weight.dtype in (torch.uint8, torch.int8): - if quant_scales is None: - raise ValueError("quant_scales is required when weight is quantized") - qweight = weight - scales = quant_scales - else: - # Lazy cache for bf16 weights (not expected in steady-state, but keep for safety). - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None or cached[0].device != x2.device: - qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) - self._weight_cache[weight_id] = (qweight, scales) - else: - qweight, scales = cached - - # If fused kernel isn't available, fall back to BF16 only if original weight exists; - # otherwise fail fast (do NOT dequantize a full matrix, which is memory-prohibitive). - if not self._allspark_available: + if not self._allspark_available or _vllm_ops is None: + # correctness fallback only when bf16 weight exists if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): return F.linear(x, weight, bias) raise RuntimeError( - "vLLM AllSpark W8A16 fused kernel is unavailable, and bf16 weight is not present. " + "vLLM AllSpark W8A16 fused kernel is unavailable. 
" "Please ensure vLLM custom ops are installed and loadable (`import vllm._custom_ops`)." ) - # AllSpark kernel requires CUDA and contiguous inputs. - with torch.profiler.record_function("w8a16/device_dtype_checks"): - if x2.device.type != "cuda": - return self._fallback(x, weight, qweight, scales, bias) - - if x2.dtype != torch.bfloat16: - x2 = x2.to(dtype=torch.bfloat16) - - # Shape checks: x2 [M,K], qweight [N_32align,K] - with torch.profiler.record_function("w8a16/shape_checks"): - m, k = x2.shape - n_32, k_w = qweight.shape - if k_w != k: - return self._fallback(x, weight, qweight, scales, bias) - if k % 16 != 0: - return self._fallback(x, weight, qweight, scales, bias) - - # Recover real N from module bias/metadata if available; default to n_32. - # In Diffulex, LinearBase stores output_size; but strategy doesn't receive module. - # So we infer N from bias if present else from scales length (can be N_32align). - with torch.profiler.record_function("w8a16/infer_n_and_sm"): - if out_features is not None: - n = int(out_features) - else: - # Backward compatible fallback. - n = int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32)) - if n <= 0 or n > n_32: - n = n_32 - - sm_count, sm_version = self._get_sm_info(x2.device) - cublas_thr = self._cublas_m_thr - - # vLLM allspark expects scales as 1xN (or equivalent contiguous view). - # NOTE: reshape/view doesn't allocate; only materialize contiguous copies when needed. - with torch.profiler.record_function("w8a16/prepare_contiguous_and_scales"): - if not x2.is_contiguous(): - x2 = x2.contiguous() - # qweight/scales are made contiguous at load-time (`LinearBase.set_quantized_weight`) - # and by `quantize_weight_for_kernel` return values. - if scales.dim() == 2: - scales_1xn = scales - else: - scales_1xn = scales.view(1, -1) - - with torch.profiler.record_function("w8a16/call_fused_gemm"): - y2 = _allspark_w8a16_gemm( - x2, - qweight, - scales_1xn, - None, # b_qzeros - n, - -1, # group_size (only supports -1) - sm_count, - sm_version, - cublas_thr, - False, # has_zp - True, # n32k16_reorder - ) - if bias is not None: - y2 = y2 + bias - - # Reshape back - with torch.profiler.record_function("w8a16/reshape_output"): - if x.dim() == 1: - y = y2.squeeze(0) - elif x.dim() == 2: - y = y2 + orig_shape = x.shape + x2 = x.reshape(-1, x.shape[-1]) if x.dim() != 2 else x + if x2.device.type != "cuda": + if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): + return F.linear(x, weight, bias) + raise RuntimeError("AllSpark W8A16 requires CUDA inputs.") + + if x2.dtype != torch.bfloat16: + x2 = x2.to(dtype=torch.bfloat16) + if not x2.is_contiguous(): + x2 = x2.contiguous() + + # Load-time quantized module path: weight is uint8/int8 buffer and scales provided. + if weight is not None and weight.dtype in (torch.uint8, torch.int8): + if quant_scales is None: + raise ValueError("quant_scales is required when weight is quantized") + qweight = weight + scales = quant_scales + else: + # Safety net for bf16 weights (should be rare in steady-state). 
+ weight_id = id(weight) + cached = self._weight_cache.get(weight_id) + if cached is None or cached[0].device != x2.device: + qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) + self._weight_cache[weight_id] = (qweight, scales) else: - y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) - return y + qweight, scales = cached + + m, k = x2.shape + n_32, k_w = qweight.shape + if k_w != k or (k & 15) != 0: + if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): + y = F.linear(x, weight, bias) + return y + raise RuntimeError(f"AllSpark W8A16 requires K%16==0 and matching K. Got x.K={k}, w.K={k_w}.") + + n = int(out_features) if out_features is not None else (int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32))) + n = n_32 if (n <= 0 or n > n_32) else n + scales_1xn = scales if scales.dim() == 2 else scales.view(1, -1) + + sm_count, sm_version = self._get_sm_info(x2.device) + y2 = _vllm_ops.allspark_w8a16_gemm( + x2, + qweight, + scales_1xn, + None, # b_qzeros + n, + -1, # group_size (only supports -1) + sm_count, + sm_version, + self._cublas_m_thr, + False, # has_zp + True, # n32k16_reorder + ) + if bias is not None: + y2 = y2 + bias + if orig_shape == x2.shape: + return y2 + if x.dim() == 1: + return y2.squeeze(0) + return y2.reshape(*orig_shape[:-1], y2.shape[-1]) # NOTE: We intentionally do not provide a generic dequantize+F.linear fallback for reordered weights. # It materializes a full bf16 matrix and is prone to OOM on large models. diff --git a/profile/torch_d2f_profiler.py b/profile/torch_d2f_profiler.py index 7688154..780ef2f 100644 --- a/profile/torch_d2f_profiler.py +++ b/profile/torch_d2f_profiler.py @@ -109,6 +109,9 @@ def main() -> None: # Engine settings (force single-process profiling by default) parser.add_argument("--tensor-parallel-size", type=int, default=1, help="建议保持 1,否则会 spawn 子进程导致采集不到 CUDA") parser.add_argument("--data-parallel-size", type=int, default=1) + # Distributed comm (avoid port conflicts with other local runs) + parser.add_argument("--master-addr", type=str, default="localhost") + parser.add_argument("--master-port", type=int, default=2333) parser.add_argument("--gpu-memory-utilization", type=float, default=0.30) parser.add_argument("--max-model-len", type=int, default=1024) @@ -171,6 +174,8 @@ def main() -> None: enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size, data_parallel_size=args.data_parallel_size, + master_addr=args.master_addr, + master_port=args.master_port, gpu_memory_utilization=args.gpu_memory_utilization, max_model_len=args.max_model_len, max_num_batched_tokens=max(1024, args.max_model_len), From 7fba595c2189b196bb9a44ae20a7fe88f90cde72 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Mon, 26 Jan 2026 03:37:36 +0000 Subject: [PATCH 08/10] perf: cache linear forward dispatch for CUDA Graph - Add per-layer ForwardPlan to pre-resolve bf16/quant/offline paths and reduce per-call Python branching. - Prefer direct torch.ops kernels (GPTQ/AWQ/Marlin) with static args for stable capture. - Fix D2F static CUDA graph capture/replay metadata (token buckets + cu_seqlens) and add profiler flag. 
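
For context, a minimal sketch of the cached-dispatch idea follows. It is illustrative only: `PlanSig` and `CachedLinear` are simplified stand-ins for the real `_ForwardPlanSig` / `LinearBase` plan classes, and the real plans resolve quantized kernel paths (GPTQ/AWQ/Marlin) and freeze their static arguments rather than a plain bf16 GEMM.

    from dataclasses import dataclass
    from typing import Callable, Optional, Tuple
    import torch

    @dataclass(frozen=True)
    class PlanSig:
        device: str
        dtype: torch.dtype
        shape: Tuple[int, ...]

    class CachedLinear(torch.nn.Module):
        def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> None:
            super().__init__()
            self.weight, self.bias = weight, bias
            self._plan: Optional[Tuple[PlanSig, Callable[[torch.Tensor], torch.Tensor]]] = None

        def _build_plan(self, x: torch.Tensor):
            # Resolve the dispatch decision once; the real code would pick a concrete
            # quantized kernel here and bind its static arguments.
            sig = PlanSig(str(x.device), x.dtype, tuple(x.shape))
            w, b = self.weight, self.bias
            return sig, (lambda inp: torch.nn.functional.linear(inp, w, b))

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            sig = PlanSig(str(x.device), x.dtype, tuple(x.shape))
            if self._plan is None or self._plan[0] != sig:
                self._plan = self._build_plan(x)  # rebuilt on shape/dtype/device change
            return self._plan[1](x)

The property that matters for CUDA Graph capture is that a built plan closes over fixed tensors and Python scalars only, so invoking it performs no data-dependent Python branching and no GPU->CPU syncs.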
--- diffulex/layer/linear.py | 931 ++++++++++++++++++ .../block_diffusion/engine/model_runner.py | 10 + diffulex/strategy/d2f/engine/model_runner.py | 139 ++- .../fast_dllm_v2/engine/model_runner.py | 10 + .../strategies/linear_awq_w4a16.py | 17 +- .../strategies/linear_gptq_w4a16.py | 12 +- profile/torch_d2f_profiler.py | 9 +- 7 files changed, 1107 insertions(+), 21 deletions(-) diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index 5cc4b6d..9dbef0e 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -1,3 +1,6 @@ +from __future__ import annotations + +from dataclasses import dataclass from typing import Optional import torch @@ -13,6 +16,500 @@ def divide(numerator, denominator): return numerator // denominator +@dataclass +class _ForwardPlanSig: + """Signature for validating cached forward plans. + + We intentionally keep it small and Python-only so it is CUDA-graph friendly + (no `.item()` and no device sync). + """ + + device_type: str + device_index: int + x_dtype: torch.dtype + x_shape: tuple[int, ...] + has_bias: bool + mode: str # "bf16" | "quant" | "offline" + strategy_name: str + + +class _ForwardPlanBase: + sig: _ForwardPlanSig + + def __call__(self, x: torch.Tensor) -> torch.Tensor: # pragma: no cover + raise NotImplementedError + + +class _BF16Plan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._weight = weight + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return F.linear(x, self._weight, self._bias) + + +class _QuantInt8W8A16Plan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales_1xn: torch.Tensor, + out_features: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales_1xn = scales_1xn + self._out_features = int(out_features) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + self._qweight, + self._bias, + quant_kind=self._quant_kind, + quant_scales=self._scales_1xn, + out_features=self._out_features, + ) + + +class _QuantInt8W8A8Plan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales_1xn: torch.Tensor, + out_features: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales_1xn = scales_1xn + self._out_features = int(out_features) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + self._qweight, + self._bias, + quant_kind=self._quant_kind, + quant_scales=self._scales_1xn, + out_features=self._out_features, + ) + + +class _QuantGenericPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + weight: torch.Tensor, + scales: torch.Tensor, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._weight = weight + self._scales = scales + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return 
self._strategy.linear_forward( + x, + self._weight, + self._bias, + quant_kind=self._quant_kind, + quant_scales=self._scales, + ) + + +class _OfflineGPTQPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + g_idx: torch.Tensor, + weight_bits: int, + out_features: int, + in_features: int, + group_size: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._g_idx = g_idx + self._weight_bits = int(weight_bits) + self._out_features = int(out_features) + self._in_features = int(in_features) + self._group_size = int(group_size) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + gptq_qweight=self._qweight, + gptq_qzeros=self._qzeros, + gptq_scales=self._scales, + gptq_g_idx=self._g_idx, + weight_bits=self._weight_bits, + use_v2_format=False, + out_features=self._out_features, + in_features=self._in_features, + group_size=self._group_size, + ) + + +class _OfflineAWQPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + pack_factor: int, + out_features: int, + in_features: int, + group_size: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._pack_factor = int(pack_factor) + self._out_features = int(out_features) + self._in_features = int(in_features) + self._group_size = int(group_size) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + awq_qweight=self._qweight, + awq_qzeros=self._qzeros, + awq_scales=self._scales, + pack_factor=self._pack_factor, + out_features=self._out_features, + in_features=self._in_features, + group_size=self._group_size, + ) + + +class _OfflineGPTQMarlinPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + g_idx: torch.Tensor, + g_idx_sort_indices: torch.Tensor, + workspace: torch.Tensor, + in_features: int, + out_features: int, + group_size: int, + weight_bits: int, + tp_dim: Optional[int], + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales = scales + self._zp = zp + self._g_idx = g_idx + self._g_idx_sort_indices = g_idx_sort_indices + self._workspace = workspace + self._in_features = int(in_features) + self._out_features = int(out_features) + self._group_size = int(group_size) + self._weight_bits = int(weight_bits) + self._tp_dim = tp_dim + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + qweight=self._qweight, + scales=self._scales, + zp=self._zp, + g_idx=self._g_idx, + 
g_idx_sort_indices=self._g_idx_sort_indices, + workspace=self._workspace, + in_features=self._in_features, + out_features=self._out_features, + group_size=self._group_size, + weight_bits=self._weight_bits, + tp_dim=self._tp_dim, + ) + + +class _OfflineAWQMarlinPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + workspace: torch.Tensor, + in_features: int, + out_features: int, + group_size: int, + tp_dim: Optional[int], + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales = scales + self._zp = zp + self._workspace = workspace + self._in_features = int(in_features) + self._out_features = int(out_features) + self._group_size = int(group_size) + self._tp_dim = tp_dim + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + qweight=self._qweight, + scales=self._scales, + zp=self._zp, + workspace=self._workspace, + in_features=self._in_features, + out_features=self._out_features, + group_size=self._group_size, + tp_dim=self._tp_dim, + ) + + +class _DirectGPTQGemmPlan(_ForwardPlanBase): + """Direct GPTQ GEMM plan (bypass Python strategy glue). + + This calls `torch.ops._C.gptq_gemm` directly with pre-resolved static args. + """ + + def __init__( + self, + *, + sig: _ForwardPlanSig, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + g_idx: torch.Tensor, + weight_bits: int, + out_features: int, + bias: Optional[torch.Tensor], + use_exllama: bool = True, + use_v2_format: bool = False, + cast_back_to_x_dtype: bool = True, + ) -> None: + self.sig = sig + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._g_idx = g_idx + self._weight_bits = int(weight_bits) + self._out_features = int(out_features) + self._bias = bias + self._use_exllama = bool(use_exllama) + self._use_v2_format = bool(use_v2_format) + self._cast_back = bool(cast_back_to_x_dtype) + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + # vLLM GPTQ kernels expect FP16 activations. 
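+        # The kernel also expects a 2D contiguous activation matrix, so leading dims are
+        # flattened here and the original shape is restored after the GEMM.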
+ x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + x2 = x_in.reshape(-1, x_in.shape[-1]) if x_in.dim() != 2 else x_in + if not x2.is_contiguous(): + x2 = x2.contiguous() + + out = torch.ops._C.gptq_gemm( + x2, + self._qweight, + self._qzeros, + self._scales, + self._g_idx, + self._use_exllama, + self._use_v2_format, + self._weight_bits, + ) + if self._bias is not None: + out.add_(self._bias.to(dtype=out.dtype)) + out = out.reshape(x.shape[:-1] + (self._out_features,)) + if self._cast_back and out.dtype != x.dtype: + return out.to(dtype=x.dtype) + return out + + +class _DirectAWQGemmPlan(_ForwardPlanBase): + """Direct AWQ GEMM plan (bypass Python strategy glue).""" + + def __init__( + self, + *, + sig: _ForwardPlanSig, + awq_gemm, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + out_features: int, + bias: Optional[torch.Tensor], + split_k_iters: int = 1, + cast_back_to_x_dtype: bool = True, + ) -> None: + self.sig = sig + self._awq_gemm = awq_gemm + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._out_features = int(out_features) + self._bias = bias + self._split_k_iters = int(split_k_iters) + self._cast_back = bool(cast_back_to_x_dtype) + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + # vLLM AWQ kernels expect FP16 activations. + x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) + if not reshaped_x.is_contiguous(): + reshaped_x = reshaped_x.contiguous() + + out = self._awq_gemm(reshaped_x, self._qweight, self._qzeros, self._scales, self._split_k_iters) + if self._bias is not None: + out.add_(self._bias.to(dtype=out.dtype)) + out = out.reshape(x.shape[:-1] + (self._out_features,)) + if self._cast_back and out.dtype != x.dtype: + return out.to(dtype=x.dtype) + return out + + +class _DirectMarlinGemmPlan(_ForwardPlanBase): + """Direct Marlin GEMM plan (bypass Python strategy glue). + + This calls `torch.ops._C.gptq_marlin_gemm` directly with pre-resolved static args. 
+ """ + + def __init__( + self, + *, + sig: _ForwardPlanSig, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + g_idx: torch.Tensor, + g_idx_sort_indices: torch.Tensor, + workspace: torch.Tensor, + wtype_id: int, + n: int, + is_k_full: bool, + use_atomic_add: bool, + marlin_bias: Optional[torch.Tensor], + cast_back_to_x_dtype: bool = True, + ) -> None: + self.sig = sig + self._qweight = qweight + self._scales = scales + self._zp = zp + self._g_idx = g_idx + self._g_idx_sort_indices = g_idx_sort_indices + self._workspace = workspace + self._wtype_id = int(wtype_id) + self._n = int(n) + self._is_k_full = bool(is_k_full) + self._use_atomic_add = bool(use_atomic_add) + self._bias = marlin_bias + self._cast_back = bool(cast_back_to_x_dtype) + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (int(self._n),) + m = int(reshaped_x.shape[0]) + k = int(reshaped_x.shape[1]) + out = torch.ops._C.gptq_marlin_gemm( + reshaped_x, + None, + self._qweight, + self._bias, + self._scales, + None, + None, + self._zp, + self._g_idx, + self._g_idx_sort_indices, + self._workspace, + self._wtype_id, + m, + int(self._n), + k, + self._is_k_full, + self._use_atomic_add, + True, # use_fp32_reduce + False, # is_zp_float + ) + out = out.reshape(out_shape) + if self._cast_back and out.dtype != x.dtype: + return out.to(dtype=x.dtype) + return out + + class LoRAMixin: """Mixin class to add LoRA support to existing linear layers.""" def __init_lora__(self, r: int = 0, lora_alpha: float = 1.0, lora_dropout: float = 0.0): @@ -147,6 +644,404 @@ def __init__( self._gptq_marlin_is_prepared_py: bool = False self._awq_marlin_is_prepared_py: bool = False + # ---- Forward plan cache (for static/graph-friendly dispatch) ---- + # When enabled, we build a per-layer callable plan that fixes the runtime + # dispatch decisions (bf16 vs quant vs offline, and which concrete kernel path). + # This removes heavy Python branching from the hot path and makes CUDA graph + # capture more stable. + self._forward_plan_enabled: bool = False + self._forward_plan: Optional[_ForwardPlanBase] = None + + def _invalidate_forward_plan(self) -> None: + self._forward_plan = None + + @staticmethod + def _device_index(device: torch.device) -> int: + if device.type == "cuda" and device.index is not None: + return int(device.index) + return -1 + + def enable_forward_plan(self, enabled: bool = True) -> None: + """Enable/disable cached forward plan dispatch for this layer.""" + self._forward_plan_enabled = bool(enabled) + if not self._forward_plan_enabled: + self._invalidate_forward_plan() + + def build_forward_plan_for_static(self, example_x: torch.Tensor, bias: Optional[torch.Tensor]) -> None: + """Build a cached forward plan for a fixed static decode-step shape. + + This should be called during warmup/capture. After building, `_forward_base` + can execute with minimal Python overhead by invoking the cached plan. + """ + strategy = self._get_linear_strategy() + # Ensure we don't keep bf16 and quant weights both resident. + self._maybe_promote_weight_to_quantized_at_runtime(example_x, strategy) + + device = example_x.device + dev_idx = self._device_index(device) + has_bias = bias is not None + strategy_name = getattr(strategy, "name", "") if strategy is not None else "" + + # Offline quantized weights have highest priority. 
+ if self.has_offline_quantized_weight(): + if strategy is None: + raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + weight_format = getattr(strategy, "linear_weight_format", None) + out_features, in_features, group_size = self._offline_meta() + sig = _ForwardPlanSig( + device_type=device.type, + device_index=dev_idx, + x_dtype=example_x.dtype, + x_shape=tuple(int(x) for x in example_x.shape), + has_bias=has_bias, + mode="offline", + strategy_name=strategy_name, + ) + + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(example_x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + # Use already-correct g_idx buffer (can be empty), moved once to the example device. + g_idx = self.gptq_g_idx + if g_idx.device != device: + g_idx = g_idx.to(device=device, dtype=torch.int) + + # Prefer direct torch.ops entry point to bypass Python strategy glue. + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_gemm"): + self._forward_plan = _DirectGPTQGemmPlan( + sig=sig, + qweight=self.gptq_qweight, + qzeros=self.gptq_qzeros, + scales=self.gptq_scales, + g_idx=g_idx, + weight_bits=bits, + out_features=out_features, + bias=bias, + use_exllama=True, + use_v2_format=False, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineGPTQPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.gptq_qweight, + qzeros=self.gptq_qzeros, + scales=self.gptq_scales, + g_idx=g_idx, + weight_bits=bits, + out_features=out_features, + in_features=in_features, + group_size=group_size, + bias=bias, + ) + return + + if weight_format == "awq": + bits = int(self._offline_quant_bits_py) if int(self._offline_quant_bits_py) > 0 else 4 + pack_factor = 32 // max(1, bits) + # Prefer direct torch.ops entry point to bypass Python strategy glue. + awq_gemm = None + try: + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"): + awq_gemm = torch.ops._C.awq_gemm + except Exception: + awq_gemm = None + + if awq_gemm is not None: + self._forward_plan = _DirectAWQGemmPlan( + sig=sig, + awq_gemm=awq_gemm, + qweight=self.awq_qweight, + qzeros=self.awq_qzeros, + scales=self.awq_scales, + out_features=out_features, + bias=bias, + split_k_iters=1, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineAWQPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.awq_qweight, + qzeros=self.awq_qzeros, + scales=self.awq_scales, + pack_factor=pack_factor, + out_features=out_features, + in_features=in_features, + group_size=group_size, + bias=bias, + ) + return + + if weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(example_x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + # Prefer direct torch.ops entry point to bypass Python strategy glue. + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_marlin_gemm"): + try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_is_k_full, + marlin_make_empty_g_idx, + should_use_atomic_add_reduce, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore + except Exception: + marlin_is_k_full = None # type: ignore + marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + if scalar_types is None: + # Fall back to the strategy path if vLLM marlin utils are unavailable. 
+ self._forward_plan = _OfflineGPTQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=self.gptq_marlin_g_idx, + g_idx_sort_indices=self.gptq_marlin_g_idx_sort_indices, + workspace=self.gptq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + weight_bits=bits, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + device = example_x.device + dev_key = self._device_index(device) + # Prefer already prepared tensors; if missing, use cached empties. + def _empty() -> torch.Tensor: + if marlin_make_empty_g_idx is not None: + return marlin_make_empty_g_idx(device) + return torch.empty((0,), device=device, dtype=torch.int32) + + g_idx = self.gptq_marlin_g_idx if self.gptq_marlin_g_idx.numel() > 0 else _empty() + g_idx_sort = ( + self.gptq_marlin_g_idx_sort_indices + if self.gptq_marlin_g_idx_sort_indices.numel() > 0 + else _empty() + ) + row_parallel = bool(self.tp_dim == 1) + has_g_idx = bool(g_idx.numel() > 0) + is_k_full = True if marlin_is_k_full is None else marlin_is_k_full(has_g_idx, row_parallel) + + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + reshaped_x = example_x.reshape(-1, example_x.shape[-1]) + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + use_atomic_add = bool( + should_use_atomic_add_reduce(m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype) + ) + + if bits == 4: + wtype = scalar_types.uint4b8 + elif bits == 8: + wtype = scalar_types.uint8b128 + else: + raise RuntimeError(f"gptq_marlin: unsupported weight_bits={bits} (expected 4 or 8)") + + self._forward_plan = _DirectMarlinGemmPlan( + sig=sig, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=g_idx, + g_idx_sort_indices=g_idx_sort, + workspace=self.gptq_marlin_workspace, + wtype_id=wtype.id, + n=out_features, + is_k_full=is_k_full, + use_atomic_add=use_atomic_add, + marlin_bias=marlin_bias, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineGPTQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=self.gptq_marlin_g_idx, + g_idx_sort_indices=self.gptq_marlin_g_idx_sort_indices, + workspace=self.gptq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + weight_bits=bits, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + if weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(example_x) + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_marlin_gemm"): + try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_make_empty_g_idx, + should_use_atomic_add_reduce, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore + except Exception: + marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + if scalar_types is None: + self._forward_plan = _OfflineAWQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + 
zp=self.awq_marlin_zp, + workspace=self.awq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + device = example_x.device + empty = ( + marlin_make_empty_g_idx(device) + if marlin_make_empty_g_idx is not None + else torch.empty((0,), device=device, dtype=torch.int32) + ) + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + reshaped_x = example_x.reshape(-1, example_x.shape[-1]) + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + use_atomic_add = bool( + should_use_atomic_add_reduce(m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype) + ) + + self._forward_plan = _DirectMarlinGemmPlan( + sig=sig, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + zp=self.awq_marlin_zp, + g_idx=empty, + g_idx_sort_indices=empty, + workspace=self.awq_marlin_workspace, + wtype_id=scalar_types.uint4.id, + n=out_features, + is_k_full=True, + use_atomic_add=use_atomic_add, + marlin_bias=marlin_bias, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineAWQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + zp=self.awq_marlin_zp, + workspace=self.awq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + # If a new offline strategy is added, fall back to the generic runtime dispatcher. + raise RuntimeError( + f"Offline quantized weight is present but strategy weight_format={weight_format!r} is not supported by forward plan." + ) + + # Online/load-time quantized weights. + if self.has_quantized_weight(): + if strategy is None: + raise RuntimeError("Quantized weight is present but no linear strategy is configured.") + sig = _ForwardPlanSig( + device_type=device.type, + device_index=dev_idx, + x_dtype=example_x.dtype, + x_shape=tuple(int(x) for x in example_x.shape), + has_bias=has_bias, + mode="quant", + strategy_name=strategy_name, + ) + if getattr(strategy, "name", "") == "linear_int8_w8a16": + self._forward_plan = _QuantInt8W8A16Plan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.quant_weight_int8, + scales_1xn=self.quant_scales_1xn, + out_features=self._forward_out_features, + bias=bias, + ) + return + if getattr(strategy, "name", "") == "linear_int8_w8a8": + self._forward_plan = _QuantInt8W8A8Plan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.quant_weight_int8, + scales_1xn=self.quant_scales_1xn, + out_features=self._forward_out_features, + bias=bias, + ) + return + self._forward_plan = _QuantGenericPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + weight=self.quant_weight_int8, + scales=self.quant_scales, + bias=bias, + ) + return + + # BF16 weights (no quant). 
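+        # Reached only when neither offline (GPTQ/AWQ) nor load-time quantized buffers are present.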
+ weight = getattr(self, "weight", None) + if weight is None: + raise RuntimeError("No quantized/offline weights are present but bf16 weight is missing.") + sig = _ForwardPlanSig( + device_type=device.type, + device_index=dev_idx, + x_dtype=example_x.dtype, + x_shape=tuple(int(x) for x in example_x.shape), + has_bias=has_bias, + mode="bf16", + strategy_name=strategy_name, + ) + self._forward_plan = _BF16Plan(sig=sig, weight=weight, bias=bias) + def has_quantized_weight(self) -> bool: return self._weight_is_quantized_py and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 @@ -364,6 +1259,9 @@ def _infer_module_device() -> torch.device: self._parameters.pop("weight", None) setattr(self, "weight", None) + # Offline weights changed; cached forward plan is no longer valid. + self._invalidate_forward_plan() + def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: """Prepare vLLM GPTQ weights on first use (required gptq_shuffle).""" if self._offline_quant_format_py != 1: @@ -669,6 +1567,8 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to self.quant_scales_1xn = quant_scales if quant_scales.dim() == 2 else quant_scales.view(1, -1) self._weight_is_quantized.fill_(True) self._weight_is_quantized_py = True + # Quant buffers changed; cached forward plan is no longer valid. + self._invalidate_forward_plan() def _maybe_promote_weight_to_quantized_at_runtime( self, @@ -879,6 +1779,37 @@ def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: """Unified forward dispatcher for bf16 / online quant / offline GPTQ/AWQ.""" + if getattr(self, "_forward_plan_enabled", False): + plan = getattr(self, "_forward_plan", None) + if plan is None: + self.build_forward_plan_for_static(x, bias) + plan = getattr(self, "_forward_plan", None) + if plan is not None: + sig = plan.sig + dev = x.device + dev_idx = self._device_index(dev) + if ( + sig.device_type == dev.type + and sig.device_index == dev_idx + and sig.x_dtype == x.dtype + and sig.x_shape == tuple(int(v) for v in x.shape) + and sig.has_bias == (bias is not None) + ): + return plan(x) + # Static mode but shape/dtype changed: rebuild once and retry. + self.build_forward_plan_for_static(x, bias) + plan = getattr(self, "_forward_plan", None) + if plan is not None: + sig = plan.sig + if ( + sig.device_type == dev.type + and sig.device_index == dev_idx + and sig.x_dtype == x.dtype + and sig.x_shape == tuple(int(v) for v in x.shape) + and sig.has_bias == (bias is not None) + ): + return plan(x) + strategy = self._get_linear_strategy() # Runtime safety net: ensure we don't keep bf16+quant weights both resident. self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) diff --git a/diffulex/strategy/block_diffusion/engine/model_runner.py b/diffulex/strategy/block_diffusion/engine/model_runner.py index cc53221..61a4f99 100644 --- a/diffulex/strategy/block_diffusion/engine/model_runner.py +++ b/diffulex/strategy/block_diffusion/engine/model_runner.py @@ -187,6 +187,16 @@ def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: @torch.inference_mode() def capture_cudagraph(self): + # Enable per-layer forward-plan dispatch to stabilize capture and minimize + # Python branching inside the captured region. 
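+        # Failures here are intentionally non-fatal: if the import or any layer raises,
+        # capture proceeds with the default per-call dispatcher instead of a cached plan.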
+ try: + from diffulex.layer.linear import LinearBase + for m in self.model.modules(): + if isinstance(m, LinearBase): + m.enable_forward_plan(True) + except Exception: + pass + set_warming_up(True) config = self.config hf_config = config.hf_config diff --git a/diffulex/strategy/d2f/engine/model_runner.py b/diffulex/strategy/d2f/engine/model_runner.py index c06fbcd..839c848 100644 --- a/diffulex/strategy/d2f/engine/model_runner.py +++ b/diffulex/strategy/d2f/engine/model_runner.py @@ -292,20 +292,34 @@ def get_step(diff_blk, begin_idx): def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool): if is_prefill or self.enforce_eager or input_ids.size(0) > 512: return self.model.compute_logits(self.model(input_ids, positions)) - bs = input_ids.size(0) + num_tokens = input_ids.size(0) context = fetch_d2f_attn_metadata() - graph = self.graphs[next(x for x in self.graph_bs if x >= bs)] + bucket_tokens = next(x for x in self.graph_bs if x >= num_tokens) + graph = self.graphs[bucket_tokens] graph_vars = self.graph_vars for key, value in graph_vars.items(): if key != "outputs": value.zero_() - graph_vars["input_ids"][:bs] = input_ids - graph_vars["positions"][:bs] = positions - graph_vars["slot_mapping"][:bs] = context.slot_mapping - graph_vars["context_lens"][:bs] = context.context_lens - graph_vars["block_tables"][:bs, : context.block_tables.size(1)] = context.block_tables + graph_vars["input_ids"][:num_tokens] = input_ids + graph_vars["positions"][:num_tokens] = positions + graph_vars["slot_mapping"][:num_tokens] = context.slot_mapping + num_seqs = int(context.context_lens.numel()) + graph_vars["context_lens"][:num_seqs] = context.context_lens + # cu_seqlens are required by unified paged-attn decode kernels. + if getattr(context, "cu_seqlens_q", None) is not None: + graph_vars["cu_seqlens_q"][: num_seqs + 1] = context.cu_seqlens_q + bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) + if bucket_num_seqs > num_seqs: + graph_vars["cu_seqlens_q"][num_seqs + 1 : bucket_num_seqs + 1].fill_(int(num_tokens)) + if getattr(context, "cu_seqlens_k", None) is not None: + graph_vars["cu_seqlens_k"][: num_seqs + 1] = context.cu_seqlens_k + bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) + if bucket_num_seqs > num_seqs: + last_k = context.cu_seqlens_k[num_seqs] + graph_vars["cu_seqlens_k"][num_seqs + 1 : bucket_num_seqs + 1] = last_k + graph_vars["block_tables"][:num_seqs, : context.block_tables.size(1)] = context.block_tables graph.replay() - return self.model.compute_logits(graph_vars["outputs"][:bs]) + return self.model.compute_logits(graph_vars["outputs"][:num_tokens]) def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: input_ids, positions = self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs) @@ -317,8 +331,107 @@ def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: @torch.inference_mode() def capture_cudagraph(self): - """ - TODO: Varlen decoding does not support CUDA graph capture yet. - Can be implemented, but requires drastically high overhead. - """ - raise NotImplementedError("CUDA graph capture for DiffusionLM is not implemented yet.") + # Static-mode CUDA graph capture for D2F decode. + # + # NOTE: + # - This matches `run_model()`'s replay protocol: we only overwrite + # input_ids/positions/slot_mapping/context_lens/block_tables per step. + # - Varlen mode is intentionally not supported here (assume static flow). 
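+        # - cu_seqlens_q/cu_seqlens_k are likewise refreshed before every replay (see
+        #   `run_model()` above), padded out to the captured bucket size when needed.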
+ from tqdm import tqdm + + # Enable per-layer forward-plan dispatch to stabilize capture and minimize + # Python branching inside the captured region. + try: + from diffulex.layer.linear import LinearBase + for m in self.model.modules(): + if isinstance(m, LinearBase): + m.enable_forward_plan(True) + except Exception: + pass + + set_warming_up(True) + config = self.config + hf_config = config.hf_config + diffusion_block_size = int(self.diffusion_block_size) + max_num_seqs = int(self.config.max_num_seqs) + # Graph path is only used when num_tokens <= 512. + max_num_seqs_for_graph = max(1, min(max_num_seqs, 512 // max(1, diffusion_block_size))) + max_num_tokens = max_num_seqs_for_graph * diffusion_block_size + max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size + + # Allocate graph buffers on the same device/dtype as the model. + try: + p0 = next(self.model.parameters()) + graph_device = p0.device + graph_dtype = p0.dtype + except StopIteration: + graph_device = torch.device("cuda") + graph_dtype = torch.float16 + + # Allocate max-size graph buffers. + input_ids = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) + positions = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) + slot_mapping = torch.zeros(max_num_tokens, dtype=torch.int32, device=graph_device) + context_lens = torch.zeros(max_num_seqs_for_graph, dtype=torch.int32, device=graph_device) + block_tables = torch.zeros(max_num_seqs_for_graph, max_num_blocks, dtype=torch.int32, device=graph_device) + outputs = torch.zeros(max_num_tokens, hf_config.hidden_size, dtype=graph_dtype, device=graph_device) + cu_seqlens_q = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) + cu_seqlens_k = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) + + # Capture bucketed graphs by num_tokens (bucketed by num_seqs * diffusion_block_size). + self.graph_bs = [] + seq_bs_list = [1, 2, 4, 8] + list(range(16, max_num_seqs_for_graph + 1, 16)) + for num_seqs in sorted(set([b for b in seq_bs_list if b <= max_num_seqs_for_graph] + [max_num_seqs_for_graph])): + self.graph_bs.append(int(num_seqs) * diffusion_block_size) + self.graphs = {} + self.graph_pool = None + + for num_tokens in tqdm(reversed(self.graph_bs), desc="Capturing CUDA graphs"): + num_seqs = int(num_tokens // diffusion_block_size) + graph = torch.cuda.CUDAGraph() + # Fill placeholder metadata with valid monotonic cu_seqlens to satisfy kernel assertions. + cu_seqlens_q[: num_seqs + 1] = ( + torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * diffusion_block_size + ) + # Use a conservative max-seqlen for K to keep shapes stable; values are overwritten before replay. + cu_seqlens_k[: num_seqs + 1] = ( + torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * int(config.max_model_len) + ) + context_lens[:num_seqs].fill_(int(config.max_model_len)) + # For static decode, use placeholder metadata tensors; per-step values are copied + # into `graph_vars` before replay. 
+ set_d2f_attn_metadata( + False, + slot_mapping=slot_mapping[:num_tokens], + context_lens=context_lens[:num_seqs], + cu_seqlens_q=cu_seqlens_q[: num_seqs + 1], + cu_seqlens_k=cu_seqlens_k[: num_seqs + 1], + max_seqlen_q=diffusion_block_size, + max_seqlen_k=int(config.max_model_len), + block_tables=block_tables[:num_seqs], + kv_cache_layout=self.config.kv_cache_layout, + need_kv_cache_store=True, + diffusion_block_size=self.diffusion_block_size, + decode_mode="static", + attn_type="full_attention", + ) + outputs[:num_tokens] = self.model(input_ids[:num_tokens], positions[:num_tokens]) # warmup + with torch.cuda.graph(graph, self.graph_pool): + outputs[:num_tokens] = self.model(input_ids[:num_tokens], positions[:num_tokens]) # capture + if self.graph_pool is None: + self.graph_pool = graph.pool() + self.graphs[num_tokens] = graph + torch.cuda.synchronize() + reset_d2f_attn_metadata() + + self.graph_vars = dict( + input_ids=input_ids, + positions=positions, + slot_mapping=slot_mapping, + context_lens=context_lens, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + block_tables=block_tables, + outputs=outputs, + ) + reset_warming_up() diff --git a/diffulex/strategy/fast_dllm_v2/engine/model_runner.py b/diffulex/strategy/fast_dllm_v2/engine/model_runner.py index f265c92..1f5f6c4 100644 --- a/diffulex/strategy/fast_dllm_v2/engine/model_runner.py +++ b/diffulex/strategy/fast_dllm_v2/engine/model_runner.py @@ -187,6 +187,16 @@ def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: @torch.inference_mode() def capture_cudagraph(self): + # Enable per-layer forward-plan dispatch to stabilize capture and minimize + # Python branching inside the captured region. + try: + from diffulex.layer.linear import LinearBase + for m in self.model.modules(): + if isinstance(m, LinearBase): + m.enable_forward_plan(True) + except Exception: + pass + set_warming_up(True) config = self.config hf_config = config.hf_config diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 22295fa..7090d59 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -32,7 +32,17 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): def __init__(self) -> None: super().__init__() - self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm")) + # Resolve the concrete kernel entry point once (avoid per-call dispatch). + awq_gemm = None + try: + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"): + awq_gemm = torch.ops._C.awq_gemm + except Exception: + awq_gemm = None + if awq_gemm is None and ops is not None and hasattr(ops, "awq_gemm"): + awq_gemm = ops.awq_gemm + self._awq_gemm = awq_gemm + self._ops_available: bool = bool(self._awq_gemm is not None) @property def name(self) -> str: @@ -114,10 +124,7 @@ def linear_forward( # Always use awq_gemm to avoid large temporary dequantized weight allocations. 
# vLLM API: awq_gemm(input, qweight, qzeros, scales, split_k_iters) split_k_iters = 1 - if reshaped_x.is_contiguous() and qweight.is_contiguous() and qzeros.is_contiguous() and scales.is_contiguous(): - out = torch.ops._C.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) - else: - out = ops.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) + out = self._awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) # type: ignore[misc] if bias is not None: out.add_(bias.to(dtype=out.dtype)) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index 95e5b9e..7adfd10 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -37,6 +37,8 @@ class LinearGPTQW4A16Strategy(LinearQuantizationStrategy): def __init__(self) -> None: super().__init__() self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_gemm")) + # Cache empty g_idx tensor per device to avoid per-call allocations. + self._empty_cache: dict[int, torch.Tensor] = {} @property def name(self) -> str: @@ -121,10 +123,16 @@ def linear_forward( if not x2.is_contiguous(): x2 = x2.contiguous() + device = x.device + dev_key = int(device.index) if device.type == "cuda" and device.index is not None else -1 if g_idx is None or g_idx.numel() == 0: - g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) + empty = self._empty_cache.get(dev_key) + if empty is None or empty.device != device: + empty = torch.empty((0,), device=device, dtype=torch.int) + self._empty_cache[dev_key] = empty + g_idx_t = empty else: - g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) + g_idx_t = g_idx if (g_idx.device == device and g_idx.dtype == torch.int) else g_idx.to(device=device, dtype=torch.int) output = torch.ops._C.gptq_gemm( x2, diff --git a/profile/torch_d2f_profiler.py b/profile/torch_d2f_profiler.py index 780ef2f..e8d36cb 100644 --- a/profile/torch_d2f_profiler.py +++ b/profile/torch_d2f_profiler.py @@ -106,6 +106,13 @@ def main() -> None: parser.add_argument("--linear-attn-act-dtype", type=str, default="bf16") parser.add_argument("--linear-mlp-act-dtype", type=str, default="bf16") + # CUDA Graph + parser.add_argument( + "--use-cudagraph", + action="store_true", + help="Enable CUDA Graph (only meaningful when decode_mode=static and shapes are stable); off by default so capture overhead does not skew profiling.", + ) + # Engine settings (force single-process profiling by default) parser.add_argument("--tensor-parallel-size", type=int, default=1, help="Keep this at 1; otherwise child processes are spawned and CUDA activity cannot be collected") parser.add_argument("--data-parallel-size", type=int, default=1) @@ -171,7 +178,7 @@ def main() -> None: use_lora=use_lora, model_name="dream", decoding_strategy="d2f", - enforce_eager=True, + enforce_eager=not args.use_cudagraph, tensor_parallel_size=args.tensor_parallel_size, data_parallel_size=args.data_parallel_size, master_addr=args.master_addr, From 0d511452ea43738bd8b55e111a514301ae7aac58 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Tue, 27 Jan 2026 04:25:22 +0000 Subject: [PATCH 09/10] Fix static+CUDA Graph mode and add benchmark configs - Fix tensor shape mismatch bug in static+CUDA Graph decode mode (model_runner.py) - Improve bucket selection logic for variable token counts - Add safety fallback when runtime batch exceeds captured capacity - Fix metadata buffer initialization and padding - Add new static mode benchmark 
configs: - awq_bf16kv_static.yml - gptq_marlin_w4_bf16kv_static.yml - gptq_marlin_w8_bf16kv_static.yml - Update quantization strategies and loader utilities - Update benchmark configurations for consistency --- diffulex/config.py | 3 +- diffulex/layer/linear.py | 9 ++- diffulex/strategy/d2f/engine/model_runner.py | 74 ++++++++++++------- diffulex/utils/loader.py | 16 ++++ .../strategies/linear_awq_w4a16.py | 38 ++++++++-- diffulex_bench/arg_parser.py | 7 ++ diffulex_bench/configs/awq_bf16kv_static.yml | 47 ++++++++++++ diffulex_bench/configs/awq_bf16kv_varlen.yml | 2 +- .../configs/awq_marlin_bf16kv_varlen.yml | 2 +- .../configs/bf16_bf16kv_distinct.yml | 2 +- diffulex_bench/configs/bf16_bf16kv_static.yml | 2 +- diffulex_bench/configs/bf16_bf16kv_varlen.yml | 2 +- .../configs/bf16_fp8kv_distinct.yml | 2 +- diffulex_bench/configs/bf16_fp8kv_static.yml | 2 +- diffulex_bench/configs/bf16_fp8kv_varlen.yml | 2 +- diffulex_bench/configs/dream_d2f_gsm8k.yml | 2 +- diffulex_bench/configs/example.yml | 2 +- diffulex_bench/configs/fp8_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/gptq_bf16kv_varlen.yml | 2 +- .../configs/gptq_bf16kv_varlen_tp2.yml | 2 +- .../configs/gptq_marlin_bf16kv_varlen.yml | 2 +- .../configs/gptq_marlin_w4_bf16kv_static.yml | 47 ++++++++++++ .../configs/gptq_marlin_w4_bf16kv_varlen.yml | 2 +- .../configs/gptq_marlin_w8_bf16kv_static.yml | 47 ++++++++++++ .../configs/gptq_marlin_w8_bf16kv_varlen.yml | 2 +- .../configs/gptq_w2_bf16kv_varlen.yml | 2 +- .../configs/gptq_w8_bf16kv_varlen.yml | 2 +- .../configs/w4a16_bf16kv_static.yml | 2 +- .../configs/w4a16_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w4a16_fp8kv_static.yml | 2 +- diffulex_bench/configs/w4a16_fp8kv_varlen.yml | 2 +- diffulex_bench/configs/w4a8_bf16kv_static.yml | 2 +- diffulex_bench/configs/w4a8_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w4a8_fp8kv_static.yml | 2 +- diffulex_bench/configs/w4a8_fp8kv_varlen.yml | 2 +- .../configs/w8a16_bf16kv_static.yml | 2 +- .../configs/w8a16_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w8a16_fp8kv_static.yml | 2 +- diffulex_bench/configs/w8a16_fp8kv_varlen.yml | 2 +- diffulex_bench/configs/w8a8_bf16kv_static.yml | 2 +- diffulex_bench/configs/w8a8_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w8a8_fp8kv_static.yml | 2 +- diffulex_bench/configs/w8a8_fp8kv_varlen.yml | 2 +- diffulex_bench/main.py | 22 ++++++ diffulex_kernel/python/kv_cache_kernels.py | 14 ++++ diffulex_legacy/config.py | 2 +- 46 files changed, 322 insertions(+), 72 deletions(-) create mode 100644 diffulex_bench/configs/awq_bf16kv_static.yml create mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml create mode 100644 diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml diff --git a/diffulex/config.py b/diffulex/config.py index 1086223..99f6c50 100755 --- a/diffulex/config.py +++ b/diffulex/config.py @@ -32,7 +32,8 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - master_port: int = 2333 + # Allow overriding to avoid port collisions in multi-run/CI environments. + master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "diffulex_shm" # Start device index for this TP group (set by DP launcher). 
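For reference, the AWQ call-order change in the linear.py and linear_awq_w4a16.py hunks below can be exercised on its own. The sketch is illustrative only: it assumes a vLLM build that ships either the Triton AWQ kernel (awq_gemm_triton) or the compiled torch.ops._C.awq_gemm op, and the helper names (resolve_awq_gemm, awq_linear) are hypothetical, not part of diffulex.

import torch

def resolve_awq_gemm():
    # Prefer the Triton kernel, then the compiled C++ op, mirroring the dispatch below.
    try:
        from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
        return awq_gemm_triton
    except Exception:
        pass
    if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"):
        return torch.ops._C.awq_gemm
    raise RuntimeError("No AWQ GEMM entry point available (need vLLM Triton or C++ ops).")

def awq_linear(x, qweight, scales, qzeros, bias=None, split_k_iters=1):
    # Both entry points take (input, qweight, scales, qzeros, split_k_iters).
    gemm = resolve_awq_gemm()
    out = gemm(x.reshape(-1, x.shape[-1]), qweight, scales, qzeros, split_k_iters)
    if bias is not None:
        out.add_(bias.to(dtype=out.dtype))
    return out.reshape(*x.shape[:-1], out.shape[-1])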
diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index 9dbef0e..fd16eb3 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -432,7 +432,9 @@ def __call__(self, x: torch.Tensor) -> torch.Tensor: if not reshaped_x.is_contiguous(): reshaped_x = reshaped_x.contiguous() - out = self._awq_gemm(reshaped_x, self._qweight, self._qzeros, self._scales, self._split_k_iters) + # vLLM AWQ GEMM entrypoints (C++ op and Triton fallback) use the same order: + # awq_gemm(input, qweight, scales, qzeros, split_k_iters) + out = self._awq_gemm(reshaped_x, self._qweight, self._scales, self._qzeros, self._split_k_iters) if self._bias is not None: out.add_(self._bias.to(dtype=out.dtype)) out = out.reshape(x.shape[:-1] + (self._out_features,)) @@ -1978,8 +1980,9 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. return F.linear(x, weight, bias) weight = getattr(self, "weight", None) - if weight is None: - raise RuntimeError("Strategy is configured but weight is missing (expected bf16 weight).") + # NOTE: For offline-quantized strategies (e.g. GPTQ/AWQ/Marlin), the original + # bf16 weight may be intentionally removed after loading to save memory. + # In that case, the quantization strategy must be able to run without it. kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) if kwargs: return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) diff --git a/diffulex/strategy/d2f/engine/model_runner.py b/diffulex/strategy/d2f/engine/model_runner.py index 839c848..9a020a9 100644 --- a/diffulex/strategy/d2f/engine/model_runner.py +++ b/diffulex/strategy/d2f/engine/model_runner.py @@ -294,30 +294,41 @@ def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill return self.model.compute_logits(self.model(input_ids, positions)) num_tokens = input_ids.size(0) context = fetch_d2f_attn_metadata() - bucket_tokens = next(x for x in self.graph_bs if x >= num_tokens) + candidates = [x for x in self.graph_bs if x >= num_tokens] + if not candidates: + # Safety: fall back if capture didn't include a large-enough bucket. + return self.model.compute_logits(self.model(input_ids, positions)) + bucket_tokens = candidates[0] graph = self.graphs[bucket_tokens] graph_vars = self.graph_vars - for key, value in graph_vars.items(): - if key != "outputs": - value.zero_() + # Safety: fall back if runtime batch exceeds captured metadata capacity. + num_seqs = int(context.context_lens.numel()) + max_num_seqs_for_graph = int(graph_vars["context_lens"].numel()) + if num_seqs > max_num_seqs_for_graph: + return self.model.compute_logits(self.model(input_ids, positions)) + + # Reset buffers to safe defaults (avoid "0" being interpreted as a valid index). + graph_vars["input_ids"].zero_() + graph_vars["positions"].zero_() + graph_vars["slot_mapping"].fill_(-1) + graph_vars["context_lens"].zero_() + graph_vars["block_tables"].fill_(-1) graph_vars["input_ids"][:num_tokens] = input_ids graph_vars["positions"][:num_tokens] = positions graph_vars["slot_mapping"][:num_tokens] = context.slot_mapping - num_seqs = int(context.context_lens.numel()) graph_vars["context_lens"][:num_seqs] = context.context_lens # cu_seqlens are required by unified paged-attn decode kernels. if getattr(context, "cu_seqlens_q", None) is not None: + # Pad to captured length so "extra" sequences become 0-length. 
+ graph_vars["cu_seqlens_q"].fill_(int(num_tokens)) graph_vars["cu_seqlens_q"][: num_seqs + 1] = context.cu_seqlens_q - bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) - if bucket_num_seqs > num_seqs: - graph_vars["cu_seqlens_q"][num_seqs + 1 : bucket_num_seqs + 1].fill_(int(num_tokens)) if getattr(context, "cu_seqlens_k", None) is not None: + last_k = int(context.cu_seqlens_k[num_seqs].item()) + graph_vars["cu_seqlens_k"].fill_(last_k) graph_vars["cu_seqlens_k"][: num_seqs + 1] = context.cu_seqlens_k - bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) - if bucket_num_seqs > num_seqs: - last_k = context.cu_seqlens_k[num_seqs] - graph_vars["cu_seqlens_k"][num_seqs + 1 : bucket_num_seqs + 1] = last_k - graph_vars["block_tables"][:num_seqs, : context.block_tables.size(1)] = context.block_tables + + bt_cols = min(int(graph_vars["block_tables"].size(1)), int(context.block_tables.size(1))) + graph_vars["block_tables"][:num_seqs, :bt_cols] = context.block_tables[:, :bt_cols] graph.replay() return self.model.compute_logits(graph_vars["outputs"][:num_tokens]) @@ -355,8 +366,14 @@ def capture_cudagraph(self): diffusion_block_size = int(self.diffusion_block_size) max_num_seqs = int(self.config.max_num_seqs) # Graph path is only used when num_tokens <= 512. - max_num_seqs_for_graph = max(1, min(max_num_seqs, 512 // max(1, diffusion_block_size))) - max_num_tokens = max_num_seqs_for_graph * diffusion_block_size + # + # IMPORTANT: + # In D2F decode, `num_tokens` (sum of per-seq seqlen_q) is NOT guaranteed to equal + # `num_seqs * diffusion_block_size`. A single seq can contribute multiple diffusion blocks, + # so we must bucket by `num_tokens` directly and keep metadata tensors sized by + # `max_num_seqs_for_graph` (padding unused seqs to 0-length via cu_seqlens). + max_num_seqs_for_graph = max(1, min(max_num_seqs, 512)) + max_num_tokens = 512 max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size # Allocate graph buffers on the same device/dtype as the model. @@ -371,33 +388,38 @@ def capture_cudagraph(self): # Allocate max-size graph buffers. input_ids = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) positions = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) - slot_mapping = torch.zeros(max_num_tokens, dtype=torch.int32, device=graph_device) + slot_mapping = torch.full((max_num_tokens,), -1, dtype=torch.int32, device=graph_device) context_lens = torch.zeros(max_num_seqs_for_graph, dtype=torch.int32, device=graph_device) - block_tables = torch.zeros(max_num_seqs_for_graph, max_num_blocks, dtype=torch.int32, device=graph_device) + block_tables = torch.full((max_num_seqs_for_graph, max_num_blocks), -1, dtype=torch.int32, device=graph_device) outputs = torch.zeros(max_num_tokens, hf_config.hidden_size, dtype=graph_dtype, device=graph_device) cu_seqlens_q = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) cu_seqlens_k = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) - # Capture bucketed graphs by num_tokens (bucketed by num_seqs * diffusion_block_size). + # Capture bucketed graphs by total num_tokens. 
self.graph_bs = [] - seq_bs_list = [1, 2, 4, 8] + list(range(16, max_num_seqs_for_graph + 1, 16)) - for num_seqs in sorted(set([b for b in seq_bs_list if b <= max_num_seqs_for_graph] + [max_num_seqs_for_graph])): - self.graph_bs.append(int(num_seqs) * diffusion_block_size) + # Keep buckets aligned to diffusion_block_size for stable kernel shapes. + for t in range(diffusion_block_size, max_num_tokens + 1, diffusion_block_size): + self.graph_bs.append(int(t)) self.graphs = {} self.graph_pool = None for num_tokens in tqdm(reversed(self.graph_bs), desc="Capturing CUDA graphs"): - num_seqs = int(num_tokens // diffusion_block_size) + num_seqs = int(max_num_seqs_for_graph) graph = torch.cuda.CUDAGraph() # Fill placeholder metadata with valid monotonic cu_seqlens to satisfy kernel assertions. - cu_seqlens_q[: num_seqs + 1] = ( - torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * diffusion_block_size - ) + # IMPORTANT: cu_seqlens_q must be non-decreasing and end at `num_tokens` + # (it is used to index into Q/slot_mapping which are length `num_tokens`). + # Use a simple placeholder: put all Q tokens into the first seq and make + # the remaining seqs 0-length. + cu_seqlens_q[: num_seqs + 1].fill_(int(num_tokens)) + cu_seqlens_q[0] = 0 # Use a conservative max-seqlen for K to keep shapes stable; values are overwritten before replay. cu_seqlens_k[: num_seqs + 1] = ( torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * int(config.max_model_len) ) context_lens[:num_seqs].fill_(int(config.max_model_len)) + # Use a benign placeholder block table for the first seq. + block_tables[:1].zero_() # For static decode, use placeholder metadata tensors; per-step values are copied # into `graph_vars` before replay. set_d2f_attn_metadata( @@ -406,7 +428,7 @@ def capture_cudagraph(self): context_lens=context_lens[:num_seqs], cu_seqlens_q=cu_seqlens_q[: num_seqs + 1], cu_seqlens_k=cu_seqlens_k[: num_seqs + 1], - max_seqlen_q=diffusion_block_size, + max_seqlen_q=int(num_tokens), max_seqlen_k=int(config.max_model_len), block_tables=block_tables[:num_seqs], kv_cache_layout=self.config.kv_cache_layout, diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index 73ffb92..b78f788 100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -144,6 +144,22 @@ def _set_offline_gptq_marlin_weight( module._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) module._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) module._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + # Keep Python-side mirrors in sync; runtime fast paths rely on these and + # must not `.item()` from CUDA tensors (graph capture / perf). 
+ if hasattr(module, "_offline_quant_format_py"): + module._offline_quant_format_py = 1 + if hasattr(module, "_offline_quant_bits_py"): + module._offline_quant_bits_py = int(bits) + if hasattr(module, "_offline_quant_group_size_py"): + module._offline_quant_group_size_py = int(group_size) + if hasattr(module, "_offline_quant_out_features_py"): + module._offline_quant_out_features_py = int(out_features) + if hasattr(module, "_offline_quant_in_features_py"): + module._offline_quant_in_features_py = int(in_features) + if hasattr(module, "_gptq_is_shuffled_py"): + module._gptq_is_shuffled_py = False + if hasattr(module, "_gptq_marlin_is_prepared_py"): + module._gptq_marlin_is_prepared_py = False # Reset marlin-prep caches (workspace/zp/g_idx meta will be created on first forward). module._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 7090d59..ea6675d 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -23,6 +23,12 @@ except Exception: # pragma: no cover ops = None # type: ignore +try: + # Triton fallback path for AWQ GEMM (works even when C++/CUDA ops are not built). + from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton # type: ignore +except Exception: # pragma: no cover + awq_gemm_triton = None # type: ignore + @register_linear_strategy(weight_dtype="awq", act_dtype="bf16") def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: @@ -32,17 +38,23 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): def __init__(self) -> None: super().__init__() - # Resolve the concrete kernel entry point once (avoid per-call dispatch). + # Resolve the concrete kernel entry points once (avoid per-call dispatch). + self._awq_gemm_cpp = None + self._awq_gemm_triton = awq_gemm_triton + awq_gemm = None try: if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"): awq_gemm = torch.ops._C.awq_gemm except Exception: awq_gemm = None - if awq_gemm is None and ops is not None and hasattr(ops, "awq_gemm"): - awq_gemm = ops.awq_gemm - self._awq_gemm = awq_gemm - self._ops_available: bool = bool(self._awq_gemm is not None) + # Prefer the real C++ op if present; otherwise keep `None` and fall back to Triton. + self._awq_gemm_cpp = awq_gemm + # Keep the python wrapper as a last resort (it may route to Triton or to torch.ops._C). + self._awq_gemm_py = ops.awq_gemm if (ops is not None and hasattr(ops, "awq_gemm")) else None + self._ops_available: bool = bool( + self._awq_gemm_cpp is not None or self._awq_gemm_triton is not None or self._awq_gemm_py is not None + ) @property def name(self) -> str: @@ -122,9 +134,21 @@ def linear_forward( reshaped_x = x_in.reshape(-1, x_in.shape[-1]) # Always use awq_gemm to avoid large temporary dequantized weight allocations. 
- # vLLM API: awq_gemm(input, qweight, qzeros, scales, split_k_iters) + # vLLM API: + # - C++ op: awq_gemm(input, qweight, scales, qzeros, split_k_iters) + # - Triton : awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters) split_k_iters = 1 - out = self._awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) # type: ignore[misc] + if self._awq_gemm_triton is not None: + out = self._awq_gemm_triton(reshaped_x, qweight, scales, qzeros, split_k_iters) # type: ignore[misc] + elif self._awq_gemm_cpp is not None: + out = self._awq_gemm_cpp(reshaped_x, qweight, scales, qzeros, split_k_iters) # type: ignore[misc] + elif self._awq_gemm_py is not None: + out = self._awq_gemm_py(reshaped_x, qweight, scales, qzeros, split_k_iters) # type: ignore[misc] + else: + raise RuntimeError( + "vLLM is required for AWQ W4A16 but no available kernel entry point was found " + "(missing both Triton and C++ awq_gemm)." + ) if bias is not None: out.add_(bias.to(dtype=out.dtype)) diff --git a/diffulex_bench/arg_parser.py b/diffulex_bench/arg_parser.py index c0978ed..d4f786c 100644 --- a/diffulex_bench/arg_parser.py +++ b/diffulex_bench/arg_parser.py @@ -210,6 +210,13 @@ def create_argument_parser() -> argparse.ArgumentParser: action="store_true", help="Enforce eager mode (disable CUDA graphs)", ) + parser.add_argument( + "--no-enforce-eager", + dest="enforce_eager", + action="store_false", + help="Disable eager mode (enable CUDA graphs when supported)", + ) + parser.set_defaults(enforce_eager=None) parser.add_argument( "--kv-cache-layout", type=str, diff --git a/diffulex_bench/configs/awq_bf16kv_static.yml b/diffulex_bench/configs/awq_bf16kv_static.yml new file mode 100644 index 0000000..4cdb2fa --- /dev/null +++ b/diffulex_bench/configs/awq_bf16kv_static.yml @@ -0,0 +1,47 @@ +# AWQ (W4A16) + BF16 KV Cache (static mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-awq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 4096 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "static" + linear_attn_weight_dtype: "awq" + linear_mlp_weight_dtype: "awq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_static/awq_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/awq_bf16kv_varlen.yml b/diffulex_bench/configs/awq_bf16kv_varlen.yml index 62c2cb8..6ae2e46 100644 --- a/diffulex_bench/configs/awq_bf16kv_varlen.yml +++ b/diffulex_bench/configs/awq_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml index 8c76f4e..c27e4ec 100644 --- a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml +++ b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 
gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_bf16kv_distinct.yml b/diffulex_bench/configs/bf16_bf16kv_distinct.yml index 1800ef2..5cf750c 100644 --- a/diffulex_bench/configs/bf16_bf16kv_distinct.yml +++ b/diffulex_bench/configs/bf16_bf16kv_distinct.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_bf16kv_static.yml b/diffulex_bench/configs/bf16_bf16kv_static.yml index c83e028..d36e39d 100644 --- a/diffulex_bench/configs/bf16_bf16kv_static.yml +++ b/diffulex_bench/configs/bf16_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_bf16kv_varlen.yml b/diffulex_bench/configs/bf16_bf16kv_varlen.yml index 4a6b794..8258035 100644 --- a/diffulex_bench/configs/bf16_bf16kv_varlen.yml +++ b/diffulex_bench/configs/bf16_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_fp8kv_distinct.yml b/diffulex_bench/configs/bf16_fp8kv_distinct.yml index 4cbbb8e..bc0fdd5 100644 --- a/diffulex_bench/configs/bf16_fp8kv_distinct.yml +++ b/diffulex_bench/configs/bf16_fp8kv_distinct.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_fp8kv_static.yml b/diffulex_bench/configs/bf16_fp8kv_static.yml index ff429df..ee0af7f 100644 --- a/diffulex_bench/configs/bf16_fp8kv_static.yml +++ b/diffulex_bench/configs/bf16_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_fp8kv_varlen.yml b/diffulex_bench/configs/bf16_fp8kv_varlen.yml index bcfbc9f..973ec91 100644 --- a/diffulex_bench/configs/bf16_fp8kv_varlen.yml +++ b/diffulex_bench/configs/bf16_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/dream_d2f_gsm8k.yml b/diffulex_bench/configs/dream_d2f_gsm8k.yml index e55b9be..74d1b07 100644 --- a/diffulex_bench/configs/dream_d2f_gsm8k.yml +++ b/diffulex_bench/configs/dream_d2f_gsm8k.yml @@ -10,7 +10,7 @@ engine: tensor_parallel_size: 1 data_parallel_size: 1 gpu_memory_utilization: 0.9 - max_model_len: 2048 + max_model_len: 4096 use_lora: false enforce_eager: false diff --git a/diffulex_bench/configs/example.yml b/diffulex_bench/configs/example.yml index 41f0839..bbdcbc5 100644 --- a/diffulex_bench/configs/example.yml +++ b/diffulex_bench/configs/example.yml @@ -20,7 +20,7 @@ engine: # Memory and capacity configuration gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/fp8_bf16kv_varlen.yml b/diffulex_bench/configs/fp8_bf16kv_varlen.yml index 2ac105b..f6fb081 100644 --- a/diffulex_bench/configs/fp8_bf16kv_varlen.yml +++ 
b/diffulex_bench/configs/fp8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_bf16kv_varlen.yml index b7fd14d..3ff8759 100644 --- a/diffulex_bench/configs/gptq_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml index 1505192..4eb16cd 100644 --- a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml +++ b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml index 858b31a..06d9733 100644 --- a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml new file mode 100644 index 0000000..8ba23c3 --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W4, A16) + BF16 KV Cache (static mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 4096 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: false # Enable CUDA Graph for static mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "static" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_static/gptq_marlin_w4_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml index f8265d3..3702baf 100644 --- a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml new file mode 100644 index 0000000..06bb08b --- /dev/null +++ 
b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W8, A16) + BF16 KV Cache (static mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 4096 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "static" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_static/gptq_marlin_w8_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml index e20c9be..da2cfdc 100644 --- a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml index 03fe3e7..0e60faa 100644 --- a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml index 1f68616..b1bf8ad 100644 --- a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_bf16kv_static.yml b/diffulex_bench/configs/w4a16_bf16kv_static.yml index 79d9825..c8a2d95 100644 --- a/diffulex_bench/configs/w4a16_bf16kv_static.yml +++ b/diffulex_bench/configs/w4a16_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml b/diffulex_bench/configs/w4a16_bf16kv_varlen.yml index 52230fc..609dd3d 100644 --- a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w4a16_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_fp8kv_static.yml b/diffulex_bench/configs/w4a16_fp8kv_static.yml index 22225a1..8f707a6 100644 --- a/diffulex_bench/configs/w4a16_fp8kv_static.yml +++ b/diffulex_bench/configs/w4a16_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: 
data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml b/diffulex_bench/configs/w4a16_fp8kv_varlen.yml index c1b943f..bf7381b 100644 --- a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w4a16_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_bf16kv_static.yml b/diffulex_bench/configs/w4a8_bf16kv_static.yml index 841050e..4741aa5 100644 --- a/diffulex_bench/configs/w4a8_bf16kv_static.yml +++ b/diffulex_bench/configs/w4a8_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml b/diffulex_bench/configs/w4a8_bf16kv_varlen.yml index 4df0089..8ce0145 100644 --- a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w4a8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_fp8kv_static.yml b/diffulex_bench/configs/w4a8_fp8kv_static.yml index 1676393..08da846 100644 --- a/diffulex_bench/configs/w4a8_fp8kv_static.yml +++ b/diffulex_bench/configs/w4a8_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml b/diffulex_bench/configs/w4a8_fp8kv_varlen.yml index 4725d6a..8dd80ec 100644 --- a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w4a8_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_bf16kv_static.yml b/diffulex_bench/configs/w8a16_bf16kv_static.yml index 9ba90fb..7f54b1c 100644 --- a/diffulex_bench/configs/w8a16_bf16kv_static.yml +++ b/diffulex_bench/configs/w8a16_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml b/diffulex_bench/configs/w8a16_bf16kv_varlen.yml index 4b50d5f..9c0efaa 100644 --- a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w8a16_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_fp8kv_static.yml b/diffulex_bench/configs/w8a16_fp8kv_static.yml index 9771043..27243b9 100644 --- a/diffulex_bench/configs/w8a16_fp8kv_static.yml +++ b/diffulex_bench/configs/w8a16_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml b/diffulex_bench/configs/w8a16_fp8kv_varlen.yml index e282a27..ddd04ab 100644 --- 
a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w8a16_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a8_bf16kv_static.yml b/diffulex_bench/configs/w8a8_bf16kv_static.yml index bd9753d..e34456c 100644 --- a/diffulex_bench/configs/w8a8_bf16kv_static.yml +++ b/diffulex_bench/configs/w8a8_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml b/diffulex_bench/configs/w8a8_bf16kv_varlen.yml index e1d9ecb..57e919b 100644 --- a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w8a8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.5 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 2048 max_num_seqs: 64 diff --git a/diffulex_bench/configs/w8a8_fp8kv_static.yml b/diffulex_bench/configs/w8a8_fp8kv_static.yml index 30f71ca..da5b9c6 100644 --- a/diffulex_bench/configs/w8a8_fp8kv_static.yml +++ b/diffulex_bench/configs/w8a8_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml b/diffulex_bench/configs/w8a8_fp8kv_varlen.yml index 0467144..1ae985b 100644 --- a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w8a8_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/main.py b/diffulex_bench/main.py index 15bac16..f6a7ae8 100644 --- a/diffulex_bench/main.py +++ b/diffulex_bench/main.py @@ -206,12 +206,34 @@ def load_config_from_args(args) -> BenchmarkConfig: # Override with command line arguments if provided if args.model_path: config.engine.model_path = args.model_path + if getattr(args, "tokenizer_path", None): + config.engine.tokenizer_path = args.tokenizer_path if args.dataset: config.eval.dataset_name = args.dataset if args.dataset_limit is not None: config.eval.dataset_limit = args.dataset_limit + if getattr(args, "max_tokens", None) is not None: + config.eval.max_tokens = args.max_tokens + if getattr(args, "temperature", None) is not None: + config.eval.temperature = args.temperature if args.output_dir: config.eval.output_dir = args.output_dir + + # Engine overrides (make bench configs reusable for eager vs CUDA Graph comparisons) + if getattr(args, "enforce_eager", None) is not None: + config.engine.enforce_eager = bool(args.enforce_eager) + if getattr(args, "kv_cache_layout", None) is not None: + config.engine.kv_cache_layout = args.kv_cache_layout + if getattr(args, "decode_mode", None) is not None: + config.engine.decode_mode = args.decode_mode + if getattr(args, "kv_cache_dtype", None) is not None: + config.engine.kv_cache_dtype = args.kv_cache_dtype + if getattr(args, "max_model_len", None) is not None: + config.engine.max_model_len = args.max_model_len + if getattr(args, "max_num_seqs", None) is not None: + config.engine.max_num_seqs = args.max_num_seqs + if getattr(args, "max_num_batched_tokens", None) is not None: + config.engine.max_num_batched_tokens = 
args.max_num_batched_tokens else: if not args.model_path: logger.error("Either --config or --model-path must be provided") diff --git a/diffulex_kernel/python/kv_cache_kernels.py b/diffulex_kernel/python/kv_cache_kernels.py index 514c8fe..8010042 100755 --- a/diffulex_kernel/python/kv_cache_kernels.py +++ b/diffulex_kernel/python/kv_cache_kernels.py @@ -880,6 +880,20 @@ def store_kvcache_unified_layout(key: torch.Tensor, value: torch.Tensor, Store KV cache (unified layout). Dynamically selects the appropriate kernel based on quantization strategy from context. """ + # `slot_mapping` is expected to have one entry per token in `key/value` (dimension 0). + # In some flows (e.g. prefix-cache / partial-prefill), metadata may carry a longer + # mapping for the full sequence while `key/value` only contain the suffix tokens + # actually computed this step. In that case, align by taking the tail. + N = int(key.shape[0]) + if int(slot_mapping.numel()) != N: + if int(slot_mapping.numel()) > N: + slot_mapping = slot_mapping[-N:] + else: + raise AssertionError( + f"slot_mapping is shorter than key/value tokens: " + f"N={N}, slot_mapping.numel()={int(slot_mapping.numel())}" + ) + from diffulex.utils.quantization.context import get_kv_cache_strategy strategy = get_kv_cache_strategy() if strategy is None: diff --git a/diffulex_legacy/config.py b/diffulex_legacy/config.py index a5b1dd6..bd4ec71 100755 --- a/diffulex_legacy/config.py +++ b/diffulex_legacy/config.py @@ -29,7 +29,7 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - master_port: int = 2333 + master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "d2f_vllm" # Start device index for this TP group (set by DP launcher). 
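The replay protocol that the patch above settles on is: pick the smallest captured token bucket that fits, fall back to eager execution when the batch exceeds captured capacity, and pad cu_seqlens so unused sequence slots become 0-length. A condensed sketch of that logic follows; the names (graphs, graph_bs, graph_vars, eager_forward) are placeholders rather than the real runner attributes, and input_ids/positions handling is omitted for brevity.

def replay_or_eager(num_tokens, context, graphs, graph_bs, graph_vars, eager_forward):
    # Smallest captured bucket that can hold this many tokens.
    candidates = [b for b in graph_bs if b >= num_tokens]
    num_seqs = int(context.context_lens.numel())
    if not candidates or num_seqs > int(graph_vars["context_lens"].numel()):
        return eager_forward()  # safety fallback: batch exceeds captured capacity
    bucket = candidates[0]
    # Reset to safe defaults so stale entries are not misread as valid indices.
    graph_vars["slot_mapping"].fill_(-1)
    graph_vars["block_tables"].fill_(-1)
    graph_vars["context_lens"].zero_()
    # Copy this step's metadata; pad cu_seqlens so extra sequences are 0-length.
    graph_vars["slot_mapping"][:num_tokens] = context.slot_mapping
    graph_vars["context_lens"][:num_seqs] = context.context_lens
    graph_vars["cu_seqlens_q"].fill_(int(num_tokens))
    graph_vars["cu_seqlens_q"][: num_seqs + 1] = context.cu_seqlens_q
    graphs[bucket].replay()
    return graph_vars["outputs"][:num_tokens]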
From 8ea87175ea7f92a337672a790fd99e9751d52b2a Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Wed, 28 Jan 2026 02:32:52 +0000 Subject: [PATCH 10/10] =?UTF-8?q?chore:=20=E6=B8=85=E7=90=86=E5=AE=9E?= =?UTF-8?q?=E9=AA=8C=E9=85=8D=E7=BD=AE=E4=B8=8E=E7=8E=AF=E5=A2=83=E5=8F=98?= =?UTF-8?q?=E9=87=8F=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove bench configs and quantization-architecture docs added after v0.0.1 - Move W8A16/DP tuning knobs from env vars into Config/strategy.configure - Drop hard-coded local paths and default GPUs from examples/scripts, and fix syntax issues --- .gitignore | 1 + diffulex/config.py | 8 +- diffulex/engine/dp_worker.py | 4 +- diffulex/utils/quantization/factory.py | 3 + .../strategies/linear_marlin_int8_w8a16.py | 33 +- diffulex/utils/quantization/strategy.py | 10 + diffulex_bench/configs/awq_bf16kv_static.yml | 47 -- diffulex_bench/configs/awq_bf16kv_varlen.yml | 47 -- .../configs/awq_marlin_bf16kv_varlen.yml | 48 -- .../configs/bf16_bf16kv_distinct.yml | 47 -- diffulex_bench/configs/bf16_bf16kv_static.yml | 47 -- diffulex_bench/configs/bf16_bf16kv_varlen.yml | 47 -- .../configs/bf16_fp8kv_distinct.yml | 47 -- diffulex_bench/configs/bf16_fp8kv_static.yml | 47 -- diffulex_bench/configs/bf16_fp8kv_varlen.yml | 47 -- diffulex_bench/configs/fp8_bf16kv_varlen.yml | 48 -- diffulex_bench/configs/gptq_bf16kv_varlen.yml | 47 -- .../configs/gptq_bf16kv_varlen_tp2.yml | 47 -- .../configs/gptq_marlin_bf16kv_varlen.yml | 48 -- .../configs/gptq_marlin_w4_bf16kv_static.yml | 47 -- .../configs/gptq_marlin_w4_bf16kv_varlen.yml | 47 -- .../configs/gptq_marlin_w8_bf16kv_static.yml | 47 -- .../configs/gptq_marlin_w8_bf16kv_varlen.yml | 47 -- .../configs/gptq_w2_bf16kv_varlen.yml | 47 -- .../configs/gptq_w8_bf16kv_varlen.yml | 47 -- .../configs/w4a16_bf16kv_static.yml | 47 -- .../configs/w4a16_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w4a16_fp8kv_static.yml | 47 -- diffulex_bench/configs/w4a16_fp8kv_varlen.yml | 47 -- diffulex_bench/configs/w4a8_bf16kv_static.yml | 47 -- diffulex_bench/configs/w4a8_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w4a8_fp8kv_static.yml | 47 -- diffulex_bench/configs/w4a8_fp8kv_varlen.yml | 47 -- .../configs/w8a16_bf16kv_static.yml | 47 -- .../configs/w8a16_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w8a16_fp8kv_static.yml | 47 -- diffulex_bench/configs/w8a16_fp8kv_varlen.yml | 47 -- diffulex_bench/configs/w8a8_bf16kv_static.yml | 47 -- diffulex_bench/configs/w8a8_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w8a8_fp8kv_static.yml | 47 -- diffulex_bench/configs/w8a8_fp8kv_varlen.yml | 47 -- diffulex_legacy/config.py | 2 +- diffulex_legacy/engine/dp_engine.py | 4 +- diffulex_profiler/example.py | 11 +- examples/test_dream_diffulex_gsm8k.py | 3 +- examples/test_dream_dvllm_human_eval.py | 3 +- examples/test_fastdllmv2_diffulex_gsm8k.py | 3 +- examples/test_gptq_awq_loading.py | 17 - examples/test_llada_dvllm_human_eval.py | 3 +- examples/test_quantization_generation.py | 36 +- examples/test_sdar_diffulex_gsm8k.py | 3 +- examples/test_sdar_dvllm.py | 4 +- profile/torch_d2f_profiler.py | 30 +- quantization_architecture.md | 149 ----- quantization_architecture_diagram.md | 551 ------------------ 55 files changed, 93 insertions(+), 2433 deletions(-) delete mode 100644 diffulex_bench/configs/awq_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/awq_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/bf16_bf16kv_distinct.yml delete mode 100644 diffulex_bench/configs/bf16_bf16kv_static.yml delete mode 100644 
diffulex_bench/configs/bf16_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/bf16_fp8kv_distinct.yml delete mode 100644 diffulex_bench/configs/bf16_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/bf16_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/fp8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a16_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w4a16_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a16_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w4a16_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a8_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w4a8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a8_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w4a8_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a16_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w8a16_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a16_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w8a16_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a8_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w8a8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a8_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w8a8_fp8kv_varlen.yml delete mode 100644 quantization_architecture.md delete mode 100644 quantization_architecture_diagram.md diff --git a/.gitignore b/.gitignore index 0a8ab01..76f8e70 100755 --- a/.gitignore +++ b/.gitignore @@ -54,5 +54,6 @@ GITHUB_ISSUE.md Tilelang-failed_test_cases/ # Benchmark results benchmark_results/ +benchmark_results_tmp/ # Cursor IDE files .cursor/ diff --git a/diffulex/config.py b/diffulex/config.py index 99f6c50..f571f34 100755 --- a/diffulex/config.py +++ b/diffulex/config.py @@ -32,8 +32,7 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - # Allow overriding to avoid port collisions in multi-run/CI environments. - master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) + master_port: int = 2333 # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "diffulex_shm" # Start device index for this TP group (set by DP launcher). @@ -60,6 +59,11 @@ class Config: linear_attn_act_dtype: str = "bf16" linear_mlp_act_dtype: str = "bf16" + # Kernel tuning knobs (avoid environment-variable based tuning in library code). + # Currently used by some W8A16 linear strategies. 
+ linear_w8a16_quant_block_n: int = 256 + linear_w8a16_allspark_cublas_m_threshold: int = 256 + def __post_init__(self): assert os.path.isdir(self.model) assert self.kvcache_block_size % 16 == 0 diff --git a/diffulex/engine/dp_worker.py b/diffulex/engine/dp_worker.py index a76239a..968fa5f 100755 --- a/diffulex/engine/dp_worker.py +++ b/diffulex/engine/dp_worker.py @@ -125,12 +125,10 @@ def __init__(self, model, **kwargs): need_gpus = self.dp_size * cfg.tensor_parallel_size assert len(vis) >= need_gpus, f"Require {need_gpus} GPUs (dp={self.dp_size}, tp={cfg.tensor_parallel_size}), visible {len(vis)}" - # Optional overrides: kwargs['device_ids'] or env D2F_DEVICE_MAP + # Optional overrides: kwargs['device_ids'] override = None if 'device_ids' in kwargs and kwargs['device_ids']: override = list(kwargs['device_ids']) - elif os.environ.get('D2F_DEVICE_MAP'): - override = [int(x) for x in os.environ['D2F_DEVICE_MAP'].split(',') if x.strip() != ''] if override is not None: assert len(override) >= need_gpus, f"device_ids length {len(override)} < required {need_gpus}" # All override devices must be in visible list diff --git a/diffulex/utils/quantization/factory.py b/diffulex/utils/quantization/factory.py index 3b32f96..ee7e3b6 100644 --- a/diffulex/utils/quantization/factory.py +++ b/diffulex/utils/quantization/factory.py @@ -60,6 +60,7 @@ def create_from_config(config) -> QuantizationContext: # KV Cache strategy strategy = QuantizationStrategyFactory.create_kv_cache_strategy(quant_cfg.kv_cache.dtype) + strategy.configure(diffulex_config=config) ctx.set_strategy('kv_cache', strategy) # Linear strategies (weights + activations) by kind @@ -67,12 +68,14 @@ def create_from_config(config) -> QuantizationContext: weight_dtype=quant_cfg.weights.linear_attn_dtype, act_dtype=quant_cfg.activations.linear_attn_dtype, ) + linear_attn.configure(diffulex_config=config) ctx.set_linear_strategy("attn", linear_attn) linear_mlp = _create_linear_strategy( weight_dtype=quant_cfg.weights.linear_mlp_dtype, act_dtype=quant_cfg.activations.linear_mlp_dtype, ) + linear_mlp.configure(diffulex_config=config) ctx.set_linear_strategy("mlp", linear_mlp) # Future: Weight strategy diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index c2ff1ce..ceb3630 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -13,7 +13,6 @@ from __future__ import annotations -import os from typing import Any, Optional import torch @@ -62,10 +61,26 @@ def __init__(self) -> None: self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} # Cache device info and thresholds to reduce per-call CPU overhead. self._sm_info_cache: dict[int, tuple[int, int]] = {} - self._cublas_m_thr: int = self._cublas_m_threshold() + self._quant_block_n: int = 256 + self._cublas_m_thr: int = 256 # One-time availability check (avoid calling `_allspark_is_available()` on every linear). self._allspark_available: bool = _allspark_is_available() + def configure(self, *, diffulex_config: Any | None = None) -> None: + # Prefer explicit config fields over environment-variable based tuning. 
+ if diffulex_config is None: + return + try: + bn = int(getattr(diffulex_config, "linear_w8a16_quant_block_n", self._quant_block_n)) + self._quant_block_n = max(1, bn) + except Exception: + pass + try: + thr = int(getattr(diffulex_config, "linear_w8a16_allspark_cublas_m_threshold", self._cublas_m_thr)) + self._cublas_m_thr = max(1, thr) + except Exception: + pass + @property def name(self) -> str: # NOTE: Keep strategy naming consistent with the public W8A16 INT8 path. @@ -158,11 +173,7 @@ def quantize_weight_for_kernel( # Avoid allocating a full [N,K] fp32 copy (and an extra transpose buffer). # Quantize in small row blocks and (when using AllSpark) write directly into # the repack input layout B_kn=[K,N], so we never materialize q_u8 + transpose. - try: - block_n = int(os.getenv("DIFFULEX_W8A16_QUANT_BLOCK_N", "256")) - except Exception: - block_n = 256 - block_n = max(1, block_n) + block_n = max(1, int(self._quant_block_n)) if self._allspark_available: # AllSpark repack expects B in (K,N) contiguous layout. @@ -234,14 +245,6 @@ def _get_sm_info(self, device: torch.device) -> tuple[int, int]: self._sm_info_cache[idx] = (0, 0) return 0, 0 - def _cublas_m_threshold(self) -> int: - # For decode, M is typically small, so AllSpark custom kernel is preferred. - # For large-M prefill, AllSpark falls back to a dequant+cuBLAS path if M > threshold. - try: - return int(os.getenv("DIFFULEX_ALLSPARK_CUBLAS_M_THRESHOLD", "256")) - except Exception: - return 256 - def linear_forward( self, x: torch.Tensor, diff --git a/diffulex/utils/quantization/strategy.py b/diffulex/utils/quantization/strategy.py index a36e553..7c3b01a 100644 --- a/diffulex/utils/quantization/strategy.py +++ b/diffulex/utils/quantization/strategy.py @@ -84,6 +84,16 @@ def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[in """ pass + def configure(self, *, diffulex_config: Any | None = None) -> None: + """Optional hook to configure a strategy from Diffulex `Config`. + + We intentionally keep this a no-op by default to avoid forcing configuration + plumbing through every call site. Strategy-specific tuning knobs should be + surfaced via explicit fields on `diffulex.config.Config`, not environment variables. + """ + _ = diffulex_config + return + # ---- Optional capability flags / helpers (non-abstract) ---- # These helpers are used to avoid hard-coding isinstance(...) checks in the runtime. 
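The hunks above wire the new `configure(diffulex_config=...)` hook from `QuantizationStrategyFactory.create_from_config()` into each strategy, so kernel tuning knobs such as `linear_w8a16_quant_block_n` and `linear_w8a16_allspark_cublas_m_threshold` come from `diffulex.config.Config` rather than environment variables. Below is a minimal, self-contained sketch of that flow; the `_ConfigStub` and `_W8A16StrategySketch` names are illustrative stand-ins, not the repository classes, and the defaults simply mirror the previous env-var fallbacks shown in the diff.

```python
# Sketch only: how a linear strategy can pick up tuning knobs from the Diffulex
# Config via the new configure() hook. Field names mirror the diff; the Config
# stand-in is an assumption for illustration.
from dataclasses import dataclass
from typing import Any


@dataclass
class _ConfigStub:
    linear_w8a16_quant_block_n: int = 256
    linear_w8a16_allspark_cublas_m_threshold: int = 256


class _W8A16StrategySketch:
    def __init__(self) -> None:
        # Defaults match the old env-var fallbacks (256 / 256).
        self._quant_block_n = 256
        self._cublas_m_thr = 256

    def configure(self, *, diffulex_config: Any | None = None) -> None:
        # No-op when no config is supplied, mirroring the base-class default.
        if diffulex_config is None:
            return
        self._quant_block_n = max(1, int(getattr(
            diffulex_config, "linear_w8a16_quant_block_n", self._quant_block_n)))
        self._cublas_m_thr = max(1, int(getattr(
            diffulex_config, "linear_w8a16_allspark_cublas_m_threshold", self._cublas_m_thr)))


strategy = _W8A16StrategySketch()
strategy.configure(diffulex_config=_ConfigStub(linear_w8a16_quant_block_n=128))
assert strategy._quant_block_n == 128 and strategy._cublas_m_thr == 256
```

Under this pattern the factory calls `configure()` once at strategy-creation time, so per-call lookups in `quantize_weight_for_kernel()` reduce to reading cached instance fields.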
@property diff --git a/diffulex_bench/configs/awq_bf16kv_static.yml b/diffulex_bench/configs/awq_bf16kv_static.yml deleted file mode 100644 index 4cdb2fa..0000000 --- a/diffulex_bench/configs/awq_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# AWQ (W4A16) + BF16 KV Cache (static mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-awq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "awq" - linear_mlp_weight_dtype: "awq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/awq_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/awq_bf16kv_varlen.yml b/diffulex_bench/configs/awq_bf16kv_varlen.yml deleted file mode 100644 index 6ae2e46..0000000 --- a/diffulex_bench/configs/awq_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# AWQ (W4A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-awq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "awq" - linear_mlp_weight_dtype: "awq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/awq_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml deleted file mode 100644 index c27e4ec..0000000 --- a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml +++ /dev/null @@ -1,48 +0,0 @@ -# AWQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-awq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: AWQ Marlin + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: 
"awq_marlin" - linear_mlp_weight_dtype: "awq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/awq_marlin_bf16kv" - save_results: true - use_tqdm: true - diff --git a/diffulex_bench/configs/bf16_bf16kv_distinct.yml b/diffulex_bench/configs/bf16_bf16kv_distinct.yml deleted file mode 100644 index 5cf750c..0000000 --- a/diffulex_bench/configs/bf16_bf16kv_distinct.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + BF16 KV Cache (distinct layout) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "distinct" # Test distinct layout - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 # 10 samples for testing - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_distinct/bf16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_bf16kv_static.yml b/diffulex_bench/configs/bf16_bf16kv_static.yml deleted file mode 100644 index d36e39d..0000000 --- a/diffulex_bench/configs/bf16_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/bf16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_bf16kv_varlen.yml b/diffulex_bench/configs/bf16_bf16kv_varlen.yml deleted file mode 100644 index 8258035..0000000 --- a/diffulex_bench/configs/bf16_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - 
gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/bf16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_fp8kv_distinct.yml b/diffulex_bench/configs/bf16_fp8kv_distinct.yml deleted file mode 100644 index bc0fdd5..0000000 --- a/diffulex_bench/configs/bf16_fp8kv_distinct.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + FP8 KV Cache (distinct layout) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "distinct" # Test distinct layout - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 # 10 samples for testing - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_distinct/bf16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_fp8kv_static.yml b/diffulex_bench/configs/bf16_fp8kv_static.yml deleted file mode 100644 index ee0af7f..0000000 --- a/diffulex_bench/configs/bf16_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/bf16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_fp8kv_varlen.yml b/diffulex_bench/configs/bf16_fp8kv_varlen.yml deleted file mode 100644 index 
973ec91..0000000 --- a/diffulex_bench/configs/bf16_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/bf16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/fp8_bf16kv_varlen.yml b/diffulex_bench/configs/fp8_bf16kv_varlen.yml deleted file mode 100644 index f6fb081..0000000 --- a/diffulex_bench/configs/fp8_bf16kv_varlen.yml +++ /dev/null @@ -1,48 +0,0 @@ -# FP8 Linear (vLLM) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: FP8 weights (vLLM ops) + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "fp8" - linear_mlp_weight_dtype: "fp8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/fp8_bf16kv" - save_results: true - use_tqdm: true - diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_bf16kv_varlen.yml deleted file mode 100644 index 3ff8759..0000000 --- a/diffulex_bench/configs/gptq_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W4A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - 
dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_bf16kv" - save_results: true - use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml deleted file mode 100644 index 4eb16cd..0000000 --- a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W4A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 2 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_bf16kv" - save_results: true - use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml deleted file mode 100644 index 06d9733..0000000 --- a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml +++ /dev/null @@ -1,48 +0,0 @@ -# GPTQ Marlin (W4/W8, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_bf16kv" - save_results: true - use_tqdm: true - diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml deleted file mode 100644 index 8ba23c3..0000000 --- a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W4, A16) + BF16 KV Cache (static mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - 
max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: false # Enable CUDA Graph for static mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/gptq_marlin_w4_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml deleted file mode 100644 index 3702baf..0000000 --- a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_w4_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml deleted file mode 100644 index 06bb08b..0000000 --- a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W8, A16) + BF16 KV Cache (static mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/gptq_marlin_w8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml 
b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml deleted file mode 100644 index da2cfdc..0000000 --- a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W8, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_w8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml deleted file mode 100644 index 0e60faa..0000000 --- a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W2A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W2A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_w2_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml deleted file mode 100644 index b1bf8ad..0000000 --- a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W8A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w8" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W8A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - 
decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_w8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_bf16kv_static.yml b/diffulex_bench/configs/w4a16_bf16kv_static.yml deleted file mode 100644 index c8a2d95..0000000 --- a/diffulex_bench/configs/w4a16_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml b/diffulex_bench/configs/w4a16_bf16kv_varlen.yml deleted file mode 100644 index 609dd3d..0000000 --- a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_fp8kv_static.yml b/diffulex_bench/configs/w4a16_fp8kv_static.yml deleted file mode 100644 index 8f707a6..0000000 --- a/diffulex_bench/configs/w4a16_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - 
gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml b/diffulex_bench/configs/w4a16_fp8kv_varlen.yml deleted file mode 100644 index bf7381b..0000000 --- a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_bf16kv_static.yml b/diffulex_bench/configs/w4a8_bf16kv_static.yml deleted file mode 100644 index 4741aa5..0000000 --- a/diffulex_bench/configs/w4a8_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml b/diffulex_bench/configs/w4a8_bf16kv_varlen.yml deleted file mode 100644 
index 8ce0145..0000000 --- a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_fp8kv_static.yml b/diffulex_bench/configs/w4a8_fp8kv_static.yml deleted file mode 100644 index 08da846..0000000 --- a/diffulex_bench/configs/w4a8_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml b/diffulex_bench/configs/w4a8_fp8kv_varlen.yml deleted file mode 100644 index 8dd80ec..0000000 --- a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" 
- -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_bf16kv_static.yml b/diffulex_bench/configs/w8a16_bf16kv_static.yml deleted file mode 100644 index 7f54b1c..0000000 --- a/diffulex_bench/configs/w8a16_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml b/diffulex_bench/configs/w8a16_bf16kv_varlen.yml deleted file mode 100644 index 9c0efaa..0000000 --- a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w8a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_fp8kv_static.yml b/diffulex_bench/configs/w8a16_fp8kv_static.yml deleted file mode 100644 index 27243b9..0000000 --- a/diffulex_bench/configs/w8a16_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for 
DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml b/diffulex_bench/configs/w8a16_fp8kv_varlen.yml deleted file mode 100644 index ddd04ab..0000000 --- a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w8a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_bf16kv_static.yml b/diffulex_bench/configs/w8a8_bf16kv_static.yml deleted file mode 100644 index e34456c..0000000 --- a/diffulex_bench/configs/w8a8_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml b/diffulex_bench/configs/w8a8_bf16kv_varlen.yml deleted file mode 100644 index 57e919b..0000000 --- a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + BF16 KV Cache (varlen mode) -engine: - model_path: 
"/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.5 - max_model_len: 4096 - max_num_batched_tokens: 2048 - max_num_seqs: 64 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w8a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_fp8kv_static.yml b/diffulex_bench/configs/w8a8_fp8kv_static.yml deleted file mode 100644 index da5b9c6..0000000 --- a/diffulex_bench/configs/w8a8_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml b/diffulex_bench/configs/w8a8_fp8kv_varlen.yml deleted file mode 100644 index 1ae985b..0000000 --- a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: 
"benchmark_results_varlen/w8a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_legacy/config.py b/diffulex_legacy/config.py index bd4ec71..a5b1dd6 100755 --- a/diffulex_legacy/config.py +++ b/diffulex_legacy/config.py @@ -29,7 +29,7 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) + master_port: int = 2333 # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "d2f_vllm" # Start device index for this TP group (set by DP launcher). diff --git a/diffulex_legacy/engine/dp_engine.py b/diffulex_legacy/engine/dp_engine.py index 70f8e82..b9da2b7 100755 --- a/diffulex_legacy/engine/dp_engine.py +++ b/diffulex_legacy/engine/dp_engine.py @@ -115,12 +115,10 @@ def __init__(self, model, **kwargs): need_gpus = self.dp_size * cfg.tensor_parallel_size assert len(vis) >= need_gpus, f"Require {need_gpus} GPUs (dp={self.dp_size}, tp={cfg.tensor_parallel_size}), visible {len(vis)}" - # Optional overrides: kwargs['device_ids'] or env D2F_DEVICE_MAP + # Optional overrides: kwargs['device_ids'] override = None if 'device_ids' in kwargs and kwargs['device_ids']: override = list(kwargs['device_ids']) - elif os.environ.get('D2F_DEVICE_MAP'): - override = [int(x) for x in os.environ['D2F_DEVICE_MAP'].split(',') if x.strip() != ''] if override is not None: assert len(override) >= need_gpus, f"device_ids length {len(override)} < required {need_gpus}" # All override devices must be in visible list diff --git a/diffulex_profiler/example.py b/diffulex_profiler/example.py index 8982990..64e07f5 100644 --- a/diffulex_profiler/example.py +++ b/diffulex_profiler/example.py @@ -67,7 +67,16 @@ def example_multiple_sections(): # Profile model loading with profiler.profile("model_loading"): - llm = Diffulex(model_path, model_name="dream", ...) 
+ model_path = "/path/to/your/model" + llm = Diffulex( + model_path, + model_name="dream", + tensor_parallel_size=1, + data_parallel_size=1, + gpu_memory_utilization=0.25, + max_model_len=2048, + decoding_strategy="d2f", + ) # Profile prefill prompts = ["Prompt 1", "Prompt 2"] diff --git a/examples/test_dream_diffulex_gsm8k.py b/examples/test_dream_diffulex_gsm8k.py index de3a2aa..e15d95d 100755 --- a/examples/test_dream_diffulex_gsm8k.py +++ b/examples/test_dream_diffulex_gsm8k.py @@ -64,4 +64,5 @@ "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_dream_dvllm_human_eval.py b/examples/test_dream_dvllm_human_eval.py index 2d95f00..9e72be6 100755 --- a/examples/test_dream_dvllm_human_eval.py +++ b/examples/test_dream_dvllm_human_eval.py @@ -84,4 +84,5 @@ def summarize_profiling(csv_path: str) -> dict: "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_fastdllmv2_diffulex_gsm8k.py b/examples/test_fastdllmv2_diffulex_gsm8k.py index 02217b2..1fc1860 100755 --- a/examples/test_fastdllmv2_diffulex_gsm8k.py +++ b/examples/test_fastdllmv2_diffulex_gsm8k.py @@ -86,4 +86,5 @@ def summarize_profiling(csv_path: str) -> dict: "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_gptq_awq_loading.py b/examples/test_gptq_awq_loading.py index a9a40fa..3cb8eed 100644 --- a/examples/test_gptq_awq_loading.py +++ b/examples/test_gptq_awq_loading.py @@ -25,23 +25,6 @@ except Exception: pass -# 自动设置 CUDA 12.2 路径(如果存在) -_CUDA_12_2_PATH = Path("/home/lzx/cuda-12.2") -if _CUDA_12_2_PATH.exists(): - os.environ["CUDA_HOME"] = str(_CUDA_12_2_PATH) - os.environ["CUDA_PATH"] = str(_CUDA_12_2_PATH) - os.environ["PATH"] = f"{_CUDA_12_2_PATH}/bin:{os.environ.get('PATH', '')}" - os.environ["LD_LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}" - os.environ["LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LIBRARY_PATH', '')}" - os.environ["CPATH"] = f"{_CUDA_12_2_PATH}/include:{os.environ.get('CPATH', '')}" - os.environ["CUDACXX"] = str(_CUDA_12_2_PATH / "bin" / "nvcc") - print(f"[INFO] 已自动设置 CUDA 路径: {_CUDA_12_2_PATH}") - -# 设置使用 GPU1(如果 GPU0 被占用) -if "CUDA_VISIBLE_DEVICES" not in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = "1" - print(f"[INFO] 已设置 CUDA_VISIBLE_DEVICES=1(使用 GPU1)") - # 确保从当前仓库导入 _REPO_ROOT = Path(__file__).resolve().parents[1] if str(_REPO_ROOT) not in sys.path: diff --git a/examples/test_llada_dvllm_human_eval.py b/examples/test_llada_dvllm_human_eval.py index 5e3608f..1fdb723 100755 --- a/examples/test_llada_dvllm_human_eval.py +++ b/examples/test_llada_dvllm_human_eval.py @@ -83,4 +83,5 @@ def summarize_profiling(csv_path: str) -> dict: "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt 
{idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_quantization_generation.py b/examples/test_quantization_generation.py index 22aaebc..7ffd26f 100755 --- a/examples/test_quantization_generation.py +++ b/examples/test_quantization_generation.py @@ -82,23 +82,13 @@ except Exception: pass -# 自动设置 CUDA 12.2 路径(如果存在) -_CUDA_12_2_PATH = Path("/home/lzx/cuda-12.2") -if _CUDA_12_2_PATH.exists(): - os.environ["CUDA_HOME"] = str(_CUDA_12_2_PATH) - # Some toolchains probe CUDA_PATH instead of CUDA_HOME. - os.environ["CUDA_PATH"] = str(_CUDA_12_2_PATH) - os.environ["PATH"] = f"{_CUDA_12_2_PATH}/bin:{os.environ.get('PATH', '')}" - os.environ["LD_LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}" - os.environ["LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LIBRARY_PATH', '')}" - os.environ["CPATH"] = f"{_CUDA_12_2_PATH}/include:{os.environ.get('CPATH', '')}" - os.environ["CUDACXX"] = str(_CUDA_12_2_PATH / "bin" / "nvcc") - print(f"[INFO] 已自动设置 CUDA 路径: {_CUDA_12_2_PATH}") - -# 设置使用 GPU1(如果 GPU0 被占用) -if "CUDA_VISIBLE_DEVICES" not in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = "1" - print(f"[INFO] 已设置 CUDA_VISIBLE_DEVICES=1(使用 GPU1)") +# +# NOTE: +# 这个脚本不应假设本机 CUDA 安装路径或默认 GPU 号。 +# 如需指定 CUDA/设备,请在运行前自行设置: +# - CUDA_HOME / CUDA_PATH / PATH / LD_LIBRARY_PATH +# - CUDA_VISIBLE_DEVICES +# 或者在你自己的 wrapper 脚本里处理。 # 确保从当前仓库导入 _REPO_ROOT = Path(__file__).resolve().parents[1] @@ -736,7 +726,9 @@ def main(): # 其他选项 parser.add_argument('--max-tokens', type=int, default=30, help='最大生成 token 数(默认: 30)') - parser.add_argument('--model-path', type=str, help='模型路径(默认: 从环境变量 DIFFULEX_TEST_MODEL 读取)') + parser.add_argument('--model-path', type=str, required=True, help='模型路径(必填)') + parser.add_argument('--lora-path', type=str, default="", help='LoRA 路径(可选)') + parser.add_argument('--use-lora', action='store_true', help='启用 LoRA(需同时提供 --lora-path)') parser.add_argument('--gpu-memory-utilization', type=float, default=0.3, help='GPU 内存利用率(默认: 0.3)') parser.add_argument('--no-isolate', action='store_true', help='多策略运行时不使用子进程隔离(调试用,可能导致状态串扰/性能波动)') # Internal: emit a single JSON result line for parent process parsing. 
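The argparse changes above make `--model-path` mandatory and replace the `DIFFULEX_TEST_MODEL` / `DIFFULEX_TEST_LORA` environment variables with explicit `--lora-path` / `--use-lora` flags. A small sketch of the resulting CLI contract is shown below; the model path is a placeholder and this is not the script itself, only an illustration of how `use_lora` is derived from the two flags.

```python
import argparse

# Sketch of the revised example-script CLI (argument names taken from the diff).
parser = argparse.ArgumentParser("quantization generation example (sketch)")
parser.add_argument("--model-path", type=str, required=True, help="model directory (required)")
parser.add_argument("--lora-path", type=str, default="", help="optional LoRA path")
parser.add_argument("--use-lora", action="store_true", help="enable LoRA (needs --lora-path)")
args = parser.parse_args(["--model-path", "/path/to/Dream-v0-Base-7B"])

# LoRA is enabled only when both the flag and a non-empty path are provided.
use_lora = bool(args.use_lora and args.lora_path)
print(args.model_path, use_lora)  # -> /path/to/Dream-v0-Base-7B False
```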
@@ -745,10 +737,10 @@ def main(): args = parser.parse_args() # 确定模型路径 - model_path = args.model_path or os.getenv("DIFFULEX_TEST_MODEL", "/data1/ckpts/Dream-org/Dream-v0-Base-7B") + model_path = args.model_path if not os.path.exists(model_path): print(f"错误: 模型路径不存在: {model_path}") - print("请使用 --model-path 或设置环境变量 DIFFULEX_TEST_MODEL 指向有效的模型路径") + print("请使用 --model-path 指向有效的模型路径") return # 解析要运行的策略 @@ -786,8 +778,8 @@ def main(): # 通用 Diffulex 配置 common_kwargs = { - 'lora_path': os.getenv("DIFFULEX_TEST_LORA", ""), - 'use_lora': bool(os.getenv("DIFFULEX_TEST_LORA", "")), + 'lora_path': args.lora_path, + 'use_lora': bool(args.use_lora and args.lora_path), 'model_name': 'dream', 'enforce_eager': True, 'data_parallel_size': 1, diff --git a/examples/test_sdar_diffulex_gsm8k.py b/examples/test_sdar_diffulex_gsm8k.py index b4f360c..5d9efe7 100755 --- a/examples/test_sdar_diffulex_gsm8k.py +++ b/examples/test_sdar_diffulex_gsm8k.py @@ -64,4 +64,5 @@ "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_sdar_dvllm.py b/examples/test_sdar_dvllm.py index 78fbbd7..4c30918 100644 --- a/examples/test_sdar_dvllm.py +++ b/examples/test_sdar_dvllm.py @@ -97,14 +97,14 @@ def main() -> None: parser.add_argument( "--model", type=str, - default="/home/lzx/SDAR/training/model/SDAR-1.7B-Chat", + required=True, help="SDAR HF model directory (contains config.json + model.safetensors).", ) parser.add_argument("--device", type=int, default=0) parser.add_argument( "--converted-dir", type=str, - default="/home/lzx/tmp/diffulex_sdar_converted", + default="tmp/diffulex_sdar_converted", help="Output directory for converted checkpoint keys (Diffulex-native).", ) parser.add_argument("--prompt", type=str, default="你好,请用一句话介绍 SDAR。") diff --git a/profile/torch_d2f_profiler.py b/profile/torch_d2f_profiler.py index e8d36cb..8dfcf18 100644 --- a/profile/torch_d2f_profiler.py +++ b/profile/torch_d2f_profiler.py @@ -40,17 +40,6 @@ except Exception: pass -# Optional: auto CUDA 12.2 toolchain env (align with your other scripts). -_CUDA_12_2_PATH = Path("/home/lzx/cuda-12.2") -if _CUDA_12_2_PATH.exists(): - os.environ.setdefault("CUDA_HOME", str(_CUDA_12_2_PATH)) - os.environ.setdefault("CUDA_PATH", str(_CUDA_12_2_PATH)) - os.environ["PATH"] = f"{_CUDA_12_2_PATH}/bin:{os.environ.get('PATH', '')}" - os.environ["LD_LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}" - os.environ["LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LIBRARY_PATH', '')}" - os.environ["CPATH"] = f"{_CUDA_12_2_PATH}/include:{os.environ.get('CPATH', '')}" - os.environ.setdefault("CUDACXX", str(_CUDA_12_2_PATH / "bin" / "nvcc")) - # Ensure import from current repo. 
 _REPO_ROOT = Path(__file__).resolve().parents[1]
 if str(_REPO_ROOT) not in sys.path:
@@ -89,9 +78,10 @@ def _mkdir(p: Path) -> Path:
 
 
 def main() -> None:
     parser = argparse.ArgumentParser("Diffulex torch.profiler flamegraph (D2F/Dream)")
-    parser.add_argument("--model-path", type=str, default=os.getenv("DIFFULEX_TEST_MODEL", "/data1/ckpts/Dream-org/Dream-v0-Base-7B"))
-    parser.add_argument("--lora-path", type=str, default=os.getenv("DIFFULEX_TEST_LORA", ""))
-    parser.add_argument("--use-lora", action="store_true", help="启用 LoRA(需同时提供 --lora-path 或 DIFFULEX_TEST_LORA)")
+    parser.add_argument("--model-path", type=str, required=True, help="模型路径(必填)")
+    parser.add_argument("--lora-path", type=str, default="", help="LoRA 路径(可选)")
+    parser.add_argument("--use-lora", action="store_true", help="启用 LoRA(需同时提供 --lora-path)")
+    parser.add_argument("--cuda-home", type=str, default="", help="(可选)设置 CUDA_HOME/CUDA_PATH 并更新 PATH/LD_LIBRARY_PATH")
     parser.add_argument("--tag", type=str, default="torch_profile", help="输出文件名前缀")
     parser.add_argument("--out-dir", type=str, default="log/torch_profiles", help="输出目录(相对仓库根)")
@@ -152,6 +142,18 @@ def main() -> None:
     args = parser.parse_args()
 
+    if args.cuda_home:
+        cuda_home = Path(args.cuda_home)
+        if not cuda_home.exists():
+            raise FileNotFoundError(f"--cuda-home 不存在: {cuda_home}")
+        os.environ["CUDA_HOME"] = str(cuda_home)
+        os.environ["CUDA_PATH"] = str(cuda_home)
+        os.environ["PATH"] = f"{cuda_home}/bin:{os.environ.get('PATH', '')}"
+        os.environ["LD_LIBRARY_PATH"] = f"{cuda_home}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}"
+        os.environ["LIBRARY_PATH"] = f"{cuda_home}/lib64:{os.environ.get('LIBRARY_PATH', '')}"
+        os.environ["CPATH"] = f"{cuda_home}/include:{os.environ.get('CPATH', '')}"
+        os.environ["CUDACXX"] = str(cuda_home / "bin" / "nvcc")
+
     model_path = Path(args.model_path)
     if not model_path.exists():
         raise FileNotFoundError(f"模型路径不存在: {model_path}")
 
diff --git a/quantization_architecture.md b/quantization_architecture.md
deleted file mode 100644
index 8504bf5..0000000
--- a/quantization_architecture.md
+++ /dev/null
@@ -1,149 +0,0 @@
-# Diffulex 量化模块架构总结
-
-## 一、架构概述
-
-Diffulex的量化模块采用**策略模式(Strategy Pattern)**和**上下文管理(Context Management)**设计,支持灵活的量化策略扩展。模块主要包含以下组件:
-
-### 1.
核心组件 - -#### 1.1 配置层 (Config) -- **QuantizationConfig**: 顶级量化配置,包含KV cache、权重、激活的量化配置 -- **KVCacheQuantConfig**: KV cache量化配置(dtype: bf16/fp8_e4m3/fp8_e5m2) -- **WeightQuantConfig**: 权重量化配置(支持按类型区分:attn/mlp) -- **ActivationQuantConfig**: 激活量化配置(支持按类型区分:attn/mlp) - -#### 1.2 上下文管理 (Context) -- **QuantizationContext**: 线程本地存储(Thread-Local Storage),管理量化策略实例 - - 存储策略实例:`kv_cache`, `linear_attn`, `linear_mlp`, `linear_other` - - 提供激活量化缓存(step-local cache) - - 通过全局函数访问:`get_quantization_context()`, `get_kv_cache_strategy()`, `get_linear_strategy()` - -#### 1.3 工厂模式 (Factory) -- **QuantizationStrategyFactory**: 从配置创建量化策略 - - `create_from_config()`: 从Diffulex配置对象创建并配置量化上下文 - - `create_kv_cache_strategy()`: 创建KV cache量化策略 - -#### 1.4 注册表 (Registry) -- **KV Cache策略注册表**: 通过`@register_kv_cache_strategy`装饰器注册 -- **Linear策略注册表**: 通过`@register_linear_strategy`装饰器注册(按weight_dtype + act_dtype配对) -- 支持dtype别名和规范化(如"fp8" -> "fp8_e4m3") - -#### 1.5 策略接口 (Strategy Interfaces) -- **QuantizationStrategy**: 基础抽象类 - - `quantize()`: 量化张量 - - `dequantize()`: 反量化张量 - - `get_storage_dtype()`: 获取存储数据类型 - - `get_scale_shape()`: 获取scale张量形状 - -- **KVCacheQuantizationStrategy**: KV cache量化策略接口 - - `compute_scales()`: 计算量化scale - - `update_scales()`: 更新量化scale(如running max策略) - - `init_scales()`: 初始化scale - - `quantize_kv_for_store()`: 量化KV用于存储 - - `view_kv_cache_for_kernels()`: 为kernel提供视图 - -- **LinearQuantizationStrategy**: Linear层量化策略接口 - - `linear_forward()`: 执行量化Linear前向传播 - - `quantize_weight_for_kernel()`: 为kernel量化权重 - - `quantize_act_for_kernel()`: 为kernel量化激活 - -#### 1.6 具体策略实现 (Strategy Implementations) - -**KV Cache策略**: -- `KVCacheBF16Strategy`: BF16存储(无量化) -- `KVCacheFP8RunningMaxStrategy`: FP8量化(E4M3/E5M2),使用running max管理scale - -**Linear策略**: -- `LinearBF16Strategy`: BF16权重+BF16激活(无量化) -- `LinearGPTQW4A16Strategy`: GPTQ W4权重+BF16激活 -- `LinearAWQW4A16Strategy`: AWQ W4权重+BF16激活 -- `LinearInt8W8A16Strategy`: INT8权重+BF16激活 -- `LinearInt8W8A8Strategy`: INT8权重+INT8激活 -- `LinearInt4W4A16Strategy`: INT4权重+BF16激活 -- `LinearInt4W4A8Strategy`: INT4权重+INT8激活 -- `LinearFP8W8A16Strategy`: FP8权重+BF16激活 -- `LinearFP8W8A8Strategy`: FP8权重+FP8激活 -- `LinearStubStrategy`: 占位策略(未实现的组合) - -#### 1.7 工具函数 (Utilities) -- **kv_cache_dtype.py**: KV cache数据类型处理 - - `parse_kv_cache_dtype()`: 解析dtype字符串 - - `view_fp8_cache()`: FP8 cache视图转换 - - `ensure_scale_tensor()`: 确保scale张量格式正确 - -## 二、与其他模块的耦合关系 - -### 2.1 模型运行器 (Model Runner) -**文件**: `diffulex/engine/model_runner.py` -- **初始化**: 在`ModelRunnerBase.__init__()`中调用`QuantizationStrategyFactory.create_from_config(config)` -- **KV Cache分配**: 使用`get_kv_cache_strategy()`获取策略,根据策略分配KV cache存储 - -### 2.2 Linear层 -**文件**: `diffulex/layer/linear.py` -- **前向传播**: 在`forward()`中调用`get_linear_strategy(quant_kind)`获取策略 -- **权重量化**: 在`_maybe_quantize_loaded_weight_param()`中,加载权重后自动量化并删除BF16权重参数 -- **离线量化支持**: 支持GPTQ/AWQ离线量化权重的加载和使用 - -### 2.3 KV Cache Kernels -**文件**: `diffulex_kernel/python/kv_cache_kernels.py`, `diffulex_kernel/python/dllm_flash_attn_kernels.py` -- **策略获取**: 在kernel函数中调用`get_kv_cache_strategy()`获取策略 -- **Scale管理**: 使用策略的`update_scales()`更新scale -- **Cache视图**: 使用策略的`view_kv_cache_for_kernels()`获取适合kernel的视图 - -### 2.4 注意力实现 -**文件**: `diffulex/attention/attn_impl.py` -- **策略获取**: 在注意力计算中获取KV cache策略 -- **Scale传递**: 将scale传递给attention metadata - -### 2.5 TP Worker -**文件**: `diffulex/engine/tp_worker.py` -- **缓存清理**: 在每个step开始时调用`clear_act_quant_cache()`清理激活量化缓存 - -## 三、量化流程 - -### 3.1 初始化流程 -1. 
`ModelRunnerBase.__init__()` 调用 `QuantizationStrategyFactory.create_from_config(config)` -2. Factory从config解析`QuantizationConfig` -3. Factory创建KV cache策略和Linear策略(按attn/mlp/other分类) -4. 策略注册到`QuantizationContext`(线程本地存储) - -### 3.2 KV Cache量化流程 -1. **初始化**: 调用`strategy.init_scales()`初始化scale张量 -2. **存储**: 在KV cache存储时,调用`strategy.quantize_kv_for_store()`量化K和V -3. **更新**: 每次前向传播后,调用`strategy.update_scales()`更新running max scale -4. **使用**: Kernel使用`strategy.view_kv_cache_for_kernels()`获取适合的视图 - -### 3.3 Linear量化流程 -1. **权重量化**: - - 在线量化:加载权重时自动调用`strategy.quantize_weight_for_kernel()` - - 离线量化:通过`set_offline_quantized_weight()`加载GPTQ/AWQ权重 -2. **前向传播**: - - 调用`strategy.linear_forward()`执行量化计算 - - 支持TileLang kernel加速(如GPTQ W4A16) - - 支持Python fallback实现 - -### 3.4 激活量化流程(W8A8/W4A8) -1. **缓存**: 使用`QuantizationContext`的step-local cache缓存激活量化结果 -2. **量化**: 在Linear层前向传播时,调用`strategy.quantize_act_for_kernel()` -3. **清理**: 每个step开始时清理缓存 - -## 四、扩展性设计 - -### 4.1 添加新的KV Cache策略 -1. 实现`KVCacheQuantizationStrategy`接口 -2. 使用`@register_kv_cache_strategy("dtype_alias")`注册 -3. 在`strategies/__init__.py`中导入(触发注册) - -### 4.2 添加新的Linear策略 -1. 实现`LinearQuantizationStrategy`接口 -2. 使用`@register_linear_strategy(weight_dtype="...", act_dtype="...")`注册 -3. 在`strategies/__init__.py`中导入(触发注册) - -### 4.3 支持新的量化方法 -- 权重量化:GPTQ, AWQ, INT8, INT4, FP8 -- 激活量化:INT8, INT4, FP8 -- KV Cache量化:FP8 (E4M3/E5M2) - -## 五、架构图 - -详见下面的Mermaid图表。 diff --git a/quantization_architecture_diagram.md b/quantization_architecture_diagram.md deleted file mode 100644 index 5d38fea..0000000 --- a/quantization_architecture_diagram.md +++ /dev/null @@ -1,551 +0,0 @@ -# Diffulex 量化模块架构图 - -## 完整架构图 - -```mermaid -graph TB - subgraph "用户配置层" - Config[Diffulex Config
kv_cache_dtype
linear_attn_weight_dtype
linear_mlp_weight_dtype
...] - end - - subgraph "量化模块核心" - subgraph "配置解析" - QC[QuantizationConfig] - KVC[KVCacheQuantConfig] - WC[WeightQuantConfig] - AC[ActivationQuantConfig] - Config --> QC - QC --> KVC - QC --> WC - QC --> AC - end - - subgraph "工厂与注册表" - Factory[QuantizationStrategyFactory
create_from_config
create_kv_cache_strategy] - RegKV[KV Cache Registry
@register_kv_cache_strategy] - RegLinear[Linear Registry
@register_linear_strategy] - Factory --> RegKV - Factory --> RegLinear - end - - subgraph "上下文管理" - Context[QuantizationContext
Thread-Local Storage] - Context --> |存储| KVStrategy[KV Cache Strategy] - Context --> |存储| LinearAttn[Linear Attn Strategy] - Context --> |存储| LinearMLP[Linear MLP Strategy] - Context --> |存储| LinearOther[Linear Other Strategy] - Context --> |缓存| ActCache[Activation Quant Cache
Step-Local] - end - - subgraph "策略接口层" - BaseStrategy[QuantizationStrategy
quantize/dequantize
get_storage_dtype] - KVInterface[KVCacheQuantizationStrategy
compute_scales
update_scales
quantize_kv_for_store] - LinearInterface[LinearQuantizationStrategy
linear_forward
quantize_weight_for_kernel
quantize_act_for_kernel] - BaseStrategy --> KVInterface - BaseStrategy --> LinearInterface - end - - subgraph "KV Cache策略实现" - KVBF16[KVCacheBF16Strategy
BF16存储] - KVFP8[KVCacheFP8RunningMaxStrategy
FP8 E4M3/E5M2
Running Max Scale] - KVInterface --> KVBF16 - KVInterface --> KVFP8 - end - - subgraph "Linear策略实现" - LBF16[LinearBF16Strategy
BF16/BF16] - LGPTQ[LinearGPTQW4A16Strategy
GPTQ W4/BF16] - LAWQ[LinearAWQW4A16Strategy
AWQ W4/BF16] - LInt8W8A16[LinearInt8W8A16Strategy
INT8/BF16] - LInt8W8A8[LinearInt8W8A8Strategy
INT8/INT8] - LInt4W4A16[LinearInt4W4A16Strategy
INT4/BF16] - LInt4W4A8[LinearInt4W4A8Strategy
INT4/INT8] - LFP8W8A16[LinearFP8W8A16Strategy
FP8/BF16] - LFP8W8A8[LinearFP8W8A8Strategy
FP8/FP8] - LinearInterface --> LBF16 - LinearInterface --> LGPTQ - LinearInterface --> LAWQ - LinearInterface --> LInt8W8A16 - LinearInterface --> LInt8W8A8 - LinearInterface --> LInt4W4A16 - LinearInterface --> LInt4W4A8 - LinearInterface --> LFP8W8A16 - LinearInterface --> LFP8W8A8 - end - - subgraph "工具函数" - KVDType[kv_cache_dtype.py
parse_kv_cache_dtype
view_fp8_cache
ensure_scale_tensor] - end - end - - subgraph "运行时模块" - subgraph "模型运行器" - MR[ModelRunnerBase
__init__] - MR --> |初始化| Factory - MR --> |获取| Context - end - - subgraph "Linear层" - Linear[LinearBase
ReplicatedLinear
ColumnParallelLinear
RowParallelLinear] - Linear --> |forward| Context - Linear --> |quantize_weight| Context - end - - subgraph "KV Cache Kernels" - KVKernel[kv_cache_kernels.py
dllm_flash_attn_kernels.py] - KVKernel --> |获取策略| Context - KVKernel --> |更新scale| KVStrategy - end - - subgraph "注意力实现" - Attn[attn_impl.py] - Attn --> |获取策略| Context - end - - subgraph "TP Worker" - TP[tp_worker.py] - TP --> |清理缓存| Context - end - end - - subgraph "离线量化工具" - Offline[quantize_model.py
GPTQ/AWQ离线量化] - end - - %% 连接关系 - QC --> Factory - Factory --> Context - RegKV --> KVBF16 - RegKV --> KVFP8 - RegLinear --> LBF16 - RegLinear --> LGPTQ - RegLinear --> LAWQ - RegLinear --> LInt8W8A16 - RegLinear --> LInt8W8A8 - RegLinear --> LInt4W4A16 - RegLinear --> LInt4W4A8 - RegLinear --> LFP8W8A16 - RegLinear --> LFP8W8A8 - KVStrategy --> KVInterface - LinearAttn --> LinearInterface - LinearMLP --> LinearInterface - LinearOther --> LinearInterface - KVDType --> KVFP8 - - style Config fill:#e1f5ff - style QC fill:#fff4e1 - style Factory fill:#fff4e1 - style Context fill:#e8f5e9 - style KVInterface fill:#f3e5f5 - style LinearInterface fill:#f3e5f5 - style KVBF16 fill:#fff9c4 - style KVFP8 fill:#fff9c4 - style LGPTQ fill:#fff9c4 - style LAWQ fill:#fff9c4 - style MR fill:#ffebee - style Linear fill:#ffebee - style KVKernel fill:#ffebee -``` - -## 数据流图 - -```mermaid -sequenceDiagram - participant Config as Diffulex Config - participant Factory as QuantizationStrategyFactory - participant Context as QuantizationContext - participant KVStrategy as KV Cache Strategy - participant LinearStrategy as Linear Strategy - participant ModelRunner as ModelRunner - participant LinearLayer as Linear Layer - participant KVKernel as KV Cache Kernel - - Note over Config,KVKernel: 初始化阶段 - Config->>Factory: create_from_config(config) - Factory->>Context: 创建并配置上下文 - Factory->>KVStrategy: 创建KV cache策略 - Factory->>LinearStrategy: 创建Linear策略(attn/mlp/other) - Context->>Context: 存储策略实例 - - Note over ModelRunner,KVKernel: 运行时阶段 - ModelRunner->>Context: get_kv_cache_strategy() - Context->>KVStrategy: 返回策略实例 - ModelRunner->>KVStrategy: init_scales() - KVStrategy->>KVStrategy: 初始化scale张量 - - LinearLayer->>Context: get_linear_strategy(quant_kind) - Context->>LinearStrategy: 返回策略实例 - LinearLayer->>LinearStrategy: linear_forward(x, weight, bias) - LinearStrategy->>LinearStrategy: 执行量化计算 - - KVKernel->>Context: get_kv_cache_strategy() - Context->>KVStrategy: 返回策略实例 - KVKernel->>KVStrategy: update_scales(k, v, k_scale, v_scale) - KVStrategy->>KVStrategy: 更新running max scale - KVKernel->>KVStrategy: quantize_kv_for_store(k, v, scales) - KVStrategy->>KVKernel: 返回量化后的K和V -``` - -## 策略选择流程图 - -```mermaid -flowchart TD - Start[开始] --> LoadConfig[加载Diffulex Config] - LoadConfig --> ParseConfig[解析QuantizationConfig] - ParseConfig --> CheckKVCache{检查kv_cache_dtype} - - CheckKVCache -->|bf16/fp16/fp32| CreateKVBF16[创建KVCacheBF16Strategy] - CheckKVCache -->|fp8/fp8_e4m3| CreateKVFP8E4M3[创建KVCacheFP8RunningMaxStrategy
E4M3] - CheckKVCache -->|fp8_e5m2| CreateKVFP8E5M2[创建KVCacheFP8RunningMaxStrategy
E5M2] - - ParseConfig --> CheckLinearAttn{检查linear_attn配置} - CheckLinearAttn -->|weight_dtype + act_dtype| CreateLinearAttn[创建Linear策略
注册到linear_attn] - - ParseConfig --> CheckLinearMLP{检查linear_mlp配置} - CheckLinearMLP -->|weight_dtype + act_dtype| CreateLinearMLP[创建Linear策略
注册到linear_mlp] - - CreateKVBF16 --> RegisterContext[注册到QuantizationContext] - CreateKVFP8E4M3 --> RegisterContext - CreateKVFP8E5M2 --> RegisterContext - CreateLinearAttn --> RegisterContext - CreateLinearMLP --> RegisterContext - - RegisterContext --> End[完成初始化] - - style CheckKVCache fill:#e1f5ff - style CheckLinearAttn fill:#e1f5ff - style CheckLinearMLP fill:#e1f5ff - style RegisterContext fill:#e8f5e9 -``` - -## Linear量化决策流程图 - -```mermaid -flowchart TD - Start[Linear.forward调用] --> GetStrategy[get_linear_strategy
quant_kind] - GetStrategy --> CheckOffline{检查离线量化权重
GPTQ/AWQ} - - CheckOffline -->|有GPTQ权重| UseGPTQ[使用GPTQ策略
linear_forward
传递qweight/qzeros/scales] - CheckOffline -->|有AWQ权重| UseAWQ[使用AWQ策略
linear_forward
传递qweight/qzeros/scales] - CheckOffline -->|无离线量化| CheckOnline{检查在线量化权重
int8/int4/fp8} - - CheckOnline -->|有量化权重| UseOnline[使用量化策略
linear_forward
传递quant_weight_int8/scales] - CheckOnline -->|无量化权重| CheckStrategy{检查策略} - - CheckStrategy -->|有策略| UseStrategy[使用策略
linear_forward
传递bf16 weight] - CheckStrategy -->|无策略| UseDefault[使用默认F.linear
bf16 weight] - - UseGPTQ --> TryKernel{尝试TileLang Kernel} - TryKernel -->|成功| KernelResult[Kernel计算结果] - TryKernel -->|失败| PythonFallback[Python Fallback
dequantize + F.linear] - - UseAWQ --> TryKernel - UseOnline --> KernelOrPython[Kernel或Python实现] - UseStrategy --> KernelOrPython - UseDefault --> Result[返回结果] - - KernelResult --> Result - PythonFallback --> Result - KernelOrPython --> Result - - style CheckOffline fill:#e1f5ff - style CheckOnline fill:#e1f5ff - style CheckStrategy fill:#e1f5ff - style TryKernel fill:#fff9c4 -``` - -## KV Cache量化流程图 - -### 完整KV Cache量化流程(包含Store和Load) - -```mermaid -flowchart TB - subgraph "Store阶段" - Start[KV Cache Store] --> GetStrategy1[get_kv_cache_strategy] - GetStrategy1 --> CheckFormat1{检查kv_cache_format} - - CheckFormat1 -->|bf16| BF16Store[BF16 Store路径] - CheckFormat1 -->|fp8| FP8Store[FP8 Store路径] - - BF16Store --> StoreBF16[直接存储为BF16
dtype: bfloat16
无需量化] - - FP8Store --> UpdateScales["update_scales
更新running max scale
k_scale/v_scale: float32
shape: (num_kv_heads)"] - UpdateScales --> QuantizeKV["quantize_kv_for_store
K/V: bfloat16 -> uint8
使用k_scale/v_scale量化"] - QuantizeKV --> StoreFP8["存储为uint8
dtype: uint8
FP8格式"] - - StoreBF16 --> CheckLayout1{检查Layout} - StoreFP8 --> CheckLayout1 - - CheckLayout1 -->|unified| StoreUnified["store_kvcache_unified_layout
shape: (num_blocks, page_size, num_kv_heads, head_dim)"] - CheckLayout1 -->|distinct| StoreDistinct["store_kvcache_distinct_layout
k_cache: (num_blks, h, hdim//x, blk_sz, x)
v_cache: (num_blks, h, hdim, blk_sz)"] - end - - subgraph "Load阶段" - LoadStart[KV Cache Load] --> GetStrategy2[get_kv_cache_strategy] - GetStrategy2 --> CheckFormat2{检查kv_cache_format} - - CheckFormat2 -->|bf16| BF16Load[BF16 Load路径] - CheckFormat2 -->|fp8| FP8Load[FP8 Load路径] - - BF16Load --> CheckLayout2{检查Layout} - FP8Load --> CheckLayout2 - - CheckLayout2 -->|unified| UnifiedLoad[Unified Layout Load] - CheckLayout2 -->|distinct| DistinctLoad[Distinct Layout Load
总是使用varlen路径] - - UnifiedLoad --> CheckDecodeMode{检查decode_mode} - CheckDecodeMode -->|static| StaticPath[Static模式
TileLang Kernel] - CheckDecodeMode -->|varlen| VarlenPath[Varlen模式
load_kvcache + flash_attn_varlen_func] - - DistinctLoad --> VarlenPath - - StaticPath --> StaticBF16{BF16?} - StaticPath --> StaticFP8{FP8?} - - StaticBF16 --> TileLangBF16[dllm_flash_attn_decode_kernel
TileLang Kernel
输入: q/k/v/cache bfloat16
输出: bfloat16] - - StaticFP8 --> ViewFP8Cache[strategy.view_kv_cache_for_kernels
uint8 -> float8 view
dtype转换] - ViewFP8Cache --> TileLangFP8[dllm_flash_attn_decode_kernel_bf16_q_fp8_kv
TileLang Kernel
输入: q bfloat16, cache float8
k_scale/v_scale float32
kernel内反量化+scale
输出: bfloat16] - - VarlenPath --> LoadKVCache[load_kvcache函数] - LoadKVCache --> LoadBF16{BF16?} - LoadKVCache --> LoadFP8{FP8?} - - LoadBF16 --> LoadBF16Kernel[_load_kvcache_bf16
Triton Kernel
gather cache blocks
输出: bfloat16] - - LoadFP8 --> LoadFP8Kernel[_load_kvcache_fp8
Triton Fused Kernel
gather + dequant + scale
输入: cache uint8/float8 view
k_scale/v_scale float32
输出: bfloat16] - - LoadBF16Kernel --> FlashAttnBF16[flash_attn_varlen_func
输入: q/k_comb/v_comb bfloat16
输出: bfloat16] - LoadFP8Kernel --> FlashAttnFP8[flash_attn_varlen_func
输入: q/k_comb/v_comb bfloat16
输出: bfloat16] - end - - StoreUnified --> LoadStart - StoreDistinct --> LoadStart - TileLangBF16 --> End[完成] - TileLangFP8 --> End - FlashAttnBF16 --> End - FlashAttnFP8 --> End - - style CheckFormat1 fill:#e1f5ff - style CheckFormat2 fill:#e1f5ff - style CheckLayout1 fill:#fff9c4 - style CheckLayout2 fill:#fff9c4 - style CheckDecodeMode fill:#fff9c4 - style QuantizeKV fill:#ffebee - style ViewFP8Cache fill:#ffebee - style StaticPath fill:#e8f5e9 - style VarlenPath fill:#e8f5e9 -``` - -### 数据类型传递详细图 - -```mermaid -sequenceDiagram - participant AttnImpl as Attention Implementation - participant Strategy as KV Cache Strategy - participant StoreKernel as Store Kernel - participant Cache as KV Cache Storage - participant LoadKernel as Load Kernel - participant DecodeKernel as Decode Kernel - participant FlashAttn as flash_attn_varlen_func - - Note over AttnImpl,FlashAttn: BF16路径 (Unified Layout, Static Mode) - AttnImpl->>Strategy: get_kv_cache_strategy() - Strategy-->>AttnImpl: KVCacheBF16Strategy - AttnImpl->>AttnImpl: k: (N, H, D) bfloat16
v: (N, H, D) bfloat16 - AttnImpl->>StoreKernel: store_kvcache_unified_layout
k, v, cache, slot_mapping - StoreKernel->>Cache: 直接存储
dtype: bfloat16
shape: (num_blocks, page_size, H, D) - AttnImpl->>DecodeKernel: dllm_flash_attn_decode
q: bfloat16
k_cache: bfloat16
v_cache: bfloat16 - DecodeKernel->>DecodeKernel: TileLang Kernel
内部gather + attention计算 - DecodeKernel-->>AttnImpl: output: bfloat16 - - Note over AttnImpl,FlashAttn: FP8路径 (Unified Layout, Static Mode) - AttnImpl->>Strategy: get_kv_cache_strategy() - Strategy-->>AttnImpl: KVCacheFP8RunningMaxStrategy - AttnImpl->>AttnImpl: k: (N, H, D) bfloat16
v: (N, H, D) bfloat16 - AttnImpl->>Strategy: update_scales(k, v, k_scale, v_scale) - Strategy-->>AttnImpl: k_scale: (H) float32
v_scale: (H) float32 - AttnImpl->>Strategy: quantize_kv_for_store(k, v, k_scale, v_scale) - Strategy->>Strategy: 量化: k/v bfloat16 -> uint8
使用scale进行量化 - Strategy-->>AttnImpl: k_q: (N, H, D) uint8
v_q: (N, H, D) uint8 - AttnImpl->>StoreKernel: store_kvcache_unified_layout
k_q, v_q (uint8) - StoreKernel->>Cache: 存储为uint8
dtype: uint8
shape: (num_blocks, page_size, H, D) - AttnImpl->>Strategy: view_kv_cache_for_kernels(cache) - Strategy->>Strategy: uint8 -> float8 view
dtype转换(不改变存储) - Strategy-->>AttnImpl: cache_fp8: float8 view - AttnImpl->>DecodeKernel: dllm_flash_attn_decode_bf16_q_fp8_kv
q: bfloat16
k_cache: float8 view
v_cache: float8 view
k_scale: (H) float32
v_scale: (H) float32 - DecodeKernel->>DecodeKernel: TileLang Kernel
内部: gather + dequant + scale + attention
float8 -> bfloat16 (反量化) - DecodeKernel-->>AttnImpl: output: bfloat16 - - Note over AttnImpl,FlashAttn: FP8路径 (Unified/Distinct Layout, Varlen Mode) - AttnImpl->>Strategy: get_kv_cache_strategy() - Strategy-->>AttnImpl: KVCacheFP8RunningMaxStrategy - AttnImpl->>Strategy: update_scales(k, v, k_scale, v_scale) - Strategy-->>AttnImpl: k_scale: (H) float32
v_scale: (H) float32 - AttnImpl->>Strategy: quantize_kv_for_store(k, v, k_scale, v_scale) - Strategy-->>AttnImpl: k_q: (N, H, D) uint8
v_q: (N, H, D) uint8 - AttnImpl->>StoreKernel: store_kvcache_*_layout
k_q, v_q (uint8) - StoreKernel->>Cache: 存储为uint8
dtype: uint8 - AttnImpl->>LoadKernel: load_kvcache(cache, metadata, k_new, v_new) - LoadKernel->>Strategy: view_kv_cache_for_kernels(cache) - Strategy-->>LoadKernel: cache_fp8: float8 view - LoadKernel->>LoadKernel: Triton Fused Kernel
load_kvcache_kernel_fp8_*
输入: cache float8 view
k_scale/v_scale float32
操作: gather + dequant + scale
输出: k_comb/v_comb bfloat16 - LoadKernel-->>AttnImpl: k_comb: (total_len, H, D) bfloat16
v_comb: (total_len, H, D) bfloat16 - AttnImpl->>FlashAttn: flash_attn_varlen_func
q: bfloat16
k_comb: bfloat16
v_comb: bfloat16 - FlashAttn-->>AttnImpl: output: bfloat16 -``` - -### Layout和Decode模式决策树 - -```mermaid -flowchart TD - Start[KV Cache操作] --> CheckLayout{检查kv_cache_layout} - - CheckLayout -->|unified| UnifiedPath["Unified Layout
shape: (num_blocks, page_size, H, D)"] - CheckLayout -->|distinct| DistinctPath["Distinct Layout
k: (num_blks, h, hdim//x, blk_sz, x)
v: (num_blks, h, hdim, blk_sz)"] - - UnifiedPath --> CheckDecodeMode{检查decode_mode} - CheckDecodeMode -->|static| UnifiedStatic[Static模式
TileLang Kernel] - CheckDecodeMode -->|varlen| UnifiedVarlen[Varlen模式
load_kvcache + flash_attn_varlen_func] - - DistinctPath --> DistinctVarlen[总是Varlen模式
load_kvcache + flash_attn_varlen_func] - - UnifiedStatic --> CheckQuant1{量化格式?} - CheckQuant1 -->|bf16| StaticBF16[TileLang BF16 Kernel
dllm_flash_attn_decode_kernel
输入/输出: bfloat16] - CheckQuant1 -->|fp8| StaticFP8[TileLang FP8 Kernel
dllm_flash_attn_decode_kernel_bf16_q_fp8_kv
输入: q bfloat16, cache float8
scale: float32
输出: bfloat16] - - UnifiedVarlen --> CheckQuant2{量化格式?} - DistinctVarlen --> CheckQuant2 - - CheckQuant2 -->|bf16| VarlenBF16[load_kvcache_bf16
Triton gather kernel
输出: bfloat16
+ flash_attn_varlen_func] - CheckQuant2 -->|fp8| VarlenFP8[load_kvcache_fp8
Triton fused kernel
gather + dequant + scale
输入: cache float8, scale float32
输出: bfloat16
+ flash_attn_varlen_func] - - StaticBF16 --> End[完成] - StaticFP8 --> End - VarlenBF16 --> End - VarlenFP8 --> End - - style CheckLayout fill:#e1f5ff - style CheckDecodeMode fill:#e1f5ff - style CheckQuant1 fill:#fff9c4 - style CheckQuant2 fill:#fff9c4 - style UnifiedStatic fill:#e8f5e9 - style UnifiedVarlen fill:#e8f5e9 - style DistinctVarlen fill:#e8f5e9 - style StaticFP8 fill:#ffebee - style VarlenFP8 fill:#ffebee -``` - -### 详细数据流图:Unified Layout Static模式(FP8) - -```mermaid -flowchart LR - subgraph "Store阶段" - K1["K: bfloat16
(N, H, D)"] --> UpdateScale["update_scales
计算/更新scale"] - V1["V: bfloat16
(N, H, D)"] --> UpdateScale - UpdateScale --> KScale["k_scale: float32
(H)"] - UpdateScale --> VScale["v_scale: float32
(H)"] - K1 --> Quantize["quantize_kv_for_store
使用scale量化"] - V1 --> Quantize - KScale --> Quantize - VScale --> Quantize - Quantize --> KQ["K_q: uint8
(N, H, D)"] - Quantize --> VQ["V_q: uint8
(N, H, D)"] - KQ --> Store["store_kvcache_unified_layout
Triton Kernel"] - VQ --> Store - Store --> Cache["Cache: uint8
(num_blocks, page_size, H, D)"] - end - - subgraph "Load阶段 - Static模式" - Cache --> View["view_kv_cache_for_kernels
uint8 -> float8 view"] - View --> CacheFP8["Cache: float8 view
(num_blocks, page_size, H, D)"] - Q["Q: bfloat16
(num_seqs, num_heads, D)"] --> DecodeKernel - CacheFP8 --> DecodeKernel["dllm_flash_attn_decode_kernel_bf16_q_fp8_kv
TileLang Kernel"] - KScale --> DecodeKernel - VScale --> DecodeKernel - DecodeKernel --> Output["Output: bfloat16
(num_seqs, num_heads, D)"] - end - - style UpdateScale fill:#fff9c4 - style Quantize fill:#ffebee - style View fill:#ffebee - style DecodeKernel fill:#e8f5e9 -``` - -### 详细数据流图:Varlen模式(FP8,Unified/Distinct Layout) - -```mermaid -flowchart LR - subgraph "Store阶段" - K1["K: bfloat16
(N, H, D)"] --> UpdateScale["update_scales
计算/更新scale"] - V1["V: bfloat16
(N, H, D)"] --> UpdateScale - UpdateScale --> KScale["k_scale: float32
(H)"] - UpdateScale --> VScale["v_scale: float32
(H)"] - K1 --> Quantize["quantize_kv_for_store
使用scale量化"] - V1 --> Quantize - KScale --> Quantize - VScale --> Quantize - Quantize --> KQ["K_q: uint8
(N, H, D)"] - Quantize --> VQ["V_q: uint8
(N, H, D)"] - KQ --> Store{Layout?} - VQ --> Store - Store -->|unified| StoreUnified["store_kvcache_unified_layout"] - Store -->|distinct| StoreDistinct["store_kvcache_distinct_layout"] - StoreUnified --> CacheU["Cache: uint8
Unified: (num_blocks, page_size, H, D)"] - StoreDistinct --> CacheD["Cache: uint8
Distinct: k (num_blks, h, hdim//x, blk_sz, x)
v (num_blks, h, hdim, blk_sz)"] - end - - subgraph "Load阶段 - Varlen模式" - CacheU --> LoadKernel - CacheD --> LoadKernel["load_kvcache
Triton Fused Kernel"] - KNew["K_new: bfloat16
(N_new, H, D)"] --> LoadKernel - VNew["V_new: bfloat16
(N_new, H, D)"] --> LoadKernel - KScale --> LoadKernel - VScale --> LoadKernel - Metadata["attn_metadata
block_tables, cu_seqlens, etc."] --> LoadKernel - LoadKernel --> View["view_kv_cache_for_kernels
uint8 -> float8 view"] - View --> GatherDequant["load_kvcache_kernel_fp8_*
gather + dequant + scale
float8 -> bfloat16"] - GatherDequant --> KComb["K_comb: bfloat16
(total_len, H, D)"] - GatherDequant --> VComb["V_comb: bfloat16
(total_len, H, D)"] - Q["Q: bfloat16
(total_len, num_heads, D)"] --> FlashAttn - KComb --> FlashAttn["flash_attn_varlen_func
Flash Attention"] - VComb --> FlashAttn - FlashAttn --> Output["Output: bfloat16
(total_len, num_heads, D)"] - end - - style UpdateScale fill:#fff9c4 - style Quantize fill:#ffebee - style View fill:#ffebee - style GatherDequant fill:#ffebee - style FlashAttn fill:#e8f5e9 -``` - -### 关键数据类型转换总结表 - -| 阶段 | 操作 | 输入类型 | 输出类型 | 说明 | -|------|------|---------|---------|------| -| **Store (BF16)** | 直接存储 | `bfloat16 [N, H, D]` | `bfloat16 [num_blocks, page_size, H, D]` | 无需量化,直接存储 | -| **Store (FP8)** | quantize_kv_for_store | `bfloat16 [N, H, D]` + `float32 [H]` scale | `uint8 [N, H, D]` | 量化并存储为uint8 | -| **Store (FP8)** | 存储到cache | `uint8 [N, H, D]` | `uint8 [num_blocks, page_size, H, D]` | 存储为uint8格式 | -| **Load (Static FP8)** | view_kv_cache_for_kernels | `uint8 [num_blocks, page_size, H, D]` | `float8 view [num_blocks, page_size, H, D]` | 视图转换,不改变存储 | -| **Load (Static FP8)** | TileLang Kernel | `float8 view` + `float32 [H]` scale | `bfloat16 [num_seqs, num_heads, D]` | Kernel内反量化+scale | -| **Load (Varlen FP8)** | view_kv_cache_for_kernels | `uint8 [num_blocks, page_size, H, D]` | `float8 view [num_blocks, page_size, H, D]` | 视图转换 | -| **Load (Varlen FP8)** | Triton Fused Kernel | `float8 view` + `float32 [H]` scale | `bfloat16 [total_len, H, D]` | gather + dequant + scale | -| **Attention** | flash_attn_varlen_func | `bfloat16 [total_len, num_heads, D]` | `bfloat16 [total_len, num_heads, D]` | Flash Attention计算 | - -### 路径选择决策表 - -| Layout | Decode Mode | 量化格式 | Store Kernel | Load Kernel | Attention Kernel | -|--------|-------------|---------|--------------|-------------|------------------| -| Unified | static | bf16 | `store_kvcache_unified_layout` → BF16 kernel | 无(直接使用cache) | `dllm_flash_attn_decode_kernel` (TileLang) | -| Unified | static | fp8 | `store_kvcache_unified_layout` → FP8 kernel | `view_kv_cache_for_kernels` | `dllm_flash_attn_decode_kernel_bf16_q_fp8_kv` (TileLang) | -| Unified | varlen | bf16 | `store_kvcache_unified_layout` → BF16 kernel | `load_kvcache_bf16` (Triton) | `flash_attn_varlen_func` | -| Unified | varlen | fp8 | `store_kvcache_unified_layout` → FP8 kernel | `load_kvcache_fp8` (Triton fused) | `flash_attn_varlen_func` | -| Distinct | varlen | bf16 | `store_kvcache_distinct_layout` → BF16 kernel | `load_kvcache_bf16` (Triton) | `flash_attn_varlen_func` | -| Distinct | varlen | fp8 | `store_kvcache_distinct_layout` → FP8 kernel | `load_kvcache_fp8` (Triton fused) | `flash_attn_varlen_func` | - -**注意**: -- Distinct layout **总是**使用varlen模式(因为K的split layout不适合static模式) -- Static模式**仅支持**Unified layout -- FP8量化在static模式下,反量化在TileLang kernel内部完成 -- FP8量化在varlen模式下,反量化在`load_kvcache`的Triton fused kernel中完成